1 /* Copyright (C) 2002-2018 Free Software Foundation, Inc.
2 
3    This file is part of GCC.
4 
5    GCC is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3, or (at your option)
8    any later version.
9 
10    GCC is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    Under Section 7 of GPL version 3, you are granted additional
16    permissions described in the GCC Runtime Library Exception, version
17    3.1, as published by the Free Software Foundation.
18 
19    You should have received a copy of the GNU General Public License and
20    a copy of the GCC Runtime Library Exception along with this program;
21    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22    <http://www.gnu.org/licenses/>.  */
23 
24 /* Implemented from the specification included in the Intel C++ Compiler
25    User Guide and Reference, version 9.0.  */
26 
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29    makes explicit use of Intel intrinsics to powerpc64le.
30    It is the user's responsibility to determine if the results are
31    acceptable and make additional changes as necessary.
32    Note that much code that uses Intel intrinsics can be rewritten in
33    standard C or GNU C extensions, which are more portable and better
34    optimized across multiple targets.
35 
36    In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
37    VMX/VSX ISA is a good match for vector float SIMD operations.
38    However, scalar float operations in vector (XMM) registers require
39    the POWER8 VSX ISA (2.07) level. There are also important differences
40    in the data format and placement of float scalars in the vector
41    register. PowerISA keeps scalar floats in FPRs (the leftmost 64 bits
42    of the low 32 VSRs) in double format, while X86_64 SSE uses the
43    rightmost 32 bits of the XMM register. These differences require
44    extra steps on POWER to match the SSE scalar float semantics.
45 
46    Most SSE scalar float intrinsic operations can be performed more
47    efficiently as C language float scalar operations or optimized to
48    use vector SIMD operations.  We recommend this for new applications.
49 
50    Another difference is the format and details of the X86_64 MXCSR vs
51    the PowerISA FPSCR / VSCR registers. We recommend that applications
52    replace direct access to the MXCSR with the more portable <fenv.h>
53    POSIX APIs (see the illustrative sketch below). */
54 #error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
55 #endif
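
/* Illustrative sketch only (not part of the original header): one way to
   express typical MXCSR rounding-mode manipulation with the portable
   <fenv.h> interfaces recommended above.  The function name below is
   hypothetical and shown purely as an example.

     #include <fenv.h>

     static inline int
     example_set_round_toward_zero (void)
     {
       feclearexcept (FE_ALL_EXCEPT);      // discard stale exception flags
       return fesetround (FE_TOWARDZERO);  // like MXCSR RC = 0b11 on x86
     }

   Trap enable/disable bits have no portable <fenv.h> equivalent; the GNU
   extension fedisableexcept may be used where it is available.  */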
56 
57 #ifndef _XMMINTRIN_H_INCLUDED
58 #define _XMMINTRIN_H_INCLUDED
59 
60 /* Define a four-element permute mask.  */
61 #define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
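/* For example, _MM_SHUFFLE (3, 2, 1, 0) == 0xE4; used as the mask for
   _mm_shuffle_ps (__A, __A, 0xE4) below, it selects every element in
   place and reproduces __A unchanged.  */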
62 
63 #include <altivec.h>
64 
65 /* Avoid collisions between altivec.h and strict adherence to C++ and
66    C11 standards.  This should eventually be done inside altivec.h itself,
67    but only after testing a full distro build.  */
68 #if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
69 				 (defined(__STDC_VERSION__) &&	\
70 				  __STDC_VERSION__ >= 201112L))
71 #undef vector
72 #undef pixel
73 #undef bool
74 #endif
75 
76 #include <assert.h>
77 
78 /* We need type definitions from the MMX header file.  */
79 #include <mmintrin.h>
80 
81 /* Get _mm_malloc () and _mm_free ().  */
82 #include <mm_malloc.h>
83 
84 /* The Intel API is flexible enough that we must allow aliasing with other
85    vector types, and their scalar components.  */
86 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
87 
88 /* Internal data types for implementing the intrinsics.  */
89 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
90 
91 /* Create an undefined vector.  */
92 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
93 _mm_undefined_ps (void)
94 {
95   __m128 __Y = __Y;
96   return __Y;
97 }
98 
99 /* Create a vector of zeros.  */
100 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
101 _mm_setzero_ps (void)
102 {
103   return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
104 }
105 
106 /* Load four SPFP values from P.  The address must be 16-byte aligned.  */
107 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
108 _mm_load_ps (float const *__P)
109 {
110   assert(((unsigned long)__P & 0xfUL) == 0UL);
111   return ((__m128)vec_ld(0, (__v4sf*)__P));
112 }
113 
114 /* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
115 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
116 _mm_loadu_ps (float const *__P)
117 {
118   return (vec_vsx_ld(0, __P));
119 }
120 
121 /* Load four SPFP values in reverse order.  The address must be aligned.  */
122 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
123 _mm_loadr_ps (float const *__P)
124 {
125   __v4sf   __tmp;
126   __m128 result;
127   static const __vector unsigned char permute_vector =
128     { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
129 	0x17, 0x10, 0x11, 0x12, 0x13 };
130 
131   __tmp = vec_ld (0, (__v4sf *) __P);
132   result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
133   return result;
134 }
135 
136 /* Create a vector with all four elements equal to F.  */
137 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
138 _mm_set1_ps (float __F)
139 {
140   return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
141 }
142 
143 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
144 _mm_set_ps1 (float __F)
145 {
146   return _mm_set1_ps (__F);
147 }
148 
149 /* Create the vector [Z Y X W].  */
150 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
151 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
152 {
153   return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
154 }
155 
156 /* Create the vector [W X Y Z].  */
157 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
158 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
159 {
160   return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
161 }
162 
163 /* Store four SPFP values.  The address must be 16-byte aligned.  */
164 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
165 _mm_store_ps (float *__P, __m128 __A)
166 {
167   assert(((unsigned long)__P & 0xfUL) == 0UL);
168   vec_st((__v4sf)__A, 0, (__v4sf*)__P);
169 }
170 
171 /* Store four SPFP values.  The address need not be 16-byte aligned.  */
172 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
173 _mm_storeu_ps (float *__P, __m128 __A)
174 {
175   *(__m128 *)__P = __A;
176 }
177 
178 /* Store four SPFP values in reverse order.  The address must be aligned.  */
179 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
180 _mm_storer_ps (float *__P, __m128 __A)
181 {
182   __v4sf   __tmp;
183   static const __vector unsigned char permute_vector =
184     { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
185 	0x17, 0x10, 0x11, 0x12, 0x13 };
186 
187   __tmp = (__m128) vec_perm (__A, __A, permute_vector);
188 
189   _mm_store_ps (__P, __tmp);
190 }
191 
192 /* Store the lower SPFP value across four words.  */
193 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
194 _mm_store1_ps (float *__P, __m128 __A)
195 {
196   __v4sf __va = vec_splat((__v4sf)__A, 0);
197   _mm_store_ps (__P, __va);
198 }
199 
200 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
201 _mm_store_ps1 (float *__P, __m128 __A)
202 {
203   _mm_store1_ps (__P, __A);
204 }
205 
206 /* Create a vector with element 0 as F and the rest zero.  */
207 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
208 _mm_set_ss (float __F)
209 {
210   return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
211 }
212 
213 /* Sets the low SPFP value of A from the low value of B.  */
214 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
215 _mm_move_ss (__m128 __A, __m128 __B)
216 {
217   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
218 
219   return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
220 }
221 
222 /* Create a vector with element 0 as *P and the rest zero.  */
223 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
224 _mm_load_ss (float const *__P)
225 {
226   return _mm_set_ss (*__P);
227 }
228 
229 /* Stores the lower SPFP value.  */
230 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
231 _mm_store_ss (float *__P, __m128 __A)
232 {
233   *__P = ((__v4sf)__A)[0];
234 }
235 
236 /* Perform the respective operation on the lower SPFP (single-precision
237    floating-point) values of A and B; the upper three SPFP values are
238    passed through from A.  */
239 
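/* Illustrative note (not from the original header): every scalar (_ss)
   operation below follows the same splat/operate/select pattern.  For
   addition, with hypothetical values:

     __A = { 1.0, 2.0, 3.0, 4.0 }   __B = { 10.0, 20.0, 30.0, 40.0 }
     a = vec_splat (__A, 0)         = { 1.0, 1.0, 1.0, 1.0 }
     b = vec_splat (__B, 0)         = { 10.0, 10.0, 10.0, 10.0 }
     c = a + b                      = { 11.0, 11.0, 11.0, 11.0 }
     vec_sel (__A, c, mask)         = { 11.0, 2.0, 3.0, 4.0 }

   The mask { 0xffffffff, 0, 0, 0 } directs vec_sel to take element 0
   from c and the remaining elements from __A, which matches the SSE
   scalar semantics.  */
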
240 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
241 _mm_add_ss (__m128 __A, __m128 __B)
242 {
243 #ifdef _ARCH_PWR7
244   __m128 a, b, c;
245   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
246   /* PowerISA VSX does not allow partial (for just the lower float)
247      results. So to ensure we don't generate spurious exceptions
248      (from the upper float values) we splat the lower float
249      before we do the operation.  */
250   a = vec_splat (__A, 0);
251   b = vec_splat (__B, 0);
252   c = a + b;
253   /* Then we merge the lower float result with the original upper
254      float elements from __A.  */
255   return (vec_sel (__A, c, mask));
256 #else
257   __A[0] = __A[0] + __B[0];
258   return (__A);
259 #endif
260 }
261 
262 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
263 _mm_sub_ss (__m128 __A, __m128 __B)
264 {
265 #ifdef _ARCH_PWR7
266   __m128 a, b, c;
267   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
268   /* PowerISA VSX does not allow partial (for just the lower float)
269      results. So to ensure we don't generate spurious exceptions
270      (from the upper float values) we splat the lower float
271      before we do the operation.  */
272   a = vec_splat (__A, 0);
273   b = vec_splat (__B, 0);
274   c = a - b;
275   /* Then we merge the lower float result with the original upper
276      float elements from __A.  */
277   return (vec_sel (__A, c, mask));
278 #else
279   __A[0] = __A[0] - __B[0];
280   return (__A);
281 #endif
282 }
283 
284 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
285 _mm_mul_ss (__m128 __A, __m128 __B)
286 {
287 #ifdef _ARCH_PWR7
288   __m128 a, b, c;
289   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
290   /* PowerISA VSX does not allow partial (for just the lower float)
291      results. So to ensure we don't generate spurious exceptions
292      (from the upper float values) we splat the lower float
293      before we do the operation.  */
294   a = vec_splat (__A, 0);
295   b = vec_splat (__B, 0);
296   c = a * b;
297   /* Then we merge the lower float result with the original upper
298      float elements from __A.  */
299   return (vec_sel (__A, c, mask));
300 #else
301   __A[0] = __A[0] * __B[0];
302   return (__A);
303 #endif
304 }
305 
306 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
307 _mm_div_ss (__m128 __A, __m128 __B)
308 {
309 #ifdef _ARCH_PWR7
310   __m128 a, b, c;
311   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
312   /* PowerISA VSX does not allow partial (for just the lower float)
313      results. So to ensure we don't generate spurious exceptions
314      (from the upper float values) we splat the lower float
315      before we do the operation.  */
316   a = vec_splat (__A, 0);
317   b = vec_splat (__B, 0);
318   c = a / b;
319   /* Then we merge the lower float result with the original upper
320      float elements from __A.  */
321   return (vec_sel (__A, c, mask));
322 #else
323   __A[0] = __A[0] / __B[0];
324   return (__A);
325 #endif
326 }
327 
328 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
329 _mm_sqrt_ss (__m128 __A)
330 {
331   __m128 a, c;
332   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
333   /* PowerISA VSX does not allow partial (for just the lower float)
334    * results. So to ensure we don't generate spurious exceptions
335    * (from the upper float values) we splat the lower float
336    * before we do the operation. */
337   a = vec_splat (__A, 0);
338   c = vec_sqrt (a);
339   /* Then we merge the lower float result with the original upper
340    * float elements from __A.  */
341   return (vec_sel (__A, c, mask));
342 }
343 
344 /* Perform the respective operation on the four SPFP values in A and B.  */
345 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
346 _mm_add_ps (__m128 __A, __m128 __B)
347 {
348   return (__m128) ((__v4sf)__A + (__v4sf)__B);
349 }
350 
351 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
352 _mm_sub_ps (__m128 __A, __m128 __B)
353 {
354   return (__m128) ((__v4sf)__A - (__v4sf)__B);
355 }
356 
357 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
358 _mm_mul_ps (__m128 __A, __m128 __B)
359 {
360   return (__m128) ((__v4sf)__A * (__v4sf)__B);
361 }
362 
363 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
364 _mm_div_ps (__m128 __A, __m128 __B)
365 {
366   return (__m128) ((__v4sf)__A / (__v4sf)__B);
367 }
368 
369 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
370 _mm_sqrt_ps (__m128 __A)
371 {
372   return (vec_sqrt ((__v4sf)__A));
373 }
374 
375 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
376 _mm_rcp_ps (__m128 __A)
377 {
378   return (vec_re ((__v4sf)__A));
379 }
380 
381 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
382 _mm_rsqrt_ps (__m128 __A)
383 {
384   return (vec_rsqrte (__A));
385 }
386 
387 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
388 _mm_rcp_ss (__m128 __A)
389 {
390   __m128 a, c;
391   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
392   /* PowerISA VSX does not allow partial (for just the lower float)
393    * results. So to ensure we don't generate spurious exceptions
394    * (from the upper float values) we splat the lower float
395    * before we do the operation. */
396   a = vec_splat (__A, 0);
397   c = _mm_rcp_ps (a);
398   /* Then we merge the lower float result with the original upper
399    * float elements from __A.  */
400   return (vec_sel (__A, c, mask));
401 }
402 
403 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
404 _mm_rsqrt_ss (__m128 __A)
405 {
406   __m128 a, c;
407   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
408   /* PowerISA VSX does not allow partial (for just the lower float)
409    * results. So to ensure we don't generate spurious exceptions
410    * (from the upper float values) we splat the lower float
411    * before we do the operation. */
412   a = vec_splat (__A, 0);
413   c = vec_rsqrte (a);
414   /* Then we merge the lower float result with the original upper
415    * float elements from __A.  */
416   return (vec_sel (__A, c, mask));
417 }
418 
419 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
420 _mm_min_ss (__m128 __A, __m128 __B)
421 {
422   __v4sf a, b, c;
423   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
424   /* PowerISA VSX does not allow partial (for just the lower float)
425    * results. So to ensure we don't generate spurious exceptions
426    * (from the upper float values) we splat the lower float
427    * before we do the operation. */
428   a = vec_splat ((__v4sf)__A, 0);
429   b = vec_splat ((__v4sf)__B, 0);
430   c = vec_min (a, b);
431   /* Then we merge the lower float result with the original upper
432    * float elements from __A.  */
433   return (vec_sel ((__v4sf)__A, c, mask));
434 }
435 
436 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
437 _mm_max_ss (__m128 __A, __m128 __B)
438 {
439   __v4sf a, b, c;
440   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
441   /* PowerISA VSX does not allow partial (for just the lower float)
442    * results. So to ensure we don't generate spurious exceptions
443    * (from the upper float values) we splat the lower float
444    * before we do the operation. */
445   a = vec_splat (__A, 0);
446   b = vec_splat (__B, 0);
447   c = vec_max (a, b);
448   /* Then we merge the lower float result with the original upper
449    * float elements from __A.  */
450   return (vec_sel ((__v4sf)__A, c, mask));
451 }
452 
453 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
454 _mm_min_ps (__m128 __A, __m128 __B)
455 {
456   __m128 m = (__m128) vec_vcmpgtfp ((__v4sf) __B, (__v4sf) __A);
457   return vec_sel (__B, __A, m);
458 }
459 
460 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
461 _mm_max_ps (__m128 __A, __m128 __B)
462 {
463   __m128 m = (__m128) vec_vcmpgtfp ((__v4sf) __A, (__v4sf) __B);
464   return vec_sel (__B, __A, m);
465 }
466 
467 /* Perform logical bit-wise operations on 128-bit values.  */
468 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
469 _mm_and_ps (__m128 __A, __m128 __B)
470 {
471   return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
472 //  return __builtin_ia32_andps (__A, __B);
473 }
474 
475 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
476 _mm_andnot_ps (__m128 __A, __m128 __B)
477 {
478   return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
479 }
480 
481 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
482 _mm_or_ps (__m128 __A, __m128 __B)
483 {
484   return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
485 }
486 
487 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
488 _mm_xor_ps (__m128 __A, __m128 __B)
489 {
490   return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
491 }
492 
493 /* Perform a comparison on the four SPFP values of A and B.  For each
494    element, if the comparison is true, place a mask of all ones in the
495    result, otherwise a mask of zeros.  */
496 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
497 _mm_cmpeq_ps (__m128 __A, __m128 __B)
498 {
499   return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
500 }
501 
502 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
503 _mm_cmplt_ps (__m128 __A, __m128 __B)
504 {
505   return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
506 }
507 
508 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
509 _mm_cmple_ps (__m128 __A, __m128 __B)
510 {
511   return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
512 }
513 
514 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
515 _mm_cmpgt_ps (__m128 __A, __m128 __B)
516 {
517   return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
518 }
519 
520 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
521 _mm_cmpge_ps (__m128 __A, __m128 __B)
522 {
523   return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
524 }
525 
526 extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
527 _mm_cmpneq_ps (__m128  __A, __m128  __B)
528 {
529   __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
530   return ((__m128)vec_nor (temp, temp));
531 }
532 
533 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
534 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
535 {
536   return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
537 }
538 
539 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
540 _mm_cmpnle_ps (__m128 __A, __m128 __B)
541 {
542   return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
543 }
544 
545 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
546 _mm_cmpngt_ps (__m128 __A, __m128 __B)
547 {
548   return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
549 }
550 
551 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
552 _mm_cmpnge_ps (__m128 __A, __m128 __B)
553 {
554   return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
555 }
556 
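/* Illustrative note (not from the original header): the ordered and
   unordered compares below detect NaNs without a dedicated instruction.
   vec_abs clears the sign bit, so a lane holds a NaN exactly when its
   magnitude bits, viewed as an unsigned int, exceed the exponent mask
   0x7f800000; for example a quiet NaN (0x7fc00000) compares greater.
   _mm_cmpunord_ps ORs the per-operand NaN tests, while _mm_cmpord_ps
   requires both magnitudes to compare below the mask.  */
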
557 extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
558 _mm_cmpord_ps (__m128  __A, __m128  __B)
559 {
560   __vector unsigned int a, b;
561   __vector unsigned int c, d;
562   static const __vector unsigned int float_exp_mask =
563     { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
564 
565   a = (__vector unsigned int) vec_abs ((__v4sf)__A);
566   b = (__vector unsigned int) vec_abs ((__v4sf)__B);
567   c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
568   d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
569   return ((__m128 ) vec_and (c, d));
570 }
571 
572 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
573 _mm_cmpunord_ps (__m128 __A, __m128 __B)
574 {
575   __vector unsigned int a, b;
576   __vector unsigned int c, d;
577   static const __vector unsigned int float_exp_mask =
578     { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
579 
580   a = (__vector unsigned int) vec_abs ((__v4sf)__A);
581   b = (__vector unsigned int) vec_abs ((__v4sf)__B);
582   c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
583   d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
584   return ((__m128 ) vec_or (c, d));
585 }
586 
587 /* Perform a comparison on the lower SPFP values of A and B.  If the
588    comparison is true, place a mask of all ones in the result, otherwise a
589    mask of zeros.  The upper three SPFP values are passed through from A.  */
590 extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
591 _mm_cmpeq_ss (__m128  __A, __m128  __B)
592 {
593   static const __vector unsigned int mask =
594     { 0xffffffff, 0, 0, 0 };
595   __v4sf a, b, c;
596   /* PowerISA VMX does not allow partial (for just element 0)
597    * results. So to ensure we don't generate spurious exceptions
598    * (from the upper elements) we splat the lower float
599    * before we do the operation. */
600   a = vec_splat ((__v4sf) __A, 0);
601   b = vec_splat ((__v4sf) __B, 0);
602   c = (__v4sf) vec_cmpeq(a, b);
603   /* Then we merge the lower float result with the original upper
604    * float elements from __A.  */
605   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
606 }
607 
608 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
609 _mm_cmplt_ss (__m128 __A, __m128 __B)
610 {
611   static const __vector unsigned int mask =
612     { 0xffffffff, 0, 0, 0 };
613   __v4sf a, b, c;
614   /* PowerISA VMX does not allow partial (for just element 0)
615    * results. So to ensure we don't generate spurious exceptions
616    * (from the upper elements) we splat the lower float
617    * before we do the operation. */
618   a = vec_splat ((__v4sf) __A, 0);
619   b = vec_splat ((__v4sf) __B, 0);
620   c = (__v4sf) vec_cmplt(a, b);
621   /* Then we merge the lower float result with the original upper
622    * float elements from __A.  */
623   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
624 }
625 
626 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
627 _mm_cmple_ss (__m128 __A, __m128 __B)
628 {
629   static const __vector unsigned int mask =
630     { 0xffffffff, 0, 0, 0 };
631   __v4sf a, b, c;
632   /* PowerISA VMX does not allow partial (for just element 0)
633    * results. So to ensure we don't generate spurious exceptions
634    * (from the upper elements) we splat the lower float
635    * before we do the operation. */
636   a = vec_splat ((__v4sf) __A, 0);
637   b = vec_splat ((__v4sf) __B, 0);
638   c = (__v4sf) vec_cmple(a, b);
639   /* Then we merge the lower float result with the original upper
640    * float elements from __A.  */
641   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
642 }
643 
644 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
645 _mm_cmpgt_ss (__m128 __A, __m128 __B)
646 {
647   static const __vector unsigned int mask =
648     { 0xffffffff, 0, 0, 0 };
649   __v4sf a, b, c;
650   /* PowerISA VMX does not allow partial (for just element 0)
651    * results. So to ensure we don't generate spurious exceptions
652    * (from the upper elements) we splat the lower float
653    * before we do the operation. */
654   a = vec_splat ((__v4sf) __A, 0);
655   b = vec_splat ((__v4sf) __B, 0);
656   c = (__v4sf) vec_cmpgt(a, b);
657   /* Then we merge the lower float result with the original upper
658    * float elements from __A.  */
659   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
660 }
661 
662 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
663 _mm_cmpge_ss (__m128 __A, __m128 __B)
664 {
665   static const __vector unsigned int mask =
666     { 0xffffffff, 0, 0, 0 };
667   __v4sf a, b, c;
668   /* PowerISA VMX does not allow partial (for just element 0)
669    * results. So to ensure we don't generate spurious exceptions
670    * (from the upper elements) we splat the lower float
671    * before we do the operation. */
672   a = vec_splat ((__v4sf) __A, 0);
673   b = vec_splat ((__v4sf) __B, 0);
674   c = (__v4sf) vec_cmpge(a, b);
675   /* Then we merge the lower float result with the original upper
676    * float elements from __A.  */
677   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
678 }
679 
680 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
681 _mm_cmpneq_ss (__m128 __A, __m128 __B)
682 {
683   static const __vector unsigned int mask =
684     { 0xffffffff, 0, 0, 0 };
685   __v4sf a, b, c;
686   /* PowerISA VMX does not allow partial (for just element 0)
687    * results. So to ensure we don't generate spurious exceptions
688    * (from the upper elements) we splat the lower float
689    * before we do the operation. */
690   a = vec_splat ((__v4sf) __A, 0);
691   b = vec_splat ((__v4sf) __B, 0);
692   c = (__v4sf) vec_cmpeq(a, b);
693   c = vec_nor (c, c);
694   /* Then we merge the lower float result with the original upper
695    * float elements from __A.  */
696   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
697 }
698 
699 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
700 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
701 {
702   static const __vector unsigned int mask =
703     { 0xffffffff, 0, 0, 0 };
704   __v4sf a, b, c;
705   /* PowerISA VMX does not allow partial (for just element 0)
706    * results. So to ensure we don't generate spurious exceptions
707    * (from the upper elements) we splat the lower float
708    * before we do the operation. */
709   a = vec_splat ((__v4sf) __A, 0);
710   b = vec_splat ((__v4sf) __B, 0);
711   c = (__v4sf) vec_cmpge(a, b);
712   /* Then we merge the lower float result with the original upper
713    * float elements from __A.  */
714   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
715 }
716 
717 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
718 _mm_cmpnle_ss (__m128 __A, __m128 __B)
719 {
720   static const __vector unsigned int mask =
721     { 0xffffffff, 0, 0, 0 };
722   __v4sf a, b, c;
723   /* PowerISA VMX does not allow partial (for just element 0)
724    * results. So to ensure we don't generate spurious exceptions
725    * (from the upper elements) we splat the lower float
726    * before we do the operation. */
727   a = vec_splat ((__v4sf) __A, 0);
728   b = vec_splat ((__v4sf) __B, 0);
729   c = (__v4sf) vec_cmpgt(a, b);
730   /* Then we merge the lower float result with the original upper
731    * float elements from __A.  */
732   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
733 }
734 
735 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
736 _mm_cmpngt_ss (__m128 __A, __m128 __B)
737 {
738   static const __vector unsigned int mask =
739     { 0xffffffff, 0, 0, 0 };
740   __v4sf a, b, c;
741   /* PowerISA VMX does not allow partial (for just element 0)
742    * results. So to ensure we don't generate spurious exceptions
743    * (from the upper elements) we splat the lower float
744    * before we do the operation. */
745   a = vec_splat ((__v4sf) __A, 0);
746   b = vec_splat ((__v4sf) __B, 0);
747   c = (__v4sf) vec_cmple(a, b);
748   /* Then we merge the lower float result with the original upper
749    * float elements from __A.  */
750   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
751 }
752 
753 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
754 _mm_cmpnge_ss (__m128 __A, __m128 __B)
755 {
756   static const __vector unsigned int mask =
757     { 0xffffffff, 0, 0, 0 };
758   __v4sf a, b, c;
759   /* PowerISA VMX does not allow partial (for just element 0)
760    * results. So to ensure we don't generate spurious exceptions
761    * (from the upper elements) we splat the lower float
762    * before we do the operation. */
763   a = vec_splat ((__v4sf) __A, 0);
764   b = vec_splat ((__v4sf) __B, 0);
765   c = (__v4sf) vec_cmplt(a, b);
766   /* Then we merge the lower float result with the original upper
767    * float elements from __A.  */
768   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
769 }
770 
771 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
772 _mm_cmpord_ss (__m128 __A, __m128 __B)
773 {
774   __vector unsigned int a, b;
775   __vector unsigned int c, d;
776   static const __vector unsigned int float_exp_mask =
777     { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
778   static const __vector unsigned int mask =
779     { 0xffffffff, 0, 0, 0 };
780 
781   a = (__vector unsigned int) vec_abs ((__v4sf)__A);
782   b = (__vector unsigned int) vec_abs ((__v4sf)__B);
783   c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
784   d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
785   c = vec_and (c, d);
786   /* Then we merge the lower float result with the original upper
787    * float elements from __A.  */
788   return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
789 }
790 
791 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
792 _mm_cmpunord_ss (__m128 __A, __m128 __B)
793 {
794   __vector unsigned int a, b;
795   __vector unsigned int c, d;
796   static const __vector unsigned int float_exp_mask =
797     { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
798   static const __vector unsigned int mask =
799     { 0xffffffff, 0, 0, 0 };
800 
801   a = (__vector unsigned int) vec_abs ((__v4sf)__A);
802   b = (__vector unsigned int) vec_abs ((__v4sf)__B);
803   c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
804   d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
805   c = vec_or (c, d);
806   /* Then we merge the lower float result with the original upper
807    * float elements from __A.  */
808   return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
809 }
810 
811 /* Compare the lower SPFP values of A and B and return 1 if true
812    and 0 if false.  */
813 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
814 _mm_comieq_ss (__m128 __A, __m128 __B)
815 {
816   return (__A[0] == __B[0]);
817 }
818 
819 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
820 _mm_comilt_ss (__m128 __A, __m128 __B)
821 {
822   return (__A[0] < __B[0]);
823 }
824 
825 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
826 _mm_comile_ss (__m128 __A, __m128 __B)
827 {
828   return (__A[0] <= __B[0]);
829 }
830 
831 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
832 _mm_comigt_ss (__m128 __A, __m128 __B)
833 {
834   return (__A[0] > __B[0]);
835 }
836 
837 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
838 _mm_comige_ss (__m128 __A, __m128 __B)
839 {
840   return (__A[0] >= __B[0]);
841 }
842 
843 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
844 _mm_comineq_ss (__m128 __A, __m128 __B)
845 {
846   return (__A[0] != __B[0]);
847 }
848 
849 /* FIXME
850  * The _mm_ucomi??_ss implementations below are exactly the same as
851  * the _mm_comi??_ss ones because GCC for PowerPC only generates
852  * unordered compares (scalar and vector).
853  * Technically _mm_comieq_ss et al. should be using the ordered
854  * compare and signal for QNaNs.
855  * The _mm_ucomieq_ss et al. should be OK as is.
856  */
857 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
858 _mm_ucomieq_ss (__m128 __A, __m128 __B)
859 {
860   return (__A[0] == __B[0]);
861 }
862 
863 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
864 _mm_ucomilt_ss (__m128 __A, __m128 __B)
865 {
866   return (__A[0] < __B[0]);
867 }
868 
869 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
870 _mm_ucomile_ss (__m128 __A, __m128 __B)
871 {
872   return (__A[0] <= __B[0]);
873 }
874 
875 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
876 _mm_ucomigt_ss (__m128 __A, __m128 __B)
877 {
878   return (__A[0] > __B[0]);
879 }
880 
881 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
882 _mm_ucomige_ss (__m128 __A, __m128 __B)
883 {
884   return (__A[0] >= __B[0]);
885 }
886 
887 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
888 _mm_ucomineq_ss (__m128 __A, __m128 __B)
889 {
890   return (__A[0] != __B[0]);
891 }
892 
893 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
894 _mm_cvtss_f32 (__m128 __A)
895 {
896   return ((__v4sf)__A)[0];
897 }
898 
899 /* Convert the lower SPFP value to a 32-bit integer according to the current
900    rounding mode.  */
901 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
902 _mm_cvtss_si32 (__m128 __A)
903 {
904   __m64 res = 0;
905 #ifdef _ARCH_PWR8
906   __m128 vtmp;
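  /* Illustrative note (not from the original header): xxsldwi rotates
     word element 0 into the scalar position, xscvspdp widens it to
     double precision, fctiw converts it to a 32-bit integer using the
     current rounding mode, and mfvsrd moves the result into a GPR.
     The 64-bit variant _mm_cvtss_si64 below uses fctid instead.  */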
907   __asm__(
908       "xxsldwi %x1,%x2,%x2,3;\n"
909       "xscvspdp %x1,%x1;\n"
910       "fctiw  %1,%1;\n"
911       "mfvsrd  %0,%x1;\n"
912       : "=r" (res),
913 	"=&wi" (vtmp)
914       : "wa" (__A)
915       : );
916 #else
917   res = __builtin_rint(__A[0]);
918 #endif
919   return (res);
920 }
921 
922 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
923 _mm_cvt_ss2si (__m128 __A)
924 {
925   return _mm_cvtss_si32 (__A);
926 }
927 
928 /* Convert the lower SPFP value to a 64-bit integer according to the
929    current rounding mode.  */
930 
931 /* Intel intrinsic.  */
932 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
933 _mm_cvtss_si64 (__m128 __A)
934 {
935   __m64 res = 0;
936 #ifdef _ARCH_PWR8
937   __m128 vtmp;
938   __asm__(
939       "xxsldwi %x1,%x2,%x2,3;\n"
940       "xscvspdp %x1,%x1;\n"
941       "fctid  %1,%1;\n"
942       "mfvsrd  %0,%x1;\n"
943       : "=r" (res),
944 	"=&wi" (vtmp)
945       : "wa" (__A)
946       : );
947 #else
948   res = __builtin_llrint(__A[0]);
949 #endif
950   return (res);
951 }
952 
953 /* Microsoft intrinsic.  */
954 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
955 _mm_cvtss_si64x (__m128 __A)
956 {
957   return _mm_cvtss_si64 ((__v4sf) __A);
958 }
959 
960 /* Constants for use with _mm_prefetch.  */
961 enum _mm_hint
962 {
963   /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set.  */
964   _MM_HINT_ET0 = 7,
965   _MM_HINT_ET1 = 6,
966   _MM_HINT_T0 = 3,
967   _MM_HINT_T1 = 2,
968   _MM_HINT_T2 = 1,
969   _MM_HINT_NTA = 0
970 };
971 
972 /* Loads one cache line from address P to a location "closer" to the
973    processor.  The selector I specifies the type of prefetch operation.  */
974 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
975 _mm_prefetch (const void *__P, enum _mm_hint __I)
976 {
977   /* Current PowerPC hardware ignores the hint parameters.  */
978   __builtin_prefetch (__P);
979 }
980 
981 /* Convert the two lower SPFP values to 32-bit integers according to the
982    current rounding mode.  Return the integers in packed form.  */
983 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
984 _mm_cvtps_pi32 (__m128 __A)
985 {
987   __v4sf temp, rounded;
988   __vector __m64 result;
989 
990   /* Splat two lower SPFP values to both halves.  */
991   temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
992   rounded = vec_rint(temp);
993   result = (__vector __m64) vec_cts (rounded, 0);
994 
995   return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
996 }
997 
998 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
999 _mm_cvt_ps2pi (__m128 __A)
1000 {
1001   return _mm_cvtps_pi32 (__A);
1002 }
1003 
1004 /* Truncate the lower SPFP value to a 32-bit integer.  */
1005 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1006 _mm_cvttss_si32 (__m128 __A)
1007 {
1008   /* Extract the lower float element.  */
1009   float temp = __A[0];
1010   /* truncate to 32-bit integer and return.  */
1011   return temp;
1012 }
1013 
1014 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1015 _mm_cvtt_ss2si (__m128 __A)
1016 {
1017   return _mm_cvttss_si32 (__A);
1018 }
1019 
1020 /* Intel intrinsic.  */
1021 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1022 _mm_cvttss_si64 (__m128 __A)
1023 {
1024   /* Extract the lower float element.  */
1025   float temp = __A[0];
1026   /* truncate to 64-bit integer and return.  */
1027   return temp;
1028 }
1029 
1030 /* Microsoft intrinsic.  */
1031 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1032 _mm_cvttss_si64x (__m128 __A)
1033 {
1034   /* Extract the lower float element.  */
1035   float temp = __A[0];
1036   /* truncate to 64-bit integer and return.  */
1037   return temp;
1038 }
1039 
1040 /* Truncate the two lower SPFP values to 32-bit integers.  Return the
1041    integers in packed form.  */
1042 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1043 _mm_cvttps_pi32 (__m128 __A)
1044 {
1045   __v4sf temp;
1046   __vector __m64 result;
1047 
1048   /* Splat two lower SPFP values to both halves.  */
1049   temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1050   result = (__vector __m64) vec_cts (temp, 0);
1051 
1052   return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
1053 }
1054 
1055 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1056 _mm_cvtt_ps2pi (__m128 __A)
1057 {
1058   return _mm_cvttps_pi32 (__A);
1059 }
1060 
1061 /* Convert B to a SPFP value and insert it as element zero in A.  */
1062 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1063 _mm_cvtsi32_ss (__m128 __A, int __B)
1064 {
1065   float temp = __B;
1066   __A[0] = temp;
1067 
1068   return __A;
1069 }
1070 
1071 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1072 _mm_cvt_si2ss (__m128 __A, int __B)
1073 {
1074   return _mm_cvtsi32_ss (__A, __B);
1075 }
1076 
1077 /* Convert B to a SPFP value and insert it as element zero in A.  */
1078 /* Intel intrinsic.  */
1079 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1080 _mm_cvtsi64_ss (__m128 __A, long long __B)
1081 {
1082   float temp = __B;
1083   __A[0] = temp;
1084 
1085   return __A;
1086 }
1087 
1088 /* Microsoft intrinsic.  */
1089 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1090 _mm_cvtsi64x_ss (__m128 __A, long long __B)
1091 {
1092   return _mm_cvtsi64_ss (__A, __B);
1093 }
1094 
1095 /* Convert the two 32-bit values in B to SPFP form and insert them
1096    as the two lower elements in A.  */
1097 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1098 _mm_cvtpi32_ps (__m128        __A, __m64        __B)
1099 {
1100   __vector signed int vm1;
1101   __vector float vf1;
1102 
1103   vm1 = (__vector signed int) __builtin_pack_vector_int128 (__B, __B);
1104   vf1 = (__vector float) vec_ctf (vm1, 0);
1105 
1106   return ((__m128) (__vector __m64)
1107     { ((__vector __m64)vf1) [0], ((__vector __m64)__A) [1]});
1108 }
1109 
1110 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1111 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
1112 {
1113   return _mm_cvtpi32_ps (__A, __B);
1114 }
1115 
1116 /* Convert the four signed 16-bit values in A to SPFP form.  */
1117 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1118 _mm_cvtpi16_ps (__m64 __A)
1119 {
1120   __vector signed short vs8;
1121   __vector signed int vi4;
1122   __vector float vf1;
1123 
1124   vs8 = (__vector signed short) __builtin_pack_vector_int128 (__A, __A);
1125   vi4 = vec_vupklsh (vs8);
1126   vf1 = (__vector float) vec_ctf (vi4, 0);
1127 
1128   return (__m128) vf1;
1129 }
1130 
1131 /* Convert the four unsigned 16-bit values in A to SPFP form.  */
1132 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1133 _mm_cvtpu16_ps (__m64 __A)
1134 {
1135   const __vector unsigned short zero =
1136     { 0, 0, 0, 0, 0, 0, 0, 0 };
1137   __vector unsigned short vs8;
1138   __vector unsigned int vi4;
1139   __vector float vf1;
1140 
1141   vs8 = (__vector unsigned short) __builtin_pack_vector_int128 (__A, __A);
1142   vi4 = (__vector unsigned int) vec_vmrglh (vs8, zero);
1143   vf1 = (__vector float) vec_ctf (vi4, 0);
1144 
1145   return (__m128) vf1;
1146 }
1147 
1148 /* Convert the low four signed 8-bit values in A to SPFP form.  */
1149 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1150 _mm_cvtpi8_ps (__m64 __A)
1151 {
1152   __vector signed char vc16;
1153   __vector signed short vs8;
1154   __vector signed int vi4;
1155   __vector float vf1;
1156 
1157   vc16 = (__vector signed char) __builtin_pack_vector_int128 (__A, __A);
1158   vs8 = vec_vupkhsb (vc16);
1159   vi4 = vec_vupkhsh (vs8);
1160   vf1 = (__vector float) vec_ctf (vi4, 0);
1161 
1162   return (__m128) vf1;
1163 }
1164 
1165 /* Convert the low four unsigned 8-bit values in A to SPFP form.  */
1166 extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1167 
1168 _mm_cvtpu8_ps (__m64  __A)
1169 {
1170   const __vector unsigned char zero =
1171     { 0, 0, 0, 0, 0, 0, 0, 0 };
1172   __vector unsigned char vc16;
1173   __vector unsigned short vs8;
1174   __vector unsigned int vi4;
1175   __vector float vf1;
1176 
1177   vc16 = (__vector unsigned char) __builtin_pack_vector_int128 (__A, __A);
1178   vs8 = (__vector unsigned short) vec_vmrglb (vc16, zero);
1179   vi4 = (__vector unsigned int) vec_vmrghh (vs8,
1180 					    (__vector unsigned short) zero);
1181   vf1 = (__vector float) vec_ctf (vi4, 0);
1182 
1183   return (__m128) vf1;
1184 }
1185 
1186 /* Convert the four signed 32-bit values in A and B to SPFP form.  */
1187 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1188 _mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
1189 {
1190   __vector signed int vi4;
1191   __vector float vf4;
1192 
1193   vi4 = (__vector signed int) __builtin_pack_vector_int128 (__B, __A);
1194   vf4 = (__vector float) vec_ctf (vi4, 0);
1195   return (__m128) vf4;
1196 }
1197 
1198 /* Convert the four SPFP values in A to four signed 16-bit integers.  */
1199 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1200 _mm_cvtps_pi16(__m128 __A)
1201 {
1202   __v4sf rounded;
1203   __vector signed int temp;
1204   __vector __m64 result;
1205 
1206   rounded = vec_rint(__A);
1207   temp = vec_cts (rounded, 0);
1208   result = (__vector __m64) vec_pack (temp, temp);
1209 
1210   return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
1211 }
1212 
1213 /* Convert the four SPFP values in A to four signed 8-bit integers.  */
1214 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1215 _mm_cvtps_pi8(__m128 __A)
1216 {
1217   __v4sf rounded;
1218   __vector signed int tmp_i;
1219   static const __vector signed int zero = {0, 0, 0, 0};
1220   __vector signed short tmp_s;
1221   __vector signed char res_v;
1222   __m64 result;
1223 
1224   rounded = vec_rint(__A);
1225   tmp_i = vec_cts (rounded, 0);
1226   tmp_s = vec_pack (tmp_i, zero);
1227   res_v = vec_pack (tmp_s, tmp_s);
1228   result = (__m64) __builtin_unpack_vector_int128 ((__vector __int128)res_v, 0);
1229 
1230   return (result);
1231 }
1232 
1233 /* Selects four specific SPFP values from A and B based on MASK.  */
1234 extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1235 
1236 _mm_shuffle_ps (__m128  __A, __m128  __B, int const __mask)
1237 {
1238   unsigned long element_selector_10 = __mask & 0x03;
1239   unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1240   unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1241   unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1242   static const unsigned int permute_selectors[4] =
1243     {
1244 #ifdef __LITTLE_ENDIAN__
1245       0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1246 #elif __BIG_ENDIAN__
1247       0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203
1248 #endif
1249     };
1250   __vector unsigned int t;
1251 
1252 #ifdef __LITTLE_ENDIAN__
1253   t[0] = permute_selectors[element_selector_10];
1254   t[1] = permute_selectors[element_selector_32];
1255   t[2] = permute_selectors[element_selector_54] + 0x10101010;
1256   t[3] = permute_selectors[element_selector_76] + 0x10101010;
1257 #elif __BIG_ENDIAN__
1258   t[3] = permute_selectors[element_selector_10] + 0x10101010;
1259   t[2] = permute_selectors[element_selector_32] + 0x10101010;
1260   t[1] = permute_selectors[element_selector_54];
1261   t[0] = permute_selectors[element_selector_76];
1262 #endif
1263   return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
1264 }
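
/* Illustrative note (not from the original header): a hypothetical use of
   the shuffle above.

     __m128 a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);    a = { 1, 2, 3, 4 }
     __m128 b = _mm_set_ps (8.0f, 7.0f, 6.0f, 5.0f);    b = { 5, 6, 7, 8 }
     __m128 r = _mm_shuffle_ps (a, b, _MM_SHUFFLE (1, 0, 3, 2));
     r = { a[2], a[3], b[0], b[1] } = { 3.0, 4.0, 5.0, 6.0 }

   The two low selector fields pick elements from __A and the two high
   selector fields pick elements from __B, as on x86.  */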
1265 
1266 /* Selects and interleaves the upper two SPFP values from A and B.  */
1267 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1268 _mm_unpackhi_ps (__m128 __A, __m128 __B)
1269 {
1270   return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1271 }
1272 
1273 /* Selects and interleaves the lower two SPFP values from A and B.  */
1274 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275 _mm_unpacklo_ps (__m128 __A, __m128 __B)
1276 {
1277   return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1278 }
1279 
1280 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1281    the lower two values are passed through from A.  */
1282 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1283 _mm_loadh_pi (__m128 __A, __m64 const *__P)
1284 {
1285   __vector __m64 __a = (__vector __m64)__A;
1286   __vector __m64 __p = vec_splats(*__P);
1287   __a [1] = __p [1];
1288 
1289   return (__m128)__a;
1290 }
1291 
1292 /* Stores the upper two SPFP values of A into P.  */
1293 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1294 _mm_storeh_pi (__m64 *__P, __m128 __A)
1295 {
1296   __vector __m64 __a = (__vector __m64) __A;
1297 
1298   *__P = __a[1];
1299 }
1300 
1301 /* Moves the upper two values of B into the lower two values of A.  */
1302 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1303 _mm_movehl_ps (__m128 __A, __m128 __B)
1304 {
1305   return (__m128) vec_mergel ((__vector __m64)__B, (__vector __m64)__A);
1306 }
1307 
1308 /* Moves the lower two values of B into the upper two values of A.  */
1309 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1310 _mm_movelh_ps (__m128 __A, __m128 __B)
1311 {
1312   return (__m128) vec_mergeh ((__vector __m64)__A, (__vector __m64)__B);
1313 }
1314 
1315 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1316    the upper two values are passed through from A.  */
1317 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1318 _mm_loadl_pi (__m128 __A, __m64 const *__P)
1319 {
1320   __vector __m64 __a = (__vector __m64)__A;
1321   __vector __m64 __p = vec_splats(*__P);
1322   __a [0] = __p [0];
1323 
1324   return (__m128)__a;
1325 }
1326 
1327 /* Stores the lower two SPFP values of A into P.  */
1328 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1329 _mm_storel_pi (__m64 *__P, __m128 __A)
1330 {
1331   __vector __m64 __a = (__vector __m64) __A;
1332 
1333   *__P = __a[0];
1334 }
1335 
1336 #ifdef _ARCH_PWR8
1337 /* Intrinsic functions that require PowerISA 2.07 minimum.  */
1338 
1339 /* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
1340 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341 _mm_movemask_ps (__m128  __A)
1342 {
1343   __vector __m64 result;
1344   static const __vector unsigned int perm_mask =
1345     {
1346 #ifdef __LITTLE_ENDIAN__
1347 	0x00204060, 0x80808080, 0x80808080, 0x80808080
1348 #elif __BIG_ENDIAN__
1349       0x80808080, 0x80808080, 0x80808080, 0x00204060
1350 #endif
1351     };
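  /* Illustrative note (not from the original header): the selector bytes
     0x00, 0x20, 0x40 and 0x60 are the vector bit indices of the sign bit
     of each 32-bit element; vec_vbpermq gathers those four bits into the
     result, and the 0x80 bytes select a constant zero bit.  */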
1352 
1353   result = (__vector __m64) vec_vbpermq ((__vector unsigned char) __A,
1354 					 (__vector unsigned char) perm_mask);
1355 
1356 #ifdef __LITTLE_ENDIAN__
1357   return result[1];
1358 #elif __BIG_ENDIAN__
1359   return result[0];
1360 #endif
1361 }
1362 #endif /* _ARCH_PWR8 */
1363 
1364 /* Create a vector with all four elements equal to *P.  */
1365 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1366 _mm_load1_ps (float const *__P)
1367 {
1368   return _mm_set1_ps (*__P);
1369 }
1370 
1371 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1372 _mm_load_ps1 (float const *__P)
1373 {
1374   return _mm_load1_ps (__P);
1375 }
1376 
1377 /* Extracts one of the four words of A.  The selector N must be immediate.  */
1378 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1379 _mm_extract_pi16 (__m64 const __A, int const __N)
1380 {
1381   unsigned int shiftr = __N & 3;
1382 #ifdef __BIG_ENDIAN__
1383   shiftr = 3 - shiftr;
1384 #endif
1385 
1386   return ((__A >> (shiftr * 16)) & 0xffff);
1387 }
1388 
1389 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1390 _m_pextrw (__m64 const __A, int const __N)
1391 {
1392   return _mm_extract_pi16 (__A, __N);
1393 }
1394 
1395 /* Inserts word D into one of four words of A.  The selector N must be
1396    immediate.  */
1397 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1398 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1399 {
1400   const int shiftl = (__N & 3) * 16;
1401   const __m64 shiftD = (const __m64) __D << shiftl;
1402   const __m64 mask = 0xffffUL << shiftl;
1403   __m64 result = (__A & (~mask)) | (shiftD & mask);
1404 
1405   return (result);
1406 }
1407 
1408 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1409 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1410 {
1411   return _mm_insert_pi16 (__A, __D, __N);
1412 }
1413 
1414 /* Compute the element-wise maximum of signed 16-bit values.  */
1415 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1416 
1417 _mm_max_pi16 (__m64 __A, __m64 __B)
1418 {
1419 #if _ARCH_PWR8
1420   __vector signed short a, b, r;
1421   __vector __bool short c;
1422 
1423   a = (__vector signed short)vec_splats (__A);
1424   b = (__vector signed short)vec_splats (__B);
1425   c = (__vector __bool short)vec_cmpgt (a, b);
1426   r = vec_sel (b, a, c);
1427   return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1428 #else
1429   __m64_union m1, m2, res;
1430 
1431   m1.as_m64 = __A;
1432   m2.as_m64 = __B;
1433 
1434   res.as_short[0] =
1435       (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1436   res.as_short[1] =
1437       (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1438   res.as_short[2] =
1439       (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1440   res.as_short[3] =
1441       (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1442 
1443   return (__m64) res.as_m64;
1444 #endif
1445 }
1446 
1447 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1448 _m_pmaxsw (__m64 __A, __m64 __B)
1449 {
1450   return _mm_max_pi16 (__A, __B);
1451 }
1452 
1453 /* Compute the element-wise maximum of unsigned 8-bit values.  */
1454 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1455 _mm_max_pu8 (__m64 __A, __m64 __B)
1456 {
1457 #if _ARCH_PWR8
1458   __vector unsigned char a, b, r;
1459   __vector __bool char c;
1460 
1461   a = (__vector unsigned char)vec_splats (__A);
1462   b = (__vector unsigned char)vec_splats (__B);
1463   c = (__vector __bool char)vec_cmpgt (a, b);
1464   r = vec_sel (b, a, c);
1465   return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1466 #else
1467   __m64_union m1, m2, res;
1468   long i;
1469 
1470   m1.as_m64 = __A;
1471   m2.as_m64 = __B;
1472 
1473 
1474   for (i = 0; i < 8; i++)
1475     res.as_char[i] =
1476       ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1477 	  m1.as_char[i] : m2.as_char[i];
1478 
1479   return (__m64) res.as_m64;
1480 #endif
1481 }
1482 
1483 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1484 _m_pmaxub (__m64 __A, __m64 __B)
1485 {
1486   return _mm_max_pu8 (__A, __B);
1487 }
1488 
1489 /* Compute the element-wise minimum of signed 16-bit values.  */
1490 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1491 _mm_min_pi16 (__m64 __A, __m64 __B)
1492 {
1493 #if _ARCH_PWR8
1494   __vector signed short a, b, r;
1495   __vector __bool short c;
1496 
1497   a = (__vector signed short)vec_splats (__A);
1498   b = (__vector signed short)vec_splats (__B);
1499   c = (__vector __bool short)vec_cmplt (a, b);
1500   r = vec_sel (b, a, c);
1501   return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1502 #else
1503   __m64_union m1, m2, res;
1504 
1505   m1.as_m64 = __A;
1506   m2.as_m64 = __B;
1507 
1508   res.as_short[0] =
1509       (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1510   res.as_short[1] =
1511       (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1512   res.as_short[2] =
1513       (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1514   res.as_short[3] =
1515       (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1516 
1517   return (__m64) res.as_m64;
1518 #endif
1519 }
1520 
1521 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1522 _m_pminsw (__m64 __A, __m64 __B)
1523 {
1524   return _mm_min_pi16 (__A, __B);
1525 }
1526 
1527 /* Compute the element-wise minimum of unsigned 8-bit values.  */
1528 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1529 _mm_min_pu8 (__m64 __A, __m64 __B)
1530 {
1531 #if _ARCH_PWR8
1532   __vector unsigned char a, b, r;
1533   __vector __bool char c;
1534 
1535   a = (__vector unsigned char)vec_splats (__A);
1536   b = (__vector unsigned char)vec_splats (__B);
1537   c = (__vector __bool char)vec_cmplt (a, b);
1538   r = vec_sel (b, a, c);
1539   return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
1540 #else
1541   __m64_union m1, m2, res;
1542   long i;
1543 
1544   m1.as_m64 = __A;
1545   m2.as_m64 = __B;
1546 
1547 
1548   for (i = 0; i < 8; i++)
1549     res.as_char[i] =
1550       ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1551 	  m1.as_char[i] : m2.as_char[i];
1552 
1553   return (__m64) res.as_m64;
1554 #endif
1555 }
1556 
1557 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1558 _m_pminub (__m64 __A, __m64 __B)
1559 {
1560   return _mm_min_pu8 (__A, __B);
1561 }
1562 
1563 /* Create an 8-bit mask of the signs of 8-bit values.  */
1564 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1565 _mm_movemask_pi8 (__m64 __A)
1566 {
1567   unsigned long p = 0x0008101820283038UL; // permute control for sign bits
1568 
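  /* Each byte of the control word above is a bit index (0, 8, ..., 56,
     numbered from the most significant bit) that selects the sign bit
     of one byte of __A; bpermd gathers those eight bits into the
     low-order byte of the result, matching the pmovmskb semantics.  */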
1569   return __builtin_bpermd (p, __A);
1570 }
1571 
1572 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1573 _m_pmovmskb (__m64 __A)
1574 {
1575   return _mm_movemask_pi8 (__A);
1576 }
1577 
1578 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1579    in B and produce the high 16 bits of the 32-bit results.  */
1580 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1581 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1582 {
1583   __vector unsigned short a, b;
1584   __vector unsigned short c;
1585   __vector unsigned int w0, w1;
1586   __vector unsigned char xform1 = {
1587       0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
1588       0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
1589     };
1590 
1591   a = (__vector unsigned short)vec_splats (__A);
1592   b = (__vector unsigned short)vec_splats (__B);
1593 
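  /* vmuleuh/vmulouh form the full 32-bit products of the even and odd
     halfword pairs; the permute mask above then extracts the high 16
     bits of each product and interleaves them back into element order.  */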
1594   w0 = vec_vmuleuh (a, b);
1595   w1 = vec_vmulouh (a, b);
1596   c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1597 
1598   return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1599 }
1600 
1601 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1602 _m_pmulhuw (__m64 __A, __m64 __B)
1603 {
1604   return _mm_mulhi_pu16 (__A, __B);
1605 }
1606 
1607 /* Return a combination of the four 16-bit values in A.  The selector
1608    must be an immediate.  */
1609 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1610 _mm_shuffle_pi16 (__m64 __A, int const __N)
1611 {
1612   unsigned long element_selector_10 = __N & 0x03;
1613   unsigned long element_selector_32 = (__N >> 2) & 0x03;
1614   unsigned long element_selector_54 = (__N >> 4) & 0x03;
1615   unsigned long element_selector_76 = (__N >> 6) & 0x03;
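  /* Each table entry supplies the two vec_perm byte indexes that select
     one 16-bit element of __A from the splatted vector, stored so that
     the byte order matches the target endianness.  */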
1616   static const unsigned short permute_selectors[4] =
1617     {
1618 #ifdef __LITTLE_ENDIAN__
1619 	      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1620 #elif __BIG_ENDIAN__
1621 	      0x0607, 0x0405, 0x0203, 0x0001
1622 #endif
1623     };
1624   __m64_union t;
1625   __vector __m64 a, p, r;
1626 
1627 #ifdef __LITTLE_ENDIAN__
1628   t.as_short[0] = permute_selectors[element_selector_10];
1629   t.as_short[1] = permute_selectors[element_selector_32];
1630   t.as_short[2] = permute_selectors[element_selector_54];
1631   t.as_short[3] = permute_selectors[element_selector_76];
1632 #elif __BIG_ENDIAN__
1633   t.as_short[3] = permute_selectors[element_selector_10];
1634   t.as_short[2] = permute_selectors[element_selector_32];
1635   t.as_short[1] = permute_selectors[element_selector_54];
1636   t.as_short[0] = permute_selectors[element_selector_76];
1637 #endif
1638   p = vec_splats (t.as_m64);
1639   a = vec_splats (__A);
1640   r = vec_perm (a, a, (__vector unsigned char)p);
1641   return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
1642 }
1643 
1644 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1645 _m_pshufw (__m64 __A, int const __N)
1646 {
1647   return _mm_shuffle_pi16 (__A, __N);
1648 }
1649 
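/* Illustrative usage (not part of this header): the selector is
   normally built with the _MM_SHUFFLE macro.  For example

     __m64 r = _mm_shuffle_pi16 (a, _MM_SHUFFLE (0, 1, 2, 3));

   reverses the four 16-bit elements of A: result element 0 comes from
   A element 3, element 1 from element 2, and so on.  */
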
1650 /* Conditionally store byte elements of A into P.  The high bit of each
1651    byte in the selector N determines whether the corresponding byte from
1652    A is stored.  */
1653 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1654 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1655 {
1656   __m64 hibit = 0x8080808080808080UL;
1657   __m64 mask, tmp;
1658   __m64 *p = (__m64*)__P;
1659 
1660   tmp = *p;
1661   mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1662   tmp = (tmp & (~mask)) | (__A & mask);
1663   *p = tmp;
1664 }
1665 
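/* Illustrative usage (not part of this header): copy the low four
   bytes of A into the low half of the eight-byte block at buf (byte
   order follows the target endianness).  Only the high bit of each
   selector byte matters; buf is assumed to point at eight writable
   bytes:

     __m64 sel = _mm_cvtsi64_m64 (0x00000000ffffffffLL);
     _mm_maskmove_si64 (a, sel, buf);
 */
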
1666 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1667 _m_maskmovq (__m64 __A, __m64 __N, char *__P)
1668 {
1669   _mm_maskmove_si64 (__A, __N, __P);
1670 }
1671 
1672 /* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
1673 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1674 _mm_avg_pu8 (__m64 __A, __m64 __B)
1675 {
1676   __vector unsigned char a, b, c;
1677 
1678   a = (__vector unsigned char)vec_splats (__A);
1679   b = (__vector unsigned char)vec_splats (__B);
1680   c = vec_avg (a, b);
1681   return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1682 }
1683 
1684 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1685 _m_pavgb (__m64 __A, __m64 __B)
1686 {
1687   return _mm_avg_pu8 (__A, __B);
1688 }
1689 
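/* Illustrative usage (not part of this header), via the <mmintrin.h>
   helper _mm_cvtsi64_m64: the average rounds up, i.e. (a + b + 1) >> 1
   per byte, so averaging 1 and 2 gives 2:

     __m64 r = _mm_avg_pu8 (_mm_cvtsi64_m64 (0x0101010101010101LL),
                            _mm_cvtsi64_m64 (0x0202020202020202LL));
     // each byte of r is 0x02
 */
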
1690 /* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
1691 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1692 _mm_avg_pu16 (__m64 __A, __m64 __B)
1693 {
1694   __vector unsigned short a, b, c;
1695 
1696   a = (__vector unsigned short)vec_splats (__A);
1697   b = (__vector unsigned short)vec_splats (__B);
1698   c = vec_avg (a, b);
1699   return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
1700 }
1701 
1702 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1703 _m_pavgw (__m64 __A, __m64 __B)
1704 {
1705   return _mm_avg_pu16 (__A, __B);
1706 }
1707 
1708 /* Compute the sum of the absolute differences of the unsigned 8-bit
1709    values in A and B.  Return the value in the lower 16-bit word; the
1710    upper words are cleared.  */
1711 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1712 _mm_sad_pu8 (__m64 __A, __m64 __B)
1713 {
1714   __vector unsigned char a, b;
1715   __vector unsigned char vmin, vmax, vabsdiff;
1716   __vector signed int vsum;
1717   const __vector unsigned int zero =
1718     { 0, 0, 0, 0 };
1719   unsigned short result;
1720 
1721   a = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __A);
1722   b = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __B);
1723   vmin = vec_min (a, b);
1724   vmax = vec_max (a, b);
1725   vabsdiff = vec_sub (vmax, vmin);
1726   /* Sum four groups of bytes into integers.  */
1727   vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1728   /* Sum across four integers with integer result.  */
1729   vsum = vec_sums (vsum, (__vector signed int) zero);
1730   /* The sum is in the right most 32-bits of the vector result.
1731      Transfer to a GPR and truncate to 16 bits.  */
1732   result = vsum[3];
1733   return (result);
1734 }
1735 
1736 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1737 _m_psadbw (__m64 __A, __m64 __B)
1738 {
1739   return _mm_sad_pu8 (__A, __B);
1740 }
1741 
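/* Illustrative usage (not part of this header), via the <mmintrin.h>
   helper _mm_cvtsi64_m64: with every byte of A equal to 0x03 and every
   byte of B equal to 0x01, each absolute difference is 2 and the eight
   differences sum to 16:

     __m64 s = _mm_sad_pu8 (_mm_cvtsi64_m64 (0x0303030303030303LL),
                            _mm_cvtsi64_m64 (0x0101010101010101LL));
     // s == 16, held in the low 16 bits; the upper bits are zero
 */
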
1742 /* Stores the data in A to the address P without polluting the caches.  */
1743 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1744 _mm_stream_pi (__m64 *__P, __m64 __A)
1745 {
1746   /* Use the data cache block touch for store transient.  */
1747   __asm__ (
1748     "	dcbtstt	0,%0"
1749     :
1750     : "b" (__P)
1751     : "memory"
1752   );
1753   *__P = __A;
1754 }
1755 
1756 /* Likewise.  The address must be 16-byte aligned.  */
1757 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1758 _mm_stream_ps (float *__P, __m128 __A)
1759 {
1760   /* Use the data cache block touch for store transient.  */
1761   __asm__ (
1762     "	dcbtstt	0,%0"
1763     :
1764     : "b" (__P)
1765     : "memory"
1766   );
1767   _mm_store_ps (__P, __A);
1768 }
1769 
1770 /* Guarantees that every preceding store is globally visible before
1771    any subsequent store.  */
1772 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1773 _mm_sfence (void)
1774 {
1775   /* Generate a light weight sync.  */
1776   __atomic_thread_fence (__ATOMIC_RELEASE);
1777 }
1778 
1779 /* The execution of the next instruction is delayed by an implementation
1780    specific amount of time.  The instruction does not modify the
1781    architectural state.  (In the X86 header this sits after the
1782    pop_options pragma because it does not require SSE support; the
1783    PAUSE encoding is a NOP on processors that do not support it.)  */
1784 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1785 _mm_pause (void)
1786 {
1787   /* There is no exact match with this construct, but the following is
1788      close to the desired effect.  */
1789 #if _ARCH_PWR8
1790   /* On power8 and later processors we can depend on Program Priority
1791      (PRI) and the associated "very low" PRI setting.  Since we don't
1792      know what PRI this thread is running at we: 1) save the current
1793      PRI from the PPR SPR into a local GPR, 2) set the PRI to "very
1794      low" via the special or 31,31,31 encoding, 3) issue an "isync" to
1795      ensure the PRI change takes effect before we execute any more
1796      instructions.
1797      Now we can execute a lwsync (release barrier) while we execute
1798      this thread at "very low" PRI.  Finally we restore the original
1799      PRI and continue execution.  */
1800   unsigned long __PPR;
1801 
1802   __asm__ volatile (
1803     "	mfppr	%0;"
1804     "   or 31,31,31;"
1805     "   isync;"
1806     "   lwsync;"
1807     "   isync;"
1808     "   mtppr	%0;"
1809     : "=r" (__PPR)
1810     :
1811     : "memory"
1812   );
1813 #else
1814   /* For older processors, where we may not even have Program Priority
1815      controls, we can only depend on a Heavy Weight Sync.  */
1816   __atomic_thread_fence (__ATOMIC_SEQ_CST);
1817 #endif
1818 }
1819 
1820 /* Transpose the 4x4 matrix composed of row[0-3].  */
1821 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
1822 do {									\
1823   __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
1824   __v4sf __t0 = vec_vmrghw (__r0, __r1);			\
1825   __v4sf __t1 = vec_vmrghw (__r2, __r3);			\
1826   __v4sf __t2 = vec_vmrglw (__r0, __r1);			\
1827   __v4sf __t3 = vec_vmrglw (__r2, __r3);			\
1828   (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, 	\
1829 			       (__vector long long)__t1);	\
1830   (row1) = (__v4sf)vec_mergel ((__vector long long)__t0,	\
1831 			       (__vector long long)__t1);	\
1832   (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2,	\
1833 			       (__vector long long)__t3);	\
1834   (row3) = (__v4sf)vec_mergel ((__vector long long)__t2,	\
1835 			       (__vector long long)__t3);	\
1836 } while (0)
1837 
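/* Illustrative usage (not part of this header): transpose a 4x4
   matrix held one row per __m128.  Afterwards row i holds what was
   previously column i:

     __m128 r0 = _mm_set_ps (3.0f, 2.0f, 1.0f, 0.0f);
     __m128 r1 = _mm_set_ps (7.0f, 6.0f, 5.0f, 4.0f);
     __m128 r2 = _mm_set_ps (11.0f, 10.0f, 9.0f, 8.0f);
     __m128 r3 = _mm_set_ps (15.0f, 14.0f, 13.0f, 12.0f);
     _MM_TRANSPOSE4_PS (r0, r1, r2, r3);
     // r0 is now { 0.0f, 4.0f, 8.0f, 12.0f }
 */
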
1838 /* For backward source compatibility.  */
1839 //# include <emmintrin.h>
1840 
1841 #endif /* _XMMINTRIN_H_INCLUDED */
1842