1 /* Copyright (C) 2002-2020 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 Under Section 7 of GPL version 3, you are granted additional
16 permissions described in the GCC Runtime Library Exception, version
17 3.1, as published by the Free Software Foundation.
18
19 You should have received a copy of the GNU General Public License and
20 a copy of the GCC Runtime Library Exception along with this program;
21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 <http://www.gnu.org/licenses/>. */
23
24 /* Implemented from the specification included in the Intel C++ Compiler
25 User Guide and Reference, version 9.0. */
26
27 #ifndef NO_WARN_X86_INTRINSICS
28 /* This header is distributed to simplify porting x86_64 code that
29 makes explicit use of Intel intrinsics to powerpc64le.
30 It is the user's responsibility to determine if the results are
31 acceptable and make additional changes as necessary.
32 Note that much code that uses Intel intrinsics can be rewritten in
33 standard C or GNU C extensions, which are more portable and better
34 optimized across multiple targets.
35
36 In the specific case of X86 SSE (__m128) intrinsics, the PowerPC
37 VMX/VSX ISA is a good match for vector float SIMD operations.
38    However, scalar float operations in vector (XMM) registers require
39    the POWER8 VSX ISA (2.07) level. There are also important
40    differences in data format and placement of float scalars in the
41    vector register. For PowerISA, scalar floats in FPRs (the leftmost
42    64 bits of the low 32 VSRs) are kept in double format, while X86_64
43    SSE uses the rightmost 32 bits of the XMM register. These differences
44    require extra steps on POWER to match the SSE scalar float semantics.
45
46 Most SSE scalar float intrinsic operations can be performed more
47 efficiently as C language float scalar operations or optimized to
48 use vector SIMD operations. We recommend this for new applications.
49
50    Another difference is the format and details of the X86_64 MXCSR vs
51    the PowerISA FPSCR / VSCR registers. We recommend applications
52    replace direct access to the MXCSR with the more portable <fenv.h>
53    POSIX APIs. */
54 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
55 #endif
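/* As a hedged illustration of the <fenv.h> recommendation above (example
   only, not part of this header's API): code that pokes the x86 MXCSR
   rounding control directly can usually be expressed with the standard
   C99 floating-point environment interfaces instead, e.g.

     #include <fenv.h>

     int __saved = fegetround ();   // save the current rounding mode
     fesetround (FE_TOWARDZERO);    // round toward zero, like MXCSR RC=11b
     ...                            // scalar/vector float work
     fesetround (__saved);          // restore the previous mode

   fegetround, fesetround and FE_TOWARDZERO are standard <fenv.h> names;
   the MXCSR RC encoding is mentioned only for reference.  */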
56
57 #ifndef _XMMINTRIN_H_INCLUDED
58 #define _XMMINTRIN_H_INCLUDED
59
60 /* Define a four-element permute (shuffle) mask from four 2-bit selectors. */
61 #define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
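/* For example, _MM_SHUFFLE(3,2,1,0) == (3<<6)|(2<<4)|(1<<2)|0 == 0xE4, the
   identity selector; each 2-bit field picks the source element for one
   result position, with the low field (z) selecting the element that
   lands in result position 0.  */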
62
63 #include <altivec.h>
64
65 /* Avoid collisions between altivec.h and strict adherence to C++ and
66 C11 standards. This should eventually be done inside altivec.h itself,
67 but only after testing a full distro build. */
68 #if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
69 (defined(__STDC_VERSION__) && \
70 __STDC_VERSION__ >= 201112L))
71 #undef vector
72 #undef pixel
73 #undef bool
74 #endif
75
76 #include <assert.h>
77
78 /* We need type definitions from the MMX header file. */
79 #include <mmintrin.h>
80
81 /* Get _mm_malloc () and _mm_free (). */
82 #include <mm_malloc.h>
83
84 /* The Intel API is flexible enough that we must allow aliasing with other
85 vector types, and their scalar components. */
86 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
87
88 /* Unaligned version of the same type. */
89 typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
90 __aligned__ (1)));
91
92 /* Internal data types for implementing the intrinsics. */
93 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
94
95 /* Create an undefined vector. */
96 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
97 _mm_undefined_ps (void)
98 {
99 __m128 __Y = __Y;
100 return __Y;
101 }
102
103 /* Create a vector of zeros. */
104 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
105 _mm_setzero_ps (void)
106 {
107 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
108 }
109
110 /* Load four SPFP values from P. The address must be 16-byte aligned. */
111 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
112 _mm_load_ps (float const *__P)
113 {
114 assert(((unsigned long)__P & 0xfUL) == 0UL);
115 return ((__m128)vec_ld(0, (__v4sf*)__P));
116 }
117
118 /* Load four SPFP values from P. The address need not be 16-byte aligned. */
119 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120 _mm_loadu_ps (float const *__P)
121 {
122 return (vec_vsx_ld(0, __P));
123 }
124
125 /* Load four SPFP values in reverse order. The address must be aligned. */
126 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
127 _mm_loadr_ps (float const *__P)
128 {
129 __v4sf __tmp;
130 __m128 __result;
131 static const __vector unsigned char __permute_vector =
132 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
133 0x17, 0x10, 0x11, 0x12, 0x13 };
134
135 __tmp = vec_ld (0, (__v4sf *) __P);
136 __result = (__m128) vec_perm (__tmp, __tmp, __permute_vector);
137 return __result;
138 }
139
140 /* Create a vector with all four elements equal to F. */
141 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142 _mm_set1_ps (float __F)
143 {
144 return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
145 }
146
147 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148 _mm_set_ps1 (float __F)
149 {
150 return _mm_set1_ps (__F);
151 }
152
153 /* Create the vector [Z Y X W]. */
154 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
155 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
156 {
157 return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
158 }
159
160 /* Create the vector [W X Y Z]. */
161 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
162 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
163 {
164 return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
165 }
166
167 /* Store four SPFP values. The address must be 16-byte aligned. */
168 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
169 _mm_store_ps (float *__P, __m128 __A)
170 {
171 assert(((unsigned long)__P & 0xfUL) == 0UL);
172 vec_st((__v4sf)__A, 0, (__v4sf*)__P);
173 }
174
175 /* Store four SPFP values. The address need not be 16-byte aligned. */
176 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
177 _mm_storeu_ps (float *__P, __m128 __A)
178 {
179 *(__m128_u *)__P = __A;
180 }
181
182 /* Store four SPFP values in reverse order. The address must be aligned. */
183 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
184 _mm_storer_ps (float *__P, __m128 __A)
185 {
186 __v4sf __tmp;
187 static const __vector unsigned char __permute_vector =
188 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
189 0x17, 0x10, 0x11, 0x12, 0x13 };
190
191 __tmp = (__m128) vec_perm (__A, __A, __permute_vector);
192
193 _mm_store_ps (__P, __tmp);
194 }
195
196 /* Store the lower SPFP value across four words. */
197 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
198 _mm_store1_ps (float *__P, __m128 __A)
199 {
200 __v4sf __va = vec_splat((__v4sf)__A, 0);
201 _mm_store_ps (__P, __va);
202 }
203
204 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
205 _mm_store_ps1 (float *__P, __m128 __A)
206 {
207 _mm_store1_ps (__P, __A);
208 }
209
210 /* Create a vector with element 0 as F and the rest zero. */
211 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
212 _mm_set_ss (float __F)
213 {
214 return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
215 }
216
217 /* Sets the low SPFP value of A from the low value of B. */
218 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
219 _mm_move_ss (__m128 __A, __m128 __B)
220 {
221 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
222
223 return (vec_sel ((__v4sf)__A, (__v4sf)__B, __mask));
224 }
225
226 /* Create a vector with element 0 as *P and the rest zero. */
227 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228 _mm_load_ss (float const *__P)
229 {
230 return _mm_set_ss (*__P);
231 }
232
233 /* Stores the lower SPFP value. */
234 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
235 _mm_store_ss (float *__P, __m128 __A)
236 {
237 *__P = ((__v4sf)__A)[0];
238 }
239
240 /* Perform the respective operation on the lower SPFP (single-precision
241 floating-point) values of A and B; the upper three SPFP values are
242 passed through from A. */
243
244 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
245 _mm_add_ss (__m128 __A, __m128 __B)
246 {
247 #ifdef _ARCH_PWR7
248 __m128 __a, __b, __c;
249 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
250   /* PowerISA VSX does not allow partial (for just the lower float)
251      results. So to ensure we don't generate spurious exceptions
252      (from the upper float values) we splat the lower float
253      before we do the operation. */
254 __a = vec_splat (__A, 0);
255 __b = vec_splat (__B, 0);
256 __c = __a + __b;
257 /* Then we merge the lower float result with the original upper
258 float elements from __A. */
259 return (vec_sel (__A, __c, __mask));
260 #else
261 __A[0] = __A[0] + __B[0];
262 return (__A);
263 #endif
264 }
265
266 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
267 _mm_sub_ss (__m128 __A, __m128 __B)
268 {
269 #ifdef _ARCH_PWR7
270 __m128 __a, __b, __c;
271 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
272   /* PowerISA VSX does not allow partial (for just the lower float)
273      results. So to ensure we don't generate spurious exceptions
274      (from the upper float values) we splat the lower float
275      before we do the operation. */
276 __a = vec_splat (__A, 0);
277 __b = vec_splat (__B, 0);
278 __c = __a - __b;
279 /* Then we merge the lower float result with the original upper
280 float elements from __A. */
281 return (vec_sel (__A, __c, __mask));
282 #else
283 __A[0] = __A[0] - __B[0];
284 return (__A);
285 #endif
286 }
287
288 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
289 _mm_mul_ss (__m128 __A, __m128 __B)
290 {
291 #ifdef _ARCH_PWR7
292 __m128 __a, __b, __c;
293 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
294   /* PowerISA VSX does not allow partial (for just the lower float)
295      results. So to ensure we don't generate spurious exceptions
296      (from the upper float values) we splat the lower float
297      before we do the operation. */
298 __a = vec_splat (__A, 0);
299 __b = vec_splat (__B, 0);
300 __c = __a * __b;
301 /* Then we merge the lower float result with the original upper
302 float elements from __A. */
303 return (vec_sel (__A, __c, __mask));
304 #else
305 __A[0] = __A[0] * __B[0];
306 return (__A);
307 #endif
308 }
309
310 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311 _mm_div_ss (__m128 __A, __m128 __B)
312 {
313 #ifdef _ARCH_PWR7
314 __m128 __a, __b, __c;
315 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
316   /* PowerISA VSX does not allow partial (for just the lower float)
317      results. So to ensure we don't generate spurious exceptions
318      (from the upper float values) we splat the lower float
319      before we do the operation. */
320 __a = vec_splat (__A, 0);
321 __b = vec_splat (__B, 0);
322 __c = __a / __b;
323 /* Then we merge the lower float result with the original upper
324 float elements from __A. */
325 return (vec_sel (__A, __c, __mask));
326 #else
327 __A[0] = __A[0] / __B[0];
328 return (__A);
329 #endif
330 }
331
332 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
333 _mm_sqrt_ss (__m128 __A)
334 {
335 __m128 __a, __c;
336 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
337   /* PowerISA VSX does not allow partial (for just the lower float)
338    * results. So to ensure we don't generate spurious exceptions
339    * (from the upper float values) we splat the lower float
340    * before we do the operation. */
341 __a = vec_splat (__A, 0);
342 __c = vec_sqrt (__a);
343 /* Then we merge the lower float result with the original upper
344 * float elements from __A. */
345 return (vec_sel (__A, __c, __mask));
346 }
347
348 /* Perform the respective operation on the four SPFP values in A and B. */
349 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
350 _mm_add_ps (__m128 __A, __m128 __B)
351 {
352 return (__m128) ((__v4sf)__A + (__v4sf)__B);
353 }
354
355 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
356 _mm_sub_ps (__m128 __A, __m128 __B)
357 {
358 return (__m128) ((__v4sf)__A - (__v4sf)__B);
359 }
360
361 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
362 _mm_mul_ps (__m128 __A, __m128 __B)
363 {
364 return (__m128) ((__v4sf)__A * (__v4sf)__B);
365 }
366
367 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
368 _mm_div_ps (__m128 __A, __m128 __B)
369 {
370 return (__m128) ((__v4sf)__A / (__v4sf)__B);
371 }
372
373 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
374 _mm_sqrt_ps (__m128 __A)
375 {
376 return (vec_sqrt ((__v4sf)__A));
377 }
378
379 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
380 _mm_rcp_ps (__m128 __A)
381 {
382 return (vec_re ((__v4sf)__A));
383 }
384
385 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
386 _mm_rsqrt_ps (__m128 __A)
387 {
388 return (vec_rsqrte (__A));
389 }
390
391 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
392 _mm_rcp_ss (__m128 __A)
393 {
394 __m128 __a, __c;
395 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
396   /* PowerISA VSX does not allow partial (for just the lower float)
397    * results. So to ensure we don't generate spurious exceptions
398    * (from the upper float values) we splat the lower float
399    * before we do the operation. */
400 __a = vec_splat (__A, 0);
401 __c = _mm_rcp_ps (__a);
402 /* Then we merge the lower float result with the original upper
403 * float elements from __A. */
404 return (vec_sel (__A, __c, __mask));
405 }
406
407 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
408 _mm_rsqrt_ss (__m128 __A)
409 {
410 __m128 __a, __c;
411 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
412   /* PowerISA VSX does not allow partial (for just the lower float)
413    * results. So to ensure we don't generate spurious exceptions
414    * (from the upper float values) we splat the lower float
415    * before we do the operation. */
416 __a = vec_splat (__A, 0);
417 __c = vec_rsqrte (__a);
418 /* Then we merge the lower float result with the original upper
419 * float elements from __A. */
420 return (vec_sel (__A, __c, __mask));
421 }
422
423 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
424 _mm_min_ss (__m128 __A, __m128 __B)
425 {
426 __v4sf __a, __b, __c;
427 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
428 /* PowerISA VSX does not allow partial (for just lower float)
429    * results. So to ensure we don't generate spurious exceptions
430    * (from the upper float values) we splat the lower float
431    * before we do the operation. */
432 __a = vec_splat ((__v4sf)__A, 0);
433 __b = vec_splat ((__v4sf)__B, 0);
434 __c = vec_min (__a, __b);
435 /* Then we merge the lower float result with the original upper
436 * float elements from __A. */
437 return (vec_sel ((__v4sf)__A, __c, __mask));
438 }
439
440 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
441 _mm_max_ss (__m128 __A, __m128 __B)
442 {
443 __v4sf __a, __b, __c;
444 static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0};
445 /* PowerISA VSX does not allow partial (for just lower float)
446    * results. So to ensure we don't generate spurious exceptions
447    * (from the upper float values) we splat the lower float
448    * before we do the operation. */
449 __a = vec_splat (__A, 0);
450 __b = vec_splat (__B, 0);
451 __c = vec_max (__a, __b);
452 /* Then we merge the lower float result with the original upper
453 * float elements from __A. */
454 return (vec_sel ((__v4sf)__A, __c, __mask));
455 }
456
457 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
458 _mm_min_ps (__m128 __A, __m128 __B)
459 {
460 __vector __bool int __m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
461 return vec_sel (__B, __A, __m);
462 }
463
464 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465 _mm_max_ps (__m128 __A, __m128 __B)
466 {
467 __vector __bool int __m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
468 return vec_sel (__B, __A, __m);
469 }
470
471 /* Perform logical bit-wise operations on 128-bit values. */
472 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
473 _mm_and_ps (__m128 __A, __m128 __B)
474 {
475 return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
476 // return __builtin_ia32_andps (__A, __B);
477 }
478
479 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
480 _mm_andnot_ps (__m128 __A, __m128 __B)
481 {
482 return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
483 }
484
485 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
486 _mm_or_ps (__m128 __A, __m128 __B)
487 {
488 return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
489 }
490
491 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
492 _mm_xor_ps (__m128 __A, __m128 __B)
493 {
494 return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
495 }
496
497 /* Perform a comparison on the four SPFP values of A and B. For each
498 element, if the comparison is true, place a mask of all ones in the
499 result, otherwise a mask of zeros. */
500 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
501 _mm_cmpeq_ps (__m128 __A, __m128 __B)
502 {
503 return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
504 }
505
506 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
507 _mm_cmplt_ps (__m128 __A, __m128 __B)
508 {
509 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
510 }
511
512 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
513 _mm_cmple_ps (__m128 __A, __m128 __B)
514 {
515 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
516 }
517
518 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
519 _mm_cmpgt_ps (__m128 __A, __m128 __B)
520 {
521 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
522 }
523
524 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
525 _mm_cmpge_ps (__m128 __A, __m128 __B)
526 {
527 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
528 }
529
530 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
531 _mm_cmpneq_ps (__m128 __A, __m128 __B)
532 {
533 __v4sf __temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
534 return ((__m128)vec_nor (__temp, __temp));
535 }
536
537 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
538 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
539 {
540 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
541 }
542
543 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
544 _mm_cmpnle_ps (__m128 __A, __m128 __B)
545 {
546 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
547 }
548
549 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
550 _mm_cmpngt_ps (__m128 __A, __m128 __B)
551 {
552 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
553 }
554
555 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
556 _mm_cmpnge_ps (__m128 __A, __m128 __B)
557 {
558 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
559 }
560
561 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
562 _mm_cmpord_ps (__m128 __A, __m128 __B)
563 {
564 __vector unsigned int __a, __b;
565 __vector unsigned int __c, __d;
566 static const __vector unsigned int __float_exp_mask =
567 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
568
569 __a = (__vector unsigned int) vec_abs ((__v4sf)__A);
570 __b = (__vector unsigned int) vec_abs ((__v4sf)__B);
571 __c = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __a);
572 __d = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __b);
573 return ((__m128 ) vec_and (__c, __d));
574 }
575
576 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
577 _mm_cmpunord_ps (__m128 __A, __m128 __B)
578 {
579 __vector unsigned int __a, __b;
580 __vector unsigned int __c, __d;
581 static const __vector unsigned int __float_exp_mask =
582 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
583
584 __a = (__vector unsigned int) vec_abs ((__v4sf)__A);
585 __b = (__vector unsigned int) vec_abs ((__v4sf)__B);
586 __c = (__vector unsigned int) vec_cmpgt (__a, __float_exp_mask);
587 __d = (__vector unsigned int) vec_cmpgt (__b, __float_exp_mask);
588 return ((__m128 ) vec_or (__c, __d));
589 }
590
591 /* Perform a comparison on the lower SPFP values of A and B. If the
592 comparison is true, place a mask of all ones in the result, otherwise a
593 mask of zeros. The upper three SPFP values are passed through from A. */
594 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
595 _mm_cmpeq_ss (__m128 __A, __m128 __B)
596 {
597 static const __vector unsigned int __mask =
598 { 0xffffffff, 0, 0, 0 };
599 __v4sf __a, __b, __c;
600 /* PowerISA VMX does not allow partial (for just element 0)
601    * results. So to ensure we don't generate spurious exceptions
602    * (from the upper elements) we splat the lower float
603    * before we do the operation. */
604 __a = vec_splat ((__v4sf) __A, 0);
605 __b = vec_splat ((__v4sf) __B, 0);
606 __c = (__v4sf) vec_cmpeq (__a, __b);
607 /* Then we merge the lower float result with the original upper
608 * float elements from __A. */
609 return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
610 }
611
612 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
613 _mm_cmplt_ss (__m128 __A, __m128 __B)
614 {
615 static const __vector unsigned int __mask =
616 { 0xffffffff, 0, 0, 0 };
617 __v4sf __a, __b, __c;
618 /* PowerISA VMX does not allow partial (for just element 0)
619    * results. So to ensure we don't generate spurious exceptions
620    * (from the upper elements) we splat the lower float
621    * before we do the operation. */
622 __a = vec_splat ((__v4sf) __A, 0);
623 __b = vec_splat ((__v4sf) __B, 0);
624 __c = (__v4sf) vec_cmplt(__a, __b);
625 /* Then we merge the lower float result with the original upper
626 * float elements from __A. */
627 return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
628 }
629
630 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
631 _mm_cmple_ss (__m128 __A, __m128 __B)
632 {
633 static const __vector unsigned int __mask =
634 { 0xffffffff, 0, 0, 0 };
635 __v4sf __a, __b, __c;
636 /* PowerISA VMX does not allow partial (for just element 0)
637    * results. So to ensure we don't generate spurious exceptions
638    * (from the upper elements) we splat the lower float
639    * before we do the operation. */
640 __a = vec_splat ((__v4sf) __A, 0);
641 __b = vec_splat ((__v4sf) __B, 0);
642 __c = (__v4sf) vec_cmple(__a, __b);
643 /* Then we merge the lower float result with the original upper
644 * float elements from __A. */
645 return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
646 }
647
648 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
649 _mm_cmpgt_ss (__m128 __A, __m128 __B)
650 {
651 static const __vector unsigned int __mask =
652 { 0xffffffff, 0, 0, 0 };
653 __v4sf __a, __b, __c;
654 /* PowerISA VMX does not allow partial (for just element 0)
655    * results. So to ensure we don't generate spurious exceptions
656    * (from the upper elements) we splat the lower float
657    * before we do the operation. */
658 __a = vec_splat ((__v4sf) __A, 0);
659 __b = vec_splat ((__v4sf) __B, 0);
660 __c = (__v4sf) vec_cmpgt(__a, __b);
661 /* Then we merge the lower float result with the original upper
662 * float elements from __A. */
663 return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
664 }
665
666 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
667 _mm_cmpge_ss (__m128 __A, __m128 __B)
668 {
669 static const __vector unsigned int __mask =
670 { 0xffffffff, 0, 0, 0 };
671 __v4sf __a, __b, __c;
672 /* PowerISA VMX does not allow partial (for just element 0)
673    * results. So to ensure we don't generate spurious exceptions
674    * (from the upper elements) we splat the lower float
675    * before we do the operation. */
676 __a = vec_splat ((__v4sf) __A, 0);
677 __b = vec_splat ((__v4sf) __B, 0);
678 __c = (__v4sf) vec_cmpge(__a, __b);
679 /* Then we merge the lower float result with the original upper
680 * float elements from __A. */
681 return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
682 }
683
684 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
685 _mm_cmpneq_ss (__m128 __A, __m128 __B)
686 {
687 static const __vector unsigned int __mask =
688 { 0xffffffff, 0, 0, 0 };
689 __v4sf __a, __b, __c;
690 /* PowerISA VMX does not allow partial (for just element 0)
691    * results. So to ensure we don't generate spurious exceptions
692    * (from the upper elements) we splat the lower float
693    * before we do the operation. */
694 __a = vec_splat ((__v4sf) __A, 0);
695 __b = vec_splat ((__v4sf) __B, 0);
696 __c = (__v4sf) vec_cmpeq(__a, __b);
697 __c = vec_nor (__c, __c);
698 /* Then we merge the lower float result with the original upper
699 * float elements from __A. */
700 return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
701 }
702
703 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
704 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
705 {
706 static const __vector unsigned int __mask =
707 { 0xffffffff, 0, 0, 0 };
708 __v4sf __a, __b, __c;
709 /* PowerISA VMX does not allow partial (for just element 0)
710    * results. So to ensure we don't generate spurious exceptions
711    * (from the upper elements) we splat the lower float
712    * before we do the operation. */
713 __a = vec_splat ((__v4sf) __A, 0);
714 __b = vec_splat ((__v4sf) __B, 0);
715 __c = (__v4sf) vec_cmpge(__a, __b);
716 /* Then we merge the lower float result with the original upper
717 * float elements from __A. */
718 return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
719 }
720
721 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
722 _mm_cmpnle_ss (__m128 __A, __m128 __B)
723 {
724 static const __vector unsigned int __mask =
725 { 0xffffffff, 0, 0, 0 };
726 __v4sf __a, __b, __c;
727 /* PowerISA VMX does not allow partial (for just element 0)
728    * results. So to ensure we don't generate spurious exceptions
729    * (from the upper elements) we splat the lower float
730    * before we do the operation. */
731 __a = vec_splat ((__v4sf) __A, 0);
732 __b = vec_splat ((__v4sf) __B, 0);
733 __c = (__v4sf) vec_cmpgt(__a, __b);
734 /* Then we merge the lower float result with the original upper
735 * float elements from __A. */
736 return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
737 }
738
739 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
740 _mm_cmpngt_ss (__m128 __A, __m128 __B)
741 {
742 static const __vector unsigned int __mask =
743 { 0xffffffff, 0, 0, 0 };
744 __v4sf __a, __b, __c;
745 /* PowerISA VMX does not allow partial (for just element 0)
746    * results. So to ensure we don't generate spurious exceptions
747    * (from the upper elements) we splat the lower float
748    * before we do the operation. */
749 __a = vec_splat ((__v4sf) __A, 0);
750 __b = vec_splat ((__v4sf) __B, 0);
751 __c = (__v4sf) vec_cmple(__a, __b);
752 /* Then we merge the lower float result with the original upper
753 * float elements from __A. */
754 return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
755 }
756
757 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
758 _mm_cmpnge_ss (__m128 __A, __m128 __B)
759 {
760 static const __vector unsigned int __mask =
761 { 0xffffffff, 0, 0, 0 };
762 __v4sf __a, __b, __c;
763 /* PowerISA VMX does not allow partial (for just element 0)
764    * results. So to ensure we don't generate spurious exceptions
765 * (from the upper elements) we splat the lower float
766 * before we do the operation. */
767 __a = vec_splat ((__v4sf) __A, 0);
768 __b = vec_splat ((__v4sf) __B, 0);
769 __c = (__v4sf) vec_cmplt(__a, __b);
770 /* Then we merge the lower float result with the original upper
771 * float elements from __A. */
772 return ((__m128)vec_sel ((__v4sf)__A, __c, __mask));
773 }
774
775 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
776 _mm_cmpord_ss (__m128 __A, __m128 __B)
777 {
778 __vector unsigned int __a, __b;
779 __vector unsigned int __c, __d;
780 static const __vector unsigned int __float_exp_mask =
781 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
782 static const __vector unsigned int __mask =
783 { 0xffffffff, 0, 0, 0 };
784
785 __a = (__vector unsigned int) vec_abs ((__v4sf)__A);
786 __b = (__vector unsigned int) vec_abs ((__v4sf)__B);
787 __c = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __a);
788 __d = (__vector unsigned int) vec_cmpgt (__float_exp_mask, __b);
789 __c = vec_and (__c, __d);
790 /* Then we merge the lower float result with the original upper
791 * float elements from __A. */
792 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)__c, __mask));
793 }
794
795 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
796 _mm_cmpunord_ss (__m128 __A, __m128 __B)
797 {
798 __vector unsigned int __a, __b;
799 __vector unsigned int __c, __d;
800 static const __vector unsigned int __float_exp_mask =
801 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
802 static const __vector unsigned int __mask =
803 { 0xffffffff, 0, 0, 0 };
804
805 __a = (__vector unsigned int) vec_abs ((__v4sf)__A);
806 __b = (__vector unsigned int) vec_abs ((__v4sf)__B);
807 __c = (__vector unsigned int) vec_cmpgt (__a, __float_exp_mask);
808 __d = (__vector unsigned int) vec_cmpgt (__b, __float_exp_mask);
809 __c = vec_or (__c, __d);
810 /* Then we merge the lower float result with the original upper
811 * float elements from __A. */
812 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)__c, __mask));
813 }
814
815 /* Compare the lower SPFP values of A and B and return 1 if true
816 and 0 if false. */
817 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
818 _mm_comieq_ss (__m128 __A, __m128 __B)
819 {
820 return (__A[0] == __B[0]);
821 }
822
823 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
824 _mm_comilt_ss (__m128 __A, __m128 __B)
825 {
826 return (__A[0] < __B[0]);
827 }
828
829 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
830 _mm_comile_ss (__m128 __A, __m128 __B)
831 {
832 return (__A[0] <= __B[0]);
833 }
834
835 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
836 _mm_comigt_ss (__m128 __A, __m128 __B)
837 {
838 return (__A[0] > __B[0]);
839 }
840
841 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
842 _mm_comige_ss (__m128 __A, __m128 __B)
843 {
844 return (__A[0] >= __B[0]);
845 }
846
847 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
848 _mm_comineq_ss (__m128 __A, __m128 __B)
849 {
850 return (__A[0] != __B[0]);
851 }
852
853 /* FIXME
854  * The _mm_ucomi??_ss implementations below are exactly the same as
855  * the _mm_comi??_ss ones because GCC for PowerPC only generates
856  * unordered compares (scalar and vector).
857  * Technically _mm_comieq_ss et al. should be using the ordered
858  * compare and signal for QNaNs.
859  * The _mm_ucomieq_ss et al. should be OK, as is.
860 */
861 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
862 _mm_ucomieq_ss (__m128 __A, __m128 __B)
863 {
864 return (__A[0] == __B[0]);
865 }
866
867 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
868 _mm_ucomilt_ss (__m128 __A, __m128 __B)
869 {
870 return (__A[0] < __B[0]);
871 }
872
873 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
874 _mm_ucomile_ss (__m128 __A, __m128 __B)
875 {
876 return (__A[0] <= __B[0]);
877 }
878
879 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
880 _mm_ucomigt_ss (__m128 __A, __m128 __B)
881 {
882 return (__A[0] > __B[0]);
883 }
884
885 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
886 _mm_ucomige_ss (__m128 __A, __m128 __B)
887 {
888 return (__A[0] >= __B[0]);
889 }
890
891 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
892 _mm_ucomineq_ss (__m128 __A, __m128 __B)
893 {
894 return (__A[0] != __B[0]);
895 }
896
897 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
898 _mm_cvtss_f32 (__m128 __A)
899 {
900 return ((__v4sf)__A)[0];
901 }
902
903 /* Convert the lower SPFP value to a 32-bit integer according to the current
904 rounding mode. */
905 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
906 _mm_cvtss_si32 (__m128 __A)
907 {
908 int __res;
909 #ifdef _ARCH_PWR8
910 double __dtmp;
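  /* On little-endian, xxsldwi first rotates the low SPFP element into the
     leftmost word, where the scalar convert expects its input; xscvspdp
     then widens that float to double, fctiw rounds it to a 32-bit integer
     using the current rounding mode, and mfvsrd moves the result to a GPR.  */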
911 __asm__(
912 #ifdef __LITTLE_ENDIAN__
913 "xxsldwi %x0,%x0,%x0,3;\n"
914 #endif
915 "xscvspdp %x2,%x0;\n"
916 "fctiw %2,%2;\n"
917 "mfvsrd %1,%x2;\n"
918 : "+wa" (__A),
919 "=r" (__res),
920 "=f" (__dtmp)
921 : );
922 #else
923 __res = __builtin_rint(__A[0]);
924 #endif
925 return __res;
926 }
927
928 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
929 _mm_cvt_ss2si (__m128 __A)
930 {
931 return _mm_cvtss_si32 (__A);
932 }
933
934 /* Convert the lower SPFP value to a 64-bit integer according to the
935    current rounding mode. */
936
937 /* Intel intrinsic. */
938 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
939 _mm_cvtss_si64 (__m128 __A)
940 {
941 long long __res;
942 #if defined (_ARCH_PWR8) && defined (__powerpc64__)
943 double __dtmp;
944 __asm__(
945 #ifdef __LITTLE_ENDIAN__
946 "xxsldwi %x0,%x0,%x0,3;\n"
947 #endif
948 "xscvspdp %x2,%x0;\n"
949 "fctid %2,%2;\n"
950 "mfvsrd %1,%x2;\n"
951 : "+wa" (__A),
952 "=r" (__res),
953 "=f" (__dtmp)
954 : );
955 #else
956 __res = __builtin_llrint(__A[0]);
957 #endif
958 return __res;
959 }
960
961 /* Microsoft intrinsic. */
962 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
963 _mm_cvtss_si64x (__m128 __A)
964 {
965 return _mm_cvtss_si64 ((__v4sf) __A);
966 }
967
968 /* Constants for use with _mm_prefetch. */
969 enum _mm_hint
970 {
971 /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
972 _MM_HINT_ET0 = 7,
973 _MM_HINT_ET1 = 6,
974 _MM_HINT_T0 = 3,
975 _MM_HINT_T1 = 2,
976 _MM_HINT_T2 = 1,
977 _MM_HINT_NTA = 0
978 };
979
980 /* Loads one cache line from address P to a location "closer" to the
981 processor. The selector I specifies the type of prefetch operation. */
982 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
983 _mm_prefetch (const void *__P, enum _mm_hint __I)
984 {
985   /* Current PowerPC ignores the hint parameters. */
986 __builtin_prefetch (__P);
987 }
988
989 /* Convert the two lower SPFP values to 32-bit integers according to the
990 current rounding mode. Return the integers in packed form. */
991 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
992 _mm_cvtps_pi32 (__m128 __A)
993 {
995 __v4sf __temp, __rounded;
996 __vector unsigned long long __result;
997
998 /* Splat two lower SPFP values to both halves. */
999 __temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1000 __rounded = vec_rint (__temp);
1001 __result = (__vector unsigned long long) vec_cts (__rounded, 0);
1002
1003 return (__m64) ((__vector long long) __result)[0];
1004 }
1005
1006 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1007 _mm_cvt_ps2pi (__m128 __A)
1008 {
1009 return _mm_cvtps_pi32 (__A);
1010 }
1011
1012 /* Truncate the lower SPFP value to a 32-bit integer. */
1013 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1014 _mm_cvttss_si32 (__m128 __A)
1015 {
1016 /* Extract the lower float element. */
1017 float __temp = __A[0];
1018 /* truncate to 32-bit integer and return. */
1019 return __temp;
1020 }
1021
1022 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1023 _mm_cvtt_ss2si (__m128 __A)
1024 {
1025 return _mm_cvttss_si32 (__A);
1026 }
1027
1028 /* Intel intrinsic. */
1029 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1030 _mm_cvttss_si64 (__m128 __A)
1031 {
1032 /* Extract the lower float element. */
1033 float __temp = __A[0];
1034   /* truncate to 64-bit integer and return. */
1035 return __temp;
1036 }
1037
1038 /* Microsoft intrinsic. */
1039 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1040 _mm_cvttss_si64x (__m128 __A)
1041 {
1042 /* Extract the lower float element. */
1043 float __temp = __A[0];
1044   /* truncate to 64-bit integer and return. */
1045 return __temp;
1046 }
1047
1048 /* Truncate the two lower SPFP values to 32-bit integers. Return the
1049 integers in packed form. */
1050 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1051 _mm_cvttps_pi32 (__m128 __A)
1052 {
1053 __v4sf __temp;
1054 __vector unsigned long long __result;
1055
1056 /* Splat two lower SPFP values to both halves. */
1057 __temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1058 __result = (__vector unsigned long long) vec_cts (__temp, 0);
1059
1060 return (__m64) ((__vector long long) __result)[0];
1061 }
1062
1063 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1064 _mm_cvtt_ps2pi (__m128 __A)
1065 {
1066 return _mm_cvttps_pi32 (__A);
1067 }
1068
1069 /* Convert B to a SPFP value and insert it as element zero in A. */
1070 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1071 _mm_cvtsi32_ss (__m128 __A, int __B)
1072 {
1073 float __temp = __B;
1074 __A[0] = __temp;
1075
1076 return __A;
1077 }
1078
1079 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1080 _mm_cvt_si2ss (__m128 __A, int __B)
1081 {
1082 return _mm_cvtsi32_ss (__A, __B);
1083 }
1084
1085 /* Convert B to a SPFP value and insert it as element zero in A. */
1086 /* Intel intrinsic. */
1087 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1088 _mm_cvtsi64_ss (__m128 __A, long long __B)
1089 {
1090 float __temp = __B;
1091 __A[0] = __temp;
1092
1093 return __A;
1094 }
1095
1096 /* Microsoft intrinsic. */
1097 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1098 _mm_cvtsi64x_ss (__m128 __A, long long __B)
1099 {
1100 return _mm_cvtsi64_ss (__A, __B);
1101 }
1102
1103 /* Convert the two 32-bit values in B to SPFP form and insert them
1104 as the two lower elements in A. */
1105 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1106 _mm_cvtpi32_ps (__m128 __A, __m64 __B)
1107 {
1108 __vector signed int __vm1;
1109 __vector float __vf1;
1110
1111 __vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
1112 __vf1 = (__vector float) vec_ctf (__vm1, 0);
1113
1114 return ((__m128) (__vector unsigned long long)
1115 { ((__vector unsigned long long)__vf1) [0],
1116 ((__vector unsigned long long)__A) [1]});
1117 }
1118
1119 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1120 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
1121 {
1122 return _mm_cvtpi32_ps (__A, __B);
1123 }
1124
1125 /* Convert the four signed 16-bit values in A to SPFP form. */
1126 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1127 _mm_cvtpi16_ps (__m64 __A)
1128 {
1129 __vector signed short __vs8;
1130 __vector signed int __vi4;
1131 __vector float __vf1;
1132
1133 __vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
1134 __vi4 = vec_vupklsh (__vs8);
1135 __vf1 = (__vector float) vec_ctf (__vi4, 0);
1136
1137 return (__m128) __vf1;
1138 }
1139
1140 /* Convert the four unsigned 16-bit values in A to SPFP form. */
1141 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1142 _mm_cvtpu16_ps (__m64 __A)
1143 {
1144 const __vector unsigned short __zero =
1145 { 0, 0, 0, 0, 0, 0, 0, 0 };
1146 __vector unsigned short __vs8;
1147 __vector unsigned int __vi4;
1148 __vector float __vf1;
1149
1150 __vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
1151 __vi4 = (__vector unsigned int) vec_mergel
1152 #ifdef __LITTLE_ENDIAN__
1153 (__vs8, __zero);
1154 #else
1155 (__zero, __vs8);
1156 #endif
1157 __vf1 = (__vector float) vec_ctf (__vi4, 0);
1158
1159 return (__m128) __vf1;
1160 }
1161
1162 /* Convert the low four signed 8-bit values in A to SPFP form. */
1163 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1164 _mm_cvtpi8_ps (__m64 __A)
1165 {
1166 __vector signed char __vc16;
1167 __vector signed short __vs8;
1168 __vector signed int __vi4;
1169 __vector float __vf1;
1170
1171 __vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
1172 __vs8 = vec_vupkhsb (__vc16);
1173 __vi4 = vec_vupkhsh (__vs8);
1174 __vf1 = (__vector float) vec_ctf (__vi4, 0);
1175
1176 return (__m128) __vf1;
1177 }
1178
1179 /* Convert the low four unsigned 8-bit values in A to SPFP form. */
1180 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1181
1182 _mm_cvtpu8_ps (__m64 __A)
1183 {
1184 const __vector unsigned char __zero =
1185 { 0, 0, 0, 0, 0, 0, 0, 0 };
1186 __vector unsigned char __vc16;
1187 __vector unsigned short __vs8;
1188 __vector unsigned int __vi4;
1189 __vector float __vf1;
1190
1191 __vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
1192 #ifdef __LITTLE_ENDIAN__
1193 __vs8 = (__vector unsigned short) vec_mergel (__vc16, __zero);
1194 __vi4 = (__vector unsigned int) vec_mergeh (__vs8,
1195 (__vector unsigned short) __zero);
1196 #else
1197 __vs8 = (__vector unsigned short) vec_mergel (__zero, __vc16);
1198 __vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) __zero,
1199 __vs8);
1200 #endif
1201 __vf1 = (__vector float) vec_ctf (__vi4, 0);
1202
1203 return (__m128) __vf1;
1204 }
1205
1206 /* Convert the four signed 32-bit values in A and B to SPFP form. */
1207 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1208 _mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
1209 {
1210 __vector signed int __vi4;
1211 __vector float __vf4;
1212
1213 __vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
1214 __vf4 = (__vector float) vec_ctf (__vi4, 0);
1215 return (__m128) __vf4;
1216 }
1217
1218 /* Convert the four SPFP values in A to four signed 16-bit integers. */
1219 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1220 _mm_cvtps_pi16 (__m128 __A)
1221 {
1222 __v4sf __rounded;
1223 __vector signed int __temp;
1224 __vector unsigned long long __result;
1225
1226 __rounded = vec_rint(__A);
1227 __temp = vec_cts (__rounded, 0);
1228 __result = (__vector unsigned long long) vec_pack (__temp, __temp);
1229
1230 return (__m64) ((__vector long long) __result)[0];
1231 }
1232
1233 /* Convert the four SPFP values in A to four signed 8-bit integers. */
1234 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1235 _mm_cvtps_pi8 (__m128 __A)
1236 {
1237 __v4sf __rounded;
1238 __vector signed int __tmp_i;
1239 static const __vector signed int __zero = {0, 0, 0, 0};
1240 __vector signed short __tmp_s;
1241 __vector signed char __res_v;
1242
1243 __rounded = vec_rint(__A);
1244 __tmp_i = vec_cts (__rounded, 0);
1245 __tmp_s = vec_pack (__tmp_i, __zero);
1246 __res_v = vec_pack (__tmp_s, __tmp_s);
1247 return (__m64) ((__vector long long) __res_v)[0];
1248 }
1249
1250 /* Selects four specific SPFP values from A and B based on MASK. */
1251 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1252
1253 _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
1254 {
1255 unsigned long __element_selector_10 = __mask & 0x03;
1256 unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
1257 unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
1258 unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
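  /* Each selector below is four byte indices for vec_perm that copy one
     aligned word of the first source; adding 0x10101010 (16 to each byte)
     retargets a selector at the second source operand.  */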
1259 static const unsigned int __permute_selectors[4] =
1260 {
1261 #ifdef __LITTLE_ENDIAN__
1262 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1263 #else
1264 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
1265 #endif
1266 };
1267 __vector unsigned int __t;
1268
1269 __t[0] = __permute_selectors[__element_selector_10];
1270 __t[1] = __permute_selectors[__element_selector_32];
1271 __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
1272 __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
1273 return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)__t);
1274 }
1275
1276 /* Selects and interleaves the upper two SPFP values from A and B. */
1277 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1278 _mm_unpackhi_ps (__m128 __A, __m128 __B)
1279 {
1280 return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1281 }
1282
1283 /* Selects and interleaves the lower two SPFP values from A and B. */
1284 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1285 _mm_unpacklo_ps (__m128 __A, __m128 __B)
1286 {
1287 return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1288 }
1289
1290 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1291 the lower two values are passed through from A. */
1292 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1293 _mm_loadh_pi (__m128 __A, __m64 const *__P)
1294 {
1295 __vector unsigned long long __a = (__vector unsigned long long)__A;
1296 __vector unsigned long long __p = vec_splats(*__P);
1297 __a [1] = __p [1];
1298
1299 return (__m128)__a;
1300 }
1301
1302 /* Stores the upper two SPFP values of A into P. */
1303 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1304 _mm_storeh_pi (__m64 *__P, __m128 __A)
1305 {
1306 __vector unsigned long long __a = (__vector unsigned long long) __A;
1307
1308 *__P = __a[1];
1309 }
1310
1311 /* Moves the upper two values of B into the lower two values of A. */
1312 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1313 _mm_movehl_ps (__m128 __A, __m128 __B)
1314 {
1315 return (__m128) vec_mergel ((__vector unsigned long long)__B,
1316 (__vector unsigned long long)__A);
1317 }
1318
1319 /* Moves the lower two values of B into the upper two values of A. */
1320 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1321 _mm_movelh_ps (__m128 __A, __m128 __B)
1322 {
1323 return (__m128) vec_mergeh ((__vector unsigned long long)__A,
1324 (__vector unsigned long long)__B);
1325 }
1326
1327 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1328 the upper two values are passed through from A. */
1329 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1330 _mm_loadl_pi (__m128 __A, __m64 const *__P)
1331 {
1332 __vector unsigned long long __a = (__vector unsigned long long)__A;
1333 __vector unsigned long long __p = vec_splats(*__P);
1334 __a [0] = __p [0];
1335
1336 return (__m128)__a;
1337 }
1338
1339 /* Stores the lower two SPFP values of A into P. */
1340 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341 _mm_storel_pi (__m64 *__P, __m128 __A)
1342 {
1343 __vector unsigned long long __a = (__vector unsigned long long) __A;
1344
1345 *__P = __a[0];
1346 }
1347
1348 #ifdef _ARCH_PWR8
1349 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1350
1351 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1352 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353 _mm_movemask_ps (__m128 __A)
1354 {
1355 __vector unsigned long long __result;
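  /* Each byte of __perm_mask is a bit index for vec_vbpermq: 0x00, 0x20,
     0x40 and 0x60 pick the sign bits of the four floats (bits 0, 32, 64
     and 96 in big-endian bit numbering), while 0x80 selects a zero bit.  */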
1356 static const __vector unsigned int __perm_mask =
1357 {
1358 #ifdef __LITTLE_ENDIAN__
1359 0x00204060, 0x80808080, 0x80808080, 0x80808080
1360 #else
1361 0x80808080, 0x80808080, 0x80808080, 0x00204060
1362 #endif
1363 };
1364
1365 __result = ((__vector unsigned long long)
1366 vec_vbpermq ((__vector unsigned char) __A,
1367 (__vector unsigned char) __perm_mask));
1368
1369 #ifdef __LITTLE_ENDIAN__
1370 return __result[1];
1371 #else
1372 return __result[0];
1373 #endif
1374 }
1375 #endif /* _ARCH_PWR8 */
1376
1377 /* Create a vector with all four elements equal to *P. */
1378 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1379 _mm_load1_ps (float const *__P)
1380 {
1381 return _mm_set1_ps (*__P);
1382 }
1383
1384 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1385 _mm_load_ps1 (float const *__P)
1386 {
1387 return _mm_load1_ps (__P);
1388 }
1389
1390 /* Extracts one of the four words of A. The selector N must be immediate. */
1391 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1392 _mm_extract_pi16 (__m64 const __A, int const __N)
1393 {
1394 unsigned int __shiftr = __N & 3;
1395 #ifdef __BIG_ENDIAN__
1396 __shiftr = 3 - __shiftr;
1397 #endif
1398
1399 return ((__A >> (__shiftr * 16)) & 0xffff);
1400 }
1401
1402 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1403 _m_pextrw (__m64 const __A, int const __N)
1404 {
1405 return _mm_extract_pi16 (__A, __N);
1406 }
1407
1408 /* Inserts word D into one of four words of A. The selector N must be
1409 immediate. */
1410 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1411 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1412 {
1413 const int __shiftl = (__N & 3) * 16;
1414 const __m64 __shiftD = (const __m64) __D << __shiftl;
1415 const __m64 __mask = 0xffffUL << __shiftl;
1416 __m64 __result = (__A & (~__mask)) | (__shiftD & __mask);
1417
1418 return __result;
1419 }
1420
1421 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1422 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1423 {
1424 return _mm_insert_pi16 (__A, __D, __N);
1425 }
1426
1427 /* Compute the element-wise maximum of signed 16-bit values. */
1428 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1429
_mm_max_pi16(__m64 __A,__m64 __B)1430 _mm_max_pi16 (__m64 __A, __m64 __B)
1431 {
1432 #if _ARCH_PWR8
1433 __vector signed short __a, __b, __r;
1434 __vector __bool short __c;
1435
1436 __a = (__vector signed short)vec_splats (__A);
1437 __b = (__vector signed short)vec_splats (__B);
1438 __c = (__vector __bool short)vec_cmpgt (__a, __b);
1439 __r = vec_sel (__b, __a, __c);
1440 return (__m64) ((__vector long long) __r)[0];
1441 #else
1442 __m64_union __m1, __m2, __res;
1443
1444 __m1.as_m64 = __A;
1445 __m2.as_m64 = __B;
1446
1447 __res.as_short[0] =
1448 (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0] : __m2.as_short[0];
1449 __res.as_short[1] =
1450 (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1] : __m2.as_short[1];
1451 __res.as_short[2] =
1452 (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2] : __m2.as_short[2];
1453 __res.as_short[3] =
1454 (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3] : __m2.as_short[3];
1455
1456 return (__m64) __res.as_m64;
1457 #endif
1458 }
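
/* Usage sketch (illustrative only; the constants below are assumptions,
   not part of this header).  Each 16-bit lane of the result holds the
   larger of the corresponding signed lanes of A and B:

     __m64 __a = _mm_set_pi16 (1, -2, 3, -4);
     __m64 __b = _mm_set_pi16 (-1, 2, -3, 4);
     __m64 __r = _mm_max_pi16 (__a, __b);
     // each lane keeps the positive value: __r == _mm_set_pi16 (1, 2, 3, 4)
*/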

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __r;
  __vector __bool char __c;

  __a = (__vector unsigned char)vec_splats (__A);
  __b = (__vector unsigned char)vec_splats (__B);
  __c = (__vector __bool char)vec_cmpgt (__a, __b);
  __r = vec_sel (__b, __a, __c);
  return (__m64) ((__vector long long) __r)[0];
#else
  __m64_union __m1, __m2, __res;
  long __i;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;

  for (__i = 0; __i < 8; __i++)
    __res.as_char[__i] =
      ((unsigned char) __m1.as_char[__i] > (unsigned char) __m2.as_char[__i]) ?
	__m1.as_char[__i] : __m2.as_char[__i];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __r;
  __vector __bool short __c;

  __a = (__vector signed short)vec_splats (__A);
  __b = (__vector signed short)vec_splats (__B);
  __c = (__vector __bool short)vec_cmplt (__a, __b);
  __r = vec_sel (__b, __a, __c);
  return (__m64) ((__vector long long) __r)[0];
#else
  __m64_union __m1, __m2, __res;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;

  __res.as_short[0] =
    (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0] : __m2.as_short[0];
  __res.as_short[1] =
    (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1] : __m2.as_short[1];
  __res.as_short[2] =
    (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2] : __m2.as_short[2];
  __res.as_short[3] =
    (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3] : __m2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __r;
  __vector __bool char __c;

  __a = (__vector unsigned char)vec_splats (__A);
  __b = (__vector unsigned char)vec_splats (__B);
  __c = (__vector __bool char)vec_cmplt (__a, __b);
  __r = vec_sel (__b, __a, __c);
  return (__m64) ((__vector long long) __r)[0];
#else
  __m64_union __m1, __m2, __res;
  long __i;

  __m1.as_m64 = __A;
  __m2.as_m64 = __B;

  for (__i = 0; __i < 8; __i++)
    __res.as_char[__i] =
      ((unsigned char) __m1.as_char[__i] < (unsigned char) __m2.as_char[__i]) ?
	__m1.as_char[__i] : __m2.as_char[__i];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
#ifdef __powerpc64__
  unsigned long long __p =
#ifdef __LITTLE_ENDIAN__
    0x0008101820283038UL; // permute control for sign bits
#else
    0x3830282018100800UL; // permute control for sign bits
#endif
  return __builtin_bpermd (__p, __A);
#else
#ifdef __LITTLE_ENDIAN__
  unsigned int __mask = 0x20283038UL;
  unsigned int __r1 = __builtin_bpermd (__mask, __A) & 0xf;
  unsigned int __r2 = __builtin_bpermd (__mask, __A >> 32) & 0xf;
#else
  unsigned int __mask = 0x38302820UL;
  unsigned int __r1 = __builtin_bpermd (__mask, __A >> 32) & 0xf;
  unsigned int __r2 = __builtin_bpermd (__mask, __A) & 0xf;
#endif
  return (__r2 << 4) | __r1;
#endif
}
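
/* Usage sketch (illustrative only; the constants below are assumptions,
   not part of this header).  Bit i of the result is the sign bit of byte
   element i:

     __m64 __a = _mm_set_pi8 (-1, 1, -2, 2, -3, 3, -4, 4);
     int __m = _mm_movemask_pi8 (__a);
     // the negative bytes are elements 7, 5, 3 and 1, so __m == 0xAA
*/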

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short __a, __b;
  __vector unsigned short __c;
  __vector unsigned int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
#endif
    };

  __a = (__vector unsigned short)vec_splats (__A);
  __b = (__vector unsigned short)vec_splats (__B);

  __w0 = vec_vmuleuh (__a, __b);
  __w1 = vec_vmulouh (__a, __b);
  __c = (__vector unsigned short)vec_perm (__w0, __w1, __xform1);

  return (__m64) ((__vector long long) __c)[0];
}
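
/* Usage sketch (illustrative only; the constants below are assumptions,
   not part of this header).  Each result lane is the upper 16 bits of the
   full 32-bit unsigned product of the corresponding lanes:

     __m64 __a = _mm_set_pi16 (0x8000, 0x4000, 0x0002, 0x0001);
     __m64 __b = _mm_set_pi16 (0x0004, 0x0008, 0xFFFF, 0xFFFF);
     __m64 __r = _mm_mulhi_pu16 (__a, __b);
     // __r == _mm_set_pi16 (0x0002, 0x0002, 0x0001, 0x0000)
*/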

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  unsigned long __element_selector_10 = __N & 0x03;
  unsigned long __element_selector_32 = (__N >> 2) & 0x03;
  unsigned long __element_selector_54 = (__N >> 4) & 0x03;
  unsigned long __element_selector_76 = (__N >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] =
    {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0607, 0x0405, 0x0203, 0x0001
#endif
    };
  __m64_union __t;
  __vector unsigned long long __a, __p, __r;

#ifdef __LITTLE_ENDIAN__
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
#else
  __t.as_short[3] = __permute_selectors[__element_selector_10];
  __t.as_short[2] = __permute_selectors[__element_selector_32];
  __t.as_short[1] = __permute_selectors[__element_selector_54];
  __t.as_short[0] = __permute_selectors[__element_selector_76];
#endif
  __p = vec_splats (__t.as_m64);
  __a = vec_splats (__A);
  __r = vec_perm (__a, __a, (__vector unsigned char)__p);
  return (__m64) ((__vector long long) __r)[0];
}
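
/* Usage sketch (illustrative only; the constants below are assumptions,
   not part of this header).  The selector is usually built with
   _MM_SHUFFLE, which names the source element for each result word from
   high to low:

     __m64 __a = _mm_set_pi16 (4, 3, 2, 1);   // elements 3..0
     __m64 __r = _mm_shuffle_pi16 (__a, _MM_SHUFFLE (0, 1, 2, 3));
     // __r == _mm_set_pi16 (1, 2, 3, 4): the element order is reversed
*/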

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __m64 __hibit = 0x8080808080808080UL;
  __m64 __mask, __tmp;
  __m64 *__p = (__m64*)__P;

  __tmp = *__p;
  __mask = _mm_cmpeq_pi8 ((__N & __hibit), __hibit);
  __tmp = (__tmp & (~__mask)) | (__A & __mask);
  *__p = __tmp;
}
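
/* Usage sketch (illustrative only; the buffer and constants below are
   assumptions, not part of this header).  Only the bytes whose selector
   byte has its high bit set are written; the rest of the destination is
   left untouched:

     char __buf[8] __attribute__ ((aligned (8))) = { 0 };
     __m64 __data = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
     __m64 __sel  = _mm_set_pi8 (0, -1, 0, -1, 0, -1, 0, -1);
     _mm_maskmove_si64 (__data, __sel, __buf);
     // elements 0, 2, 4 and 6 of __data (values 1, 3, 5 and 7) are stored;
     // the other destination bytes keep their previous value (0 here)
*/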

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}

/* Compute the rounded averages of the unsigned 8-bit values in A and B. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__A);
  __b = (__vector unsigned char)vec_splats (__B);
  __c = vec_avg (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__A);
  __b = (__vector unsigned short)vec_splats (__B);
  __c = vec_avg (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  __vector unsigned char __a, __b;
  __vector unsigned char __vmin, __vmax, __vabsdiff;
  __vector signed int __vsum;
  const __vector unsigned int __zero =
    { 0, 0, 0, 0 };
  __m64_union __result = {0};

  __a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
  __b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
  __vmin = vec_min (__a, __b);
  __vmax = vec_max (__a, __b);
  __vabsdiff = vec_sub (__vmax, __vmin);
  /* Sum four groups of bytes into integers. */
  __vsum = (__vector signed int) vec_sum4s (__vabsdiff, __zero);
  /* Sum across four integers with integer result. */
  __vsum = vec_sums (__vsum, (__vector signed int) __zero);
  /* The sum is in the right most 32-bits of the vector result.
     Transfer to a GPR and truncate to 16 bits. */
  __result.as_short[0] = __vsum[3];
  return __result.as_m64;
}
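
/* Usage sketch (illustrative only; the constants below are assumptions,
   not part of this header).  The low 16-bit word of the result is the sum
   of the eight absolute byte differences; the other words are zero:

     __m64 __a = _mm_set_pi8 (10, 20, 30, 40, 50, 60, 70, 80);
     __m64 __b = _mm_set_pi8 (80, 70, 60, 50, 40, 30, 20, 10);
     __m64 __r = _mm_sad_pu8 (__a, __b);
     // the sum of absolute differences is 320, returned in the low word
*/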

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}

/* Stores the data in A to the address P without polluting the caches. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  /* Use the data cache block touch for store transient. */
  __asm__ (
    "    dcbtstt 0,%0"
    :
    : "b" (__P)
    : "memory"
    );
  *__P = __A;
}

/* Likewise.  The address must be 16-byte aligned. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  /* Use the data cache block touch for store transient. */
  __asm__ (
    "    dcbtstt 0,%0"
    :
    : "b" (__P)
    : "memory"
    );
  _mm_store_ps (__P, __A);
}
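
/* Usage sketch (illustrative only; the buffer below is an assumption, not
   part of this header).  On this port the "non-temporal" hint is just a
   dcbtstt cache touch followed by a normal aligned store:

     float __out[4] __attribute__ ((aligned (16)));
     __m128 __v = _mm_set1_ps (1.0f);
     _mm_stream_ps (__out, __v);   // __out now holds four copies of 1.0f
*/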

/* Guarantees that every preceding store is globally visible before
   any subsequent store. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  /* Generate a light weight sync. */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}
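
/* Usage sketch (illustrative only; the variables below are assumptions,
   not part of this header).  A typical producer pattern: publish data,
   fence, then set a flag, so a consumer that observes the flag (with a
   matching load barrier on its side) also observes the data:

     extern int __shared_data, __shared_flag;
     __shared_data = 42;
     _mm_sfence ();       // release barrier: the data store is ordered first
     __shared_flag = 1;
*/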

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  In the x86 header this appears after the
   pop_options pragma because it does not require SSE support in the
   processor--the encoding is a nop on processors that do not support it. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  /* There is no exact match with this construct, but the following is
     close to the desired effect. */
#if _ARCH_PWR8
  /* On power8 and later processors we can depend on Program Priority
     (PRI) and the associated "very low" PRI setting.  Since we don't know
     what PRI this thread is running at we: 1) save the current PRI
     from the PPR SPR into a local GPR, 2) set the PRI to "very low"
     via the special or 31,31,31 encoding, 3) issue an "isync" to
     ensure the PRI change takes effect before we execute any more
     instructions.
     Now we can execute a lwsync (release barrier) while we execute
     this thread at "very low" PRI.  Finally we restore the original
     PRI and continue execution. */
  unsigned long __PPR;

  __asm__ volatile (
    "    mfppr %0;"
    "    or 31,31,31;"
    "    isync;"
    "    lwsync;"
    "    isync;"
    "    mtppr %0;"
    : "=r" (__PPR)
    :
    : "memory"
    );
#else
  /* For older processors where we may not even have Program Priority
     controls we can only depend on Heavy Weight Sync. */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
#endif
}

/* Transpose the 4x4 matrix composed of row[0-3]. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                       \
do {                                                                    \
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);    \
  __v4sf __t0 = vec_vmrghw (__r0, __r1);                                \
  __v4sf __t1 = vec_vmrghw (__r2, __r3);                                \
  __v4sf __t2 = vec_vmrglw (__r0, __r1);                                \
  __v4sf __t3 = vec_vmrglw (__r2, __r3);                                \
  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0,                \
			       (__vector long long)__t1);               \
  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0,                \
			       (__vector long long)__t1);               \
  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2,                \
			       (__vector long long)__t3);               \
  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2,                \
			       (__vector long long)__t3);               \
} while (0)
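
/* Usage sketch (illustrative only; the row values below are assumptions,
   not part of this header).  After the macro runs, element j of row i
   holds what was element i of row j:

     __m128 __r0 = _mm_set_ps ( 3.0f,  2.0f,  1.0f,  0.0f);
     __m128 __r1 = _mm_set_ps ( 7.0f,  6.0f,  5.0f,  4.0f);
     __m128 __r2 = _mm_set_ps (11.0f, 10.0f,  9.0f,  8.0f);
     __m128 __r3 = _mm_set_ps (15.0f, 14.0f, 13.0f, 12.0f);
     _MM_TRANSPOSE4_PS (__r0, __r1, __r2, __r3);
     // __r0 now holds { 0, 4, 8, 12 }, __r1 { 1, 5, 9, 13 }, and so on
*/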

/* For backward source compatibility. */
//# include <emmintrin.h>

#endif /* _XMMINTRIN_H_INCLUDED */