xref: /dflybsd-src/contrib/gcc-8.0/gcc/config/i386/xopintrin.h (revision 38fd149817dfbff97799f62fcb70be98c4e32523)
1*38fd1498Szrj /* Copyright (C) 2007-2018 Free Software Foundation, Inc.
2*38fd1498Szrj 
3*38fd1498Szrj    This file is part of GCC.
4*38fd1498Szrj 
5*38fd1498Szrj    GCC is free software; you can redistribute it and/or modify
6*38fd1498Szrj    it under the terms of the GNU General Public License as published by
7*38fd1498Szrj    the Free Software Foundation; either version 3, or (at your option)
8*38fd1498Szrj    any later version.
9*38fd1498Szrj 
10*38fd1498Szrj    GCC is distributed in the hope that it will be useful,
11*38fd1498Szrj    but WITHOUT ANY WARRANTY; without even the implied warranty of
12*38fd1498Szrj    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13*38fd1498Szrj    GNU General Public License for more details.
14*38fd1498Szrj 
15*38fd1498Szrj    Under Section 7 of GPL version 3, you are granted additional
16*38fd1498Szrj    permissions described in the GCC Runtime Library Exception, version
17*38fd1498Szrj    3.1, as published by the Free Software Foundation.
18*38fd1498Szrj 
19*38fd1498Szrj    You should have received a copy of the GNU General Public License and
20*38fd1498Szrj    a copy of the GCC Runtime Library Exception along with this program;
21*38fd1498Szrj    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22*38fd1498Szrj    <http://www.gnu.org/licenses/>.  */
23*38fd1498Szrj 
24*38fd1498Szrj #ifndef _X86INTRIN_H_INCLUDED
25*38fd1498Szrj # error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
26*38fd1498Szrj #endif
27*38fd1498Szrj 
28*38fd1498Szrj #ifndef _XOPMMINTRIN_H_INCLUDED
29*38fd1498Szrj #define _XOPMMINTRIN_H_INCLUDED
30*38fd1498Szrj 
31*38fd1498Szrj #include <fma4intrin.h>
32*38fd1498Szrj 
33*38fd1498Szrj #ifndef __XOP__
34*38fd1498Szrj #pragma GCC push_options
35*38fd1498Szrj #pragma GCC target("xop")
36*38fd1498Szrj #define __DISABLE_XOP__
37*38fd1498Szrj #endif /* __XOP__ */
38*38fd1498Szrj 
39*38fd1498Szrj /* Integer multiply/add instructions. */
40*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maccs_epi16(__m128i __A,__m128i __B,__m128i __C)41*38fd1498Szrj _mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
42*38fd1498Szrj {
43*38fd1498Szrj   return (__m128i) __builtin_ia32_vpmacssww ((__v8hi)__A,(__v8hi)__B, (__v8hi)__C);
44*38fd1498Szrj }
45*38fd1498Szrj 
46*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_epi16(__m128i __A,__m128i __B,__m128i __C)47*38fd1498Szrj _mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
48*38fd1498Szrj {
49*38fd1498Szrj   return (__m128i) __builtin_ia32_vpmacsww ((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
50*38fd1498Szrj }
51*38fd1498Szrj 
52*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maccsd_epi16(__m128i __A,__m128i __B,__m128i __C)53*38fd1498Szrj _mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
54*38fd1498Szrj {
55*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpmacsswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
56*38fd1498Szrj }
57*38fd1498Szrj 
58*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maccd_epi16(__m128i __A,__m128i __B,__m128i __C)59*38fd1498Szrj _mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
60*38fd1498Szrj {
61*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpmacswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
62*38fd1498Szrj }
63*38fd1498Szrj 
64*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maccs_epi32(__m128i __A,__m128i __B,__m128i __C)65*38fd1498Szrj _mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
66*38fd1498Szrj {
67*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpmacssdd ((__v4si)__A, (__v4si)__B, (__v4si)__C);
68*38fd1498Szrj }
69*38fd1498Szrj 
70*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_epi32(__m128i __A,__m128i __B,__m128i __C)71*38fd1498Szrj _mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
72*38fd1498Szrj {
73*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpmacsdd ((__v4si)__A, (__v4si)__B, (__v4si)__C);
74*38fd1498Szrj }
75*38fd1498Szrj 
76*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maccslo_epi32(__m128i __A,__m128i __B,__m128i __C)77*38fd1498Szrj _mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
78*38fd1498Szrj {
79*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpmacssdql ((__v4si)__A, (__v4si)__B, (__v2di)__C);
80*38fd1498Szrj }
81*38fd1498Szrj 
82*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macclo_epi32(__m128i __A,__m128i __B,__m128i __C)83*38fd1498Szrj _mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
84*38fd1498Szrj {
85*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpmacsdql ((__v4si)__A, (__v4si)__B, (__v2di)__C);
86*38fd1498Szrj }
87*38fd1498Szrj 
88*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maccshi_epi32(__m128i __A,__m128i __B,__m128i __C)89*38fd1498Szrj _mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
90*38fd1498Szrj {
91*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpmacssdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C);
92*38fd1498Szrj }
93*38fd1498Szrj 
94*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macchi_epi32(__m128i __A,__m128i __B,__m128i __C)95*38fd1498Szrj _mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
96*38fd1498Szrj {
97*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpmacsdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C);
98*38fd1498Szrj }
99*38fd1498Szrj 
100*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddsd_epi16(__m128i __A,__m128i __B,__m128i __C)101*38fd1498Szrj _mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
102*38fd1498Szrj {
103*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpmadcsswd ((__v8hi)__A,(__v8hi)__B,(__v4si)__C);
104*38fd1498Szrj }
105*38fd1498Szrj 
106*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddd_epi16(__m128i __A,__m128i __B,__m128i __C)107*38fd1498Szrj _mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
108*38fd1498Szrj {
109*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpmadcswd ((__v8hi)__A,(__v8hi)__B,(__v4si)__C);
110*38fd1498Szrj }
111*38fd1498Szrj 
112*38fd1498Szrj /* Packed Integer Horizontal Add and Subtract */
113*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddw_epi8(__m128i __A)114*38fd1498Szrj _mm_haddw_epi8(__m128i __A)
115*38fd1498Szrj {
116*38fd1498Szrj   return  (__m128i) __builtin_ia32_vphaddbw ((__v16qi)__A);
117*38fd1498Szrj }
118*38fd1498Szrj 
119*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddd_epi8(__m128i __A)120*38fd1498Szrj _mm_haddd_epi8(__m128i __A)
121*38fd1498Szrj {
122*38fd1498Szrj   return  (__m128i) __builtin_ia32_vphaddbd ((__v16qi)__A);
123*38fd1498Szrj }
124*38fd1498Szrj 
125*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddq_epi8(__m128i __A)126*38fd1498Szrj _mm_haddq_epi8(__m128i __A)
127*38fd1498Szrj {
128*38fd1498Szrj   return  (__m128i) __builtin_ia32_vphaddbq ((__v16qi)__A);
129*38fd1498Szrj }
130*38fd1498Szrj 
131*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddd_epi16(__m128i __A)132*38fd1498Szrj _mm_haddd_epi16(__m128i __A)
133*38fd1498Szrj {
134*38fd1498Szrj   return  (__m128i) __builtin_ia32_vphaddwd ((__v8hi)__A);
135*38fd1498Szrj }
136*38fd1498Szrj 
137*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddq_epi16(__m128i __A)138*38fd1498Szrj _mm_haddq_epi16(__m128i __A)
139*38fd1498Szrj {
140*38fd1498Szrj   return  (__m128i) __builtin_ia32_vphaddwq ((__v8hi)__A);
141*38fd1498Szrj }
142*38fd1498Szrj 
143*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddq_epi32(__m128i __A)144*38fd1498Szrj _mm_haddq_epi32(__m128i __A)
145*38fd1498Szrj {
146*38fd1498Szrj   return  (__m128i) __builtin_ia32_vphadddq ((__v4si)__A);
147*38fd1498Szrj }
148*38fd1498Szrj 
149*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddw_epu8(__m128i __A)150*38fd1498Szrj _mm_haddw_epu8(__m128i __A)
151*38fd1498Szrj {
152*38fd1498Szrj   return  (__m128i) __builtin_ia32_vphaddubw ((__v16qi)__A);
153*38fd1498Szrj }
154*38fd1498Szrj 
155*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddd_epu8(__m128i __A)156*38fd1498Szrj _mm_haddd_epu8(__m128i __A)
157*38fd1498Szrj {
158*38fd1498Szrj   return  (__m128i) __builtin_ia32_vphaddubd ((__v16qi)__A);
159*38fd1498Szrj }
160*38fd1498Szrj 
161*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddq_epu8(__m128i __A)162*38fd1498Szrj _mm_haddq_epu8(__m128i __A)
163*38fd1498Szrj {
164*38fd1498Szrj   return  (__m128i) __builtin_ia32_vphaddubq ((__v16qi)__A);
165*38fd1498Szrj }
166*38fd1498Szrj 
167*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddd_epu16(__m128i __A)168*38fd1498Szrj _mm_haddd_epu16(__m128i __A)
169*38fd1498Szrj {
170*38fd1498Szrj   return  (__m128i) __builtin_ia32_vphadduwd ((__v8hi)__A);
171*38fd1498Szrj }
172*38fd1498Szrj 
173*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddq_epu16(__m128i __A)174*38fd1498Szrj _mm_haddq_epu16(__m128i __A)
175*38fd1498Szrj {
176*38fd1498Szrj   return  (__m128i) __builtin_ia32_vphadduwq ((__v8hi)__A);
177*38fd1498Szrj }
178*38fd1498Szrj 
179*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_haddq_epu32(__m128i __A)180*38fd1498Szrj _mm_haddq_epu32(__m128i __A)
181*38fd1498Szrj {
182*38fd1498Szrj   return  (__m128i) __builtin_ia32_vphaddudq ((__v4si)__A);
183*38fd1498Szrj }
184*38fd1498Szrj 
185*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubw_epi8(__m128i __A)186*38fd1498Szrj _mm_hsubw_epi8(__m128i __A)
187*38fd1498Szrj {
188*38fd1498Szrj   return  (__m128i) __builtin_ia32_vphsubbw ((__v16qi)__A);
189*38fd1498Szrj }
190*38fd1498Szrj 
191*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubd_epi16(__m128i __A)192*38fd1498Szrj _mm_hsubd_epi16(__m128i __A)
193*38fd1498Szrj {
194*38fd1498Szrj   return  (__m128i) __builtin_ia32_vphsubwd ((__v8hi)__A);
195*38fd1498Szrj }
196*38fd1498Szrj 
197*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubq_epi32(__m128i __A)198*38fd1498Szrj _mm_hsubq_epi32(__m128i __A)
199*38fd1498Szrj {
200*38fd1498Szrj   return  (__m128i) __builtin_ia32_vphsubdq ((__v4si)__A);
201*38fd1498Szrj }
202*38fd1498Szrj 
203*38fd1498Szrj /* Vector conditional move and permute */
204*38fd1498Szrj 
205*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmov_si128(__m128i __A,__m128i __B,__m128i __C)206*38fd1498Szrj _mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
207*38fd1498Szrj {
208*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpcmov (__A, __B, __C);
209*38fd1498Szrj }
210*38fd1498Szrj 
211*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_perm_epi8(__m128i __A,__m128i __B,__m128i __C)212*38fd1498Szrj _mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
213*38fd1498Szrj {
214*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpperm ((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
215*38fd1498Szrj }
216*38fd1498Szrj 
217*38fd1498Szrj /* Packed Integer Rotates and Shifts
218*38fd1498Szrj    Rotates - Non-Immediate form */
219*38fd1498Szrj 
220*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rot_epi8(__m128i __A,__m128i __B)221*38fd1498Szrj _mm_rot_epi8(__m128i __A,  __m128i __B)
222*38fd1498Szrj {
223*38fd1498Szrj   return  (__m128i) __builtin_ia32_vprotb ((__v16qi)__A, (__v16qi)__B);
224*38fd1498Szrj }
225*38fd1498Szrj 
226*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rot_epi16(__m128i __A,__m128i __B)227*38fd1498Szrj _mm_rot_epi16(__m128i __A,  __m128i __B)
228*38fd1498Szrj {
229*38fd1498Szrj   return  (__m128i) __builtin_ia32_vprotw ((__v8hi)__A, (__v8hi)__B);
230*38fd1498Szrj }
231*38fd1498Szrj 
232*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rot_epi32(__m128i __A,__m128i __B)233*38fd1498Szrj _mm_rot_epi32(__m128i __A,  __m128i __B)
234*38fd1498Szrj {
235*38fd1498Szrj   return  (__m128i) __builtin_ia32_vprotd ((__v4si)__A, (__v4si)__B);
236*38fd1498Szrj }
237*38fd1498Szrj 
238*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rot_epi64(__m128i __A,__m128i __B)239*38fd1498Szrj _mm_rot_epi64(__m128i __A,  __m128i __B)
240*38fd1498Szrj {
241*38fd1498Szrj   return (__m128i)  __builtin_ia32_vprotq ((__v2di)__A, (__v2di)__B);
242*38fd1498Szrj }
243*38fd1498Szrj 
244*38fd1498Szrj /* Rotates - Immediate form */
245*38fd1498Szrj 
246*38fd1498Szrj #ifdef __OPTIMIZE__
247*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_roti_epi8(__m128i __A,const int __B)248*38fd1498Szrj _mm_roti_epi8(__m128i __A, const int __B)
249*38fd1498Szrj {
250*38fd1498Szrj   return  (__m128i) __builtin_ia32_vprotbi ((__v16qi)__A, __B);
251*38fd1498Szrj }
252*38fd1498Szrj 
253*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_roti_epi16(__m128i __A,const int __B)254*38fd1498Szrj _mm_roti_epi16(__m128i __A, const int __B)
255*38fd1498Szrj {
256*38fd1498Szrj   return  (__m128i) __builtin_ia32_vprotwi ((__v8hi)__A, __B);
257*38fd1498Szrj }
258*38fd1498Szrj 
259*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_roti_epi32(__m128i __A,const int __B)260*38fd1498Szrj _mm_roti_epi32(__m128i __A, const int __B)
261*38fd1498Szrj {
262*38fd1498Szrj   return  (__m128i) __builtin_ia32_vprotdi ((__v4si)__A, __B);
263*38fd1498Szrj }
264*38fd1498Szrj 
265*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_roti_epi64(__m128i __A,const int __B)266*38fd1498Szrj _mm_roti_epi64(__m128i __A, const int __B)
267*38fd1498Szrj {
268*38fd1498Szrj   return  (__m128i) __builtin_ia32_vprotqi ((__v2di)__A, __B);
269*38fd1498Szrj }
270*38fd1498Szrj #else
/* Macro forms of the immediate rotates, used when __OPTIMIZE__ is not
   defined.  Each one casts A to __m128i and then to the builtin's
   vector type, casts N to int, and casts the result back to __m128i.  */
#define _mm_roti_epi8(A, N)					\
  ((__m128i) __builtin_ia32_vprotbi ((__v16qi)(__m128i)(A),	\
				     (int)(N)))
#define _mm_roti_epi16(A, N)					\
  ((__m128i) __builtin_ia32_vprotwi ((__v8hi)(__m128i)(A),	\
				     (int)(N)))
#define _mm_roti_epi32(A, N)					\
  ((__m128i) __builtin_ia32_vprotdi ((__v4si)(__m128i)(A),	\
				     (int)(N)))
#define _mm_roti_epi64(A, N)					\
  ((__m128i) __builtin_ia32_vprotqi ((__v2di)(__m128i)(A),	\
				     (int)(N)))
279*38fd1498Szrj #endif
280*38fd1498Szrj 
281*38fd1498Szrj /* Shifts */
282*38fd1498Szrj 
283*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shl_epi8(__m128i __A,__m128i __B)284*38fd1498Szrj _mm_shl_epi8(__m128i __A,  __m128i __B)
285*38fd1498Szrj {
286*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpshlb ((__v16qi)__A, (__v16qi)__B);
287*38fd1498Szrj }
288*38fd1498Szrj 
289*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shl_epi16(__m128i __A,__m128i __B)290*38fd1498Szrj _mm_shl_epi16(__m128i __A,  __m128i __B)
291*38fd1498Szrj {
292*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpshlw ((__v8hi)__A, (__v8hi)__B);
293*38fd1498Szrj }
294*38fd1498Szrj 
295*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shl_epi32(__m128i __A,__m128i __B)296*38fd1498Szrj _mm_shl_epi32(__m128i __A,  __m128i __B)
297*38fd1498Szrj {
298*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpshld ((__v4si)__A, (__v4si)__B);
299*38fd1498Szrj }
300*38fd1498Szrj 
301*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shl_epi64(__m128i __A,__m128i __B)302*38fd1498Szrj _mm_shl_epi64(__m128i __A,  __m128i __B)
303*38fd1498Szrj {
304*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpshlq ((__v2di)__A, (__v2di)__B);
305*38fd1498Szrj }
306*38fd1498Szrj 
307*38fd1498Szrj 
308*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha_epi8(__m128i __A,__m128i __B)309*38fd1498Szrj _mm_sha_epi8(__m128i __A,  __m128i __B)
310*38fd1498Szrj {
311*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpshab ((__v16qi)__A, (__v16qi)__B);
312*38fd1498Szrj }
313*38fd1498Szrj 
314*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha_epi16(__m128i __A,__m128i __B)315*38fd1498Szrj _mm_sha_epi16(__m128i __A,  __m128i __B)
316*38fd1498Szrj {
317*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpshaw ((__v8hi)__A, (__v8hi)__B);
318*38fd1498Szrj }
319*38fd1498Szrj 
320*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha_epi32(__m128i __A,__m128i __B)321*38fd1498Szrj _mm_sha_epi32(__m128i __A,  __m128i __B)
322*38fd1498Szrj {
323*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpshad ((__v4si)__A, (__v4si)__B);
324*38fd1498Szrj }
325*38fd1498Szrj 
326*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha_epi64(__m128i __A,__m128i __B)327*38fd1498Szrj _mm_sha_epi64(__m128i __A,  __m128i __B)
328*38fd1498Szrj {
329*38fd1498Szrj   return  (__m128i) __builtin_ia32_vpshaq ((__v2di)__A, (__v2di)__B);
330*38fd1498Szrj }
331*38fd1498Szrj 
332*38fd1498Szrj /* Compare and Predicate Generation
333*38fd1498Szrj    pcom (integer, unsigned bytes) */
334*38fd1498Szrj 
335*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comlt_epu8(__m128i __A,__m128i __B)336*38fd1498Szrj _mm_comlt_epu8(__m128i __A, __m128i __B)
337*38fd1498Szrj {
338*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomltub ((__v16qi)__A, (__v16qi)__B);
339*38fd1498Szrj }
340*38fd1498Szrj 
341*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comle_epu8(__m128i __A,__m128i __B)342*38fd1498Szrj _mm_comle_epu8(__m128i __A, __m128i __B)
343*38fd1498Szrj {
344*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomleub ((__v16qi)__A, (__v16qi)__B);
345*38fd1498Szrj }
346*38fd1498Szrj 
347*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comgt_epu8(__m128i __A,__m128i __B)348*38fd1498Szrj _mm_comgt_epu8(__m128i __A, __m128i __B)
349*38fd1498Szrj {
350*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomgtub ((__v16qi)__A, (__v16qi)__B);
351*38fd1498Szrj }
352*38fd1498Szrj 
353*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comge_epu8(__m128i __A,__m128i __B)354*38fd1498Szrj _mm_comge_epu8(__m128i __A, __m128i __B)
355*38fd1498Szrj {
356*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomgeub ((__v16qi)__A, (__v16qi)__B);
357*38fd1498Szrj }
358*38fd1498Szrj 
359*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comeq_epu8(__m128i __A,__m128i __B)360*38fd1498Szrj _mm_comeq_epu8(__m128i __A, __m128i __B)
361*38fd1498Szrj {
362*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomequb ((__v16qi)__A, (__v16qi)__B);
363*38fd1498Szrj }
364*38fd1498Szrj 
365*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comneq_epu8(__m128i __A,__m128i __B)366*38fd1498Szrj _mm_comneq_epu8(__m128i __A, __m128i __B)
367*38fd1498Szrj {
368*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomnequb ((__v16qi)__A, (__v16qi)__B);
369*38fd1498Szrj }
370*38fd1498Szrj 
371*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comfalse_epu8(__m128i __A,__m128i __B)372*38fd1498Szrj _mm_comfalse_epu8(__m128i __A, __m128i __B)
373*38fd1498Szrj {
374*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomfalseub ((__v16qi)__A, (__v16qi)__B);
375*38fd1498Szrj }
376*38fd1498Szrj 
377*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comtrue_epu8(__m128i __A,__m128i __B)378*38fd1498Szrj _mm_comtrue_epu8(__m128i __A, __m128i __B)
379*38fd1498Szrj {
380*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomtrueub ((__v16qi)__A, (__v16qi)__B);
381*38fd1498Szrj }
382*38fd1498Szrj 
/* pcom (integer, unsigned words) */
384*38fd1498Szrj 
385*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comlt_epu16(__m128i __A,__m128i __B)386*38fd1498Szrj _mm_comlt_epu16(__m128i __A, __m128i __B)
387*38fd1498Szrj {
388*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomltuw ((__v8hi)__A, (__v8hi)__B);
389*38fd1498Szrj }
390*38fd1498Szrj 
391*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comle_epu16(__m128i __A,__m128i __B)392*38fd1498Szrj _mm_comle_epu16(__m128i __A, __m128i __B)
393*38fd1498Szrj {
394*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomleuw ((__v8hi)__A, (__v8hi)__B);
395*38fd1498Szrj }
396*38fd1498Szrj 
397*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comgt_epu16(__m128i __A,__m128i __B)398*38fd1498Szrj _mm_comgt_epu16(__m128i __A, __m128i __B)
399*38fd1498Szrj {
400*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomgtuw ((__v8hi)__A, (__v8hi)__B);
401*38fd1498Szrj }
402*38fd1498Szrj 
403*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comge_epu16(__m128i __A,__m128i __B)404*38fd1498Szrj _mm_comge_epu16(__m128i __A, __m128i __B)
405*38fd1498Szrj {
406*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomgeuw ((__v8hi)__A, (__v8hi)__B);
407*38fd1498Szrj }
408*38fd1498Szrj 
409*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comeq_epu16(__m128i __A,__m128i __B)410*38fd1498Szrj _mm_comeq_epu16(__m128i __A, __m128i __B)
411*38fd1498Szrj {
412*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomequw ((__v8hi)__A, (__v8hi)__B);
413*38fd1498Szrj }
414*38fd1498Szrj 
415*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comneq_epu16(__m128i __A,__m128i __B)416*38fd1498Szrj _mm_comneq_epu16(__m128i __A, __m128i __B)
417*38fd1498Szrj {
418*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomnequw ((__v8hi)__A, (__v8hi)__B);
419*38fd1498Szrj }
420*38fd1498Szrj 
421*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comfalse_epu16(__m128i __A,__m128i __B)422*38fd1498Szrj _mm_comfalse_epu16(__m128i __A, __m128i __B)
423*38fd1498Szrj {
424*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomfalseuw ((__v8hi)__A, (__v8hi)__B);
425*38fd1498Szrj }
426*38fd1498Szrj 
427*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comtrue_epu16(__m128i __A,__m128i __B)428*38fd1498Szrj _mm_comtrue_epu16(__m128i __A, __m128i __B)
429*38fd1498Szrj {
430*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomtrueuw ((__v8hi)__A, (__v8hi)__B);
431*38fd1498Szrj }
432*38fd1498Szrj 
/* pcom (integer, unsigned double words) */
434*38fd1498Szrj 
435*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comlt_epu32(__m128i __A,__m128i __B)436*38fd1498Szrj _mm_comlt_epu32(__m128i __A, __m128i __B)
437*38fd1498Szrj {
438*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomltud ((__v4si)__A, (__v4si)__B);
439*38fd1498Szrj }
440*38fd1498Szrj 
441*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comle_epu32(__m128i __A,__m128i __B)442*38fd1498Szrj _mm_comle_epu32(__m128i __A, __m128i __B)
443*38fd1498Szrj {
444*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomleud ((__v4si)__A, (__v4si)__B);
445*38fd1498Szrj }
446*38fd1498Szrj 
447*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comgt_epu32(__m128i __A,__m128i __B)448*38fd1498Szrj _mm_comgt_epu32(__m128i __A, __m128i __B)
449*38fd1498Szrj {
450*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomgtud ((__v4si)__A, (__v4si)__B);
451*38fd1498Szrj }
452*38fd1498Szrj 
453*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comge_epu32(__m128i __A,__m128i __B)454*38fd1498Szrj _mm_comge_epu32(__m128i __A, __m128i __B)
455*38fd1498Szrj {
456*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomgeud ((__v4si)__A, (__v4si)__B);
457*38fd1498Szrj }
458*38fd1498Szrj 
459*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comeq_epu32(__m128i __A,__m128i __B)460*38fd1498Szrj _mm_comeq_epu32(__m128i __A, __m128i __B)
461*38fd1498Szrj {
462*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomequd ((__v4si)__A, (__v4si)__B);
463*38fd1498Szrj }
464*38fd1498Szrj 
465*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comneq_epu32(__m128i __A,__m128i __B)466*38fd1498Szrj _mm_comneq_epu32(__m128i __A, __m128i __B)
467*38fd1498Szrj {
468*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomnequd ((__v4si)__A, (__v4si)__B);
469*38fd1498Szrj }
470*38fd1498Szrj 
471*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comfalse_epu32(__m128i __A,__m128i __B)472*38fd1498Szrj _mm_comfalse_epu32(__m128i __A, __m128i __B)
473*38fd1498Szrj {
474*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomfalseud ((__v4si)__A, (__v4si)__B);
475*38fd1498Szrj }
476*38fd1498Szrj 
477*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comtrue_epu32(__m128i __A,__m128i __B)478*38fd1498Szrj _mm_comtrue_epu32(__m128i __A, __m128i __B)
479*38fd1498Szrj {
480*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomtrueud ((__v4si)__A, (__v4si)__B);
481*38fd1498Szrj }
482*38fd1498Szrj 
/* pcom (integer, unsigned quad words) */
484*38fd1498Szrj 
485*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comlt_epu64(__m128i __A,__m128i __B)486*38fd1498Szrj _mm_comlt_epu64(__m128i __A, __m128i __B)
487*38fd1498Szrj {
488*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomltuq ((__v2di)__A, (__v2di)__B);
489*38fd1498Szrj }
490*38fd1498Szrj 
491*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comle_epu64(__m128i __A,__m128i __B)492*38fd1498Szrj _mm_comle_epu64(__m128i __A, __m128i __B)
493*38fd1498Szrj {
494*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomleuq ((__v2di)__A, (__v2di)__B);
495*38fd1498Szrj }
496*38fd1498Szrj 
497*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comgt_epu64(__m128i __A,__m128i __B)498*38fd1498Szrj _mm_comgt_epu64(__m128i __A, __m128i __B)
499*38fd1498Szrj {
500*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomgtuq ((__v2di)__A, (__v2di)__B);
501*38fd1498Szrj }
502*38fd1498Szrj 
503*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comge_epu64(__m128i __A,__m128i __B)504*38fd1498Szrj _mm_comge_epu64(__m128i __A, __m128i __B)
505*38fd1498Szrj {
506*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomgeuq ((__v2di)__A, (__v2di)__B);
507*38fd1498Szrj }
508*38fd1498Szrj 
509*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comeq_epu64(__m128i __A,__m128i __B)510*38fd1498Szrj _mm_comeq_epu64(__m128i __A, __m128i __B)
511*38fd1498Szrj {
512*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomequq ((__v2di)__A, (__v2di)__B);
513*38fd1498Szrj }
514*38fd1498Szrj 
515*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comneq_epu64(__m128i __A,__m128i __B)516*38fd1498Szrj _mm_comneq_epu64(__m128i __A, __m128i __B)
517*38fd1498Szrj {
518*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomnequq ((__v2di)__A, (__v2di)__B);
519*38fd1498Szrj }
520*38fd1498Szrj 
521*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comfalse_epu64(__m128i __A,__m128i __B)522*38fd1498Szrj _mm_comfalse_epu64(__m128i __A, __m128i __B)
523*38fd1498Szrj {
524*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomfalseuq ((__v2di)__A, (__v2di)__B);
525*38fd1498Szrj }
526*38fd1498Szrj 
527*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comtrue_epu64(__m128i __A,__m128i __B)528*38fd1498Szrj _mm_comtrue_epu64(__m128i __A, __m128i __B)
529*38fd1498Szrj {
530*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomtrueuq ((__v2di)__A, (__v2di)__B);
531*38fd1498Szrj }
532*38fd1498Szrj 
/* pcom (integer, signed bytes) */
534*38fd1498Szrj 
535*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comlt_epi8(__m128i __A,__m128i __B)536*38fd1498Szrj _mm_comlt_epi8(__m128i __A, __m128i __B)
537*38fd1498Szrj {
538*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomltb ((__v16qi)__A, (__v16qi)__B);
539*38fd1498Szrj }
540*38fd1498Szrj 
541*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comle_epi8(__m128i __A,__m128i __B)542*38fd1498Szrj _mm_comle_epi8(__m128i __A, __m128i __B)
543*38fd1498Szrj {
544*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomleb ((__v16qi)__A, (__v16qi)__B);
545*38fd1498Szrj }
546*38fd1498Szrj 
547*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comgt_epi8(__m128i __A,__m128i __B)548*38fd1498Szrj _mm_comgt_epi8(__m128i __A, __m128i __B)
549*38fd1498Szrj {
550*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomgtb ((__v16qi)__A, (__v16qi)__B);
551*38fd1498Szrj }
552*38fd1498Szrj 
553*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comge_epi8(__m128i __A,__m128i __B)554*38fd1498Szrj _mm_comge_epi8(__m128i __A, __m128i __B)
555*38fd1498Szrj {
556*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomgeb ((__v16qi)__A, (__v16qi)__B);
557*38fd1498Szrj }
558*38fd1498Szrj 
559*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comeq_epi8(__m128i __A,__m128i __B)560*38fd1498Szrj _mm_comeq_epi8(__m128i __A, __m128i __B)
561*38fd1498Szrj {
562*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomeqb ((__v16qi)__A, (__v16qi)__B);
563*38fd1498Szrj }
564*38fd1498Szrj 
565*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comneq_epi8(__m128i __A,__m128i __B)566*38fd1498Szrj _mm_comneq_epi8(__m128i __A, __m128i __B)
567*38fd1498Szrj {
568*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomneqb ((__v16qi)__A, (__v16qi)__B);
569*38fd1498Szrj }
570*38fd1498Szrj 
571*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comfalse_epi8(__m128i __A,__m128i __B)572*38fd1498Szrj _mm_comfalse_epi8(__m128i __A, __m128i __B)
573*38fd1498Szrj {
574*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomfalseb ((__v16qi)__A, (__v16qi)__B);
575*38fd1498Szrj }
576*38fd1498Szrj 
577*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comtrue_epi8(__m128i __A,__m128i __B)578*38fd1498Szrj _mm_comtrue_epi8(__m128i __A, __m128i __B)
579*38fd1498Szrj {
580*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomtrueb ((__v16qi)__A, (__v16qi)__B);
581*38fd1498Szrj }
582*38fd1498Szrj 
/* pcom (integer, signed words) */
584*38fd1498Szrj 
585*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comlt_epi16(__m128i __A,__m128i __B)586*38fd1498Szrj _mm_comlt_epi16(__m128i __A, __m128i __B)
587*38fd1498Szrj {
588*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomltw ((__v8hi)__A, (__v8hi)__B);
589*38fd1498Szrj }
590*38fd1498Szrj 
591*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comle_epi16(__m128i __A,__m128i __B)592*38fd1498Szrj _mm_comle_epi16(__m128i __A, __m128i __B)
593*38fd1498Szrj {
594*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomlew ((__v8hi)__A, (__v8hi)__B);
595*38fd1498Szrj }
596*38fd1498Szrj 
597*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comgt_epi16(__m128i __A,__m128i __B)598*38fd1498Szrj _mm_comgt_epi16(__m128i __A, __m128i __B)
599*38fd1498Szrj {
600*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomgtw ((__v8hi)__A, (__v8hi)__B);
601*38fd1498Szrj }
602*38fd1498Szrj 
603*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comge_epi16(__m128i __A,__m128i __B)604*38fd1498Szrj _mm_comge_epi16(__m128i __A, __m128i __B)
605*38fd1498Szrj {
606*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomgew ((__v8hi)__A, (__v8hi)__B);
607*38fd1498Szrj }
608*38fd1498Szrj 
609*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comeq_epi16(__m128i __A,__m128i __B)610*38fd1498Szrj _mm_comeq_epi16(__m128i __A, __m128i __B)
611*38fd1498Szrj {
612*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomeqw ((__v8hi)__A, (__v8hi)__B);
613*38fd1498Szrj }
614*38fd1498Szrj 
615*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comneq_epi16(__m128i __A,__m128i __B)616*38fd1498Szrj _mm_comneq_epi16(__m128i __A, __m128i __B)
617*38fd1498Szrj {
618*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomneqw ((__v8hi)__A, (__v8hi)__B);
619*38fd1498Szrj }
620*38fd1498Szrj 
621*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comfalse_epi16(__m128i __A,__m128i __B)622*38fd1498Szrj _mm_comfalse_epi16(__m128i __A, __m128i __B)
623*38fd1498Szrj {
624*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomfalsew ((__v8hi)__A, (__v8hi)__B);
625*38fd1498Szrj }
626*38fd1498Szrj 
627*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comtrue_epi16(__m128i __A,__m128i __B)628*38fd1498Szrj _mm_comtrue_epi16(__m128i __A, __m128i __B)
629*38fd1498Szrj {
630*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomtruew ((__v8hi)__A, (__v8hi)__B);
631*38fd1498Szrj }
632*38fd1498Szrj 
/* pcom (integer, signed double words) */
634*38fd1498Szrj 
635*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comlt_epi32(__m128i __A,__m128i __B)636*38fd1498Szrj _mm_comlt_epi32(__m128i __A, __m128i __B)
637*38fd1498Szrj {
638*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomltd ((__v4si)__A, (__v4si)__B);
639*38fd1498Szrj }
640*38fd1498Szrj 
641*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comle_epi32(__m128i __A,__m128i __B)642*38fd1498Szrj _mm_comle_epi32(__m128i __A, __m128i __B)
643*38fd1498Szrj {
644*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomled ((__v4si)__A, (__v4si)__B);
645*38fd1498Szrj }
646*38fd1498Szrj 
647*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comgt_epi32(__m128i __A,__m128i __B)648*38fd1498Szrj _mm_comgt_epi32(__m128i __A, __m128i __B)
649*38fd1498Szrj {
650*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomgtd ((__v4si)__A, (__v4si)__B);
651*38fd1498Szrj }
652*38fd1498Szrj 
653*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comge_epi32(__m128i __A,__m128i __B)654*38fd1498Szrj _mm_comge_epi32(__m128i __A, __m128i __B)
655*38fd1498Szrj {
656*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomged ((__v4si)__A, (__v4si)__B);
657*38fd1498Szrj }
658*38fd1498Szrj 
659*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comeq_epi32(__m128i __A,__m128i __B)660*38fd1498Szrj _mm_comeq_epi32(__m128i __A, __m128i __B)
661*38fd1498Szrj {
662*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomeqd ((__v4si)__A, (__v4si)__B);
663*38fd1498Szrj }
664*38fd1498Szrj 
665*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comneq_epi32(__m128i __A,__m128i __B)666*38fd1498Szrj _mm_comneq_epi32(__m128i __A, __m128i __B)
667*38fd1498Szrj {
668*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomneqd ((__v4si)__A, (__v4si)__B);
669*38fd1498Szrj }
670*38fd1498Szrj 
671*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comfalse_epi32(__m128i __A,__m128i __B)672*38fd1498Szrj _mm_comfalse_epi32(__m128i __A, __m128i __B)
673*38fd1498Szrj {
674*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomfalsed ((__v4si)__A, (__v4si)__B);
675*38fd1498Szrj }
676*38fd1498Szrj 
677*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comtrue_epi32(__m128i __A,__m128i __B)678*38fd1498Szrj _mm_comtrue_epi32(__m128i __A, __m128i __B)
679*38fd1498Szrj {
680*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomtrued ((__v4si)__A, (__v4si)__B);
681*38fd1498Szrj }
682*38fd1498Szrj 
/* pcom (integer, signed quad words) */
684*38fd1498Szrj 
685*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comlt_epi64(__m128i __A,__m128i __B)686*38fd1498Szrj _mm_comlt_epi64(__m128i __A, __m128i __B)
687*38fd1498Szrj {
688*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomltq ((__v2di)__A, (__v2di)__B);
689*38fd1498Szrj }
690*38fd1498Szrj 
691*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comle_epi64(__m128i __A,__m128i __B)692*38fd1498Szrj _mm_comle_epi64(__m128i __A, __m128i __B)
693*38fd1498Szrj {
694*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomleq ((__v2di)__A, (__v2di)__B);
695*38fd1498Szrj }
696*38fd1498Szrj 
697*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comgt_epi64(__m128i __A,__m128i __B)698*38fd1498Szrj _mm_comgt_epi64(__m128i __A, __m128i __B)
699*38fd1498Szrj {
700*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomgtq ((__v2di)__A, (__v2di)__B);
701*38fd1498Szrj }
702*38fd1498Szrj 
703*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comge_epi64(__m128i __A,__m128i __B)704*38fd1498Szrj _mm_comge_epi64(__m128i __A, __m128i __B)
705*38fd1498Szrj {
706*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomgeq ((__v2di)__A, (__v2di)__B);
707*38fd1498Szrj }
708*38fd1498Szrj 
709*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comeq_epi64(__m128i __A,__m128i __B)710*38fd1498Szrj _mm_comeq_epi64(__m128i __A, __m128i __B)
711*38fd1498Szrj {
712*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomeqq ((__v2di)__A, (__v2di)__B);
713*38fd1498Szrj }
714*38fd1498Szrj 
715*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comneq_epi64(__m128i __A,__m128i __B)716*38fd1498Szrj _mm_comneq_epi64(__m128i __A, __m128i __B)
717*38fd1498Szrj {
718*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomneqq ((__v2di)__A, (__v2di)__B);
719*38fd1498Szrj }
720*38fd1498Szrj 
721*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comfalse_epi64(__m128i __A,__m128i __B)722*38fd1498Szrj _mm_comfalse_epi64(__m128i __A, __m128i __B)
723*38fd1498Szrj {
724*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomfalseq ((__v2di)__A, (__v2di)__B);
725*38fd1498Szrj }
726*38fd1498Szrj 
727*38fd1498Szrj extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comtrue_epi64(__m128i __A,__m128i __B)728*38fd1498Szrj _mm_comtrue_epi64(__m128i __A, __m128i __B)
729*38fd1498Szrj {
730*38fd1498Szrj   return (__m128i) __builtin_ia32_vpcomtrueq ((__v2di)__A, (__v2di)__B);
731*38fd1498Szrj }
732*38fd1498Szrj 
733*38fd1498Szrj /* FRCZ */
734*38fd1498Szrj 
735*38fd1498Szrj extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_frcz_ps(__m128 __A)736*38fd1498Szrj _mm_frcz_ps (__m128 __A)
737*38fd1498Szrj {
738*38fd1498Szrj   return (__m128) __builtin_ia32_vfrczps ((__v4sf)__A);
739*38fd1498Szrj }
740*38fd1498Szrj 
741*38fd1498Szrj extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_frcz_pd(__m128d __A)742*38fd1498Szrj _mm_frcz_pd (__m128d __A)
743*38fd1498Szrj {
744*38fd1498Szrj   return (__m128d) __builtin_ia32_vfrczpd ((__v2df)__A);
745*38fd1498Szrj }
746*38fd1498Szrj 
747*38fd1498Szrj extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_frcz_ss(__m128 __A,__m128 __B)748*38fd1498Szrj _mm_frcz_ss (__m128 __A, __m128 __B)
749*38fd1498Szrj {
750*38fd1498Szrj   return (__m128) __builtin_ia32_movss ((__v4sf)__A,
751*38fd1498Szrj 					(__v4sf)
752*38fd1498Szrj 					__builtin_ia32_vfrczss ((__v4sf)__B));
753*38fd1498Szrj }
754*38fd1498Szrj 
755*38fd1498Szrj extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_frcz_sd(__m128d __A,__m128d __B)756*38fd1498Szrj _mm_frcz_sd (__m128d __A, __m128d __B)
757*38fd1498Szrj {
758*38fd1498Szrj   return (__m128d) __builtin_ia32_movsd ((__v2df)__A,
759*38fd1498Szrj 					 (__v2df)
760*38fd1498Szrj 					 __builtin_ia32_vfrczsd ((__v2df)__B));
761*38fd1498Szrj }
762*38fd1498Szrj 
763*38fd1498Szrj extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_frcz_ps(__m256 __A)764*38fd1498Szrj _mm256_frcz_ps (__m256 __A)
765*38fd1498Szrj {
766*38fd1498Szrj   return (__m256) __builtin_ia32_vfrczps256 ((__v8sf)__A);
767*38fd1498Szrj }
768*38fd1498Szrj 
769*38fd1498Szrj extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_frcz_pd(__m256d __A)770*38fd1498Szrj _mm256_frcz_pd (__m256d __A)
771*38fd1498Szrj {
772*38fd1498Szrj   return (__m256d) __builtin_ia32_vfrczpd256 ((__v4df)__A);
773*38fd1498Szrj }
774*38fd1498Szrj 
775*38fd1498Szrj /* PERMIL2 */
776*38fd1498Szrj 
777*38fd1498Szrj #ifdef __OPTIMIZE__
778*38fd1498Szrj extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute2_pd(__m128d __X,__m128d __Y,__m128i __C,const int __I)779*38fd1498Szrj _mm_permute2_pd (__m128d __X, __m128d __Y, __m128i __C, const int __I)
780*38fd1498Szrj {
781*38fd1498Szrj   return (__m128d) __builtin_ia32_vpermil2pd ((__v2df)__X,
782*38fd1498Szrj 					      (__v2df)__Y,
783*38fd1498Szrj 					      (__v2di)__C,
784*38fd1498Szrj 					      __I);
785*38fd1498Szrj }
786*38fd1498Szrj 
787*38fd1498Szrj extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2_pd(__m256d __X,__m256d __Y,__m256i __C,const int __I)788*38fd1498Szrj _mm256_permute2_pd (__m256d __X, __m256d __Y, __m256i __C, const int __I)
789*38fd1498Szrj {
790*38fd1498Szrj   return (__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)__X,
791*38fd1498Szrj 						 (__v4df)__Y,
792*38fd1498Szrj 						 (__v4di)__C,
793*38fd1498Szrj 						 __I);
794*38fd1498Szrj }
795*38fd1498Szrj 
796*38fd1498Szrj extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute2_ps(__m128 __X,__m128 __Y,__m128i __C,const int __I)797*38fd1498Szrj _mm_permute2_ps (__m128 __X, __m128 __Y, __m128i __C, const int __I)
798*38fd1498Szrj {
799*38fd1498Szrj   return (__m128) __builtin_ia32_vpermil2ps ((__v4sf)__X,
800*38fd1498Szrj 					     (__v4sf)__Y,
801*38fd1498Szrj 					     (__v4si)__C,
802*38fd1498Szrj 					     __I);
803*38fd1498Szrj }
804*38fd1498Szrj 
805*38fd1498Szrj extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2_ps(__m256 __X,__m256 __Y,__m256i __C,const int __I)806*38fd1498Szrj _mm256_permute2_ps (__m256 __X, __m256 __Y, __m256i __C, const int __I)
807*38fd1498Szrj {
808*38fd1498Szrj   return (__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)__X,
809*38fd1498Szrj 						(__v8sf)__Y,
810*38fd1498Szrj 						(__v8si)__C,
811*38fd1498Szrj 						__I);
812*38fd1498Szrj }
813*38fd1498Szrj #else
/* Macro form of _mm_permute2_pd, used when __OPTIMIZE__ is not defined.
   X and Y are viewed as __m128d; C is viewed as __m128i (an integer vector)
   before the __v2di cast, matching the `__m128i __C' parameter of the
   inline-function form.  I is converted to int and passed to the builtin.  */
#define _mm_permute2_pd(X, Y, C, I)					\
  ((__m128d) __builtin_ia32_vpermil2pd ((__v2df)(__m128d)(X),		\
					(__v2df)(__m128d)(Y),		\
					(__v2di)(__m128i)(C),		\
					(int)(I)))
819*38fd1498Szrj 
/* Macro form of _mm256_permute2_pd, used when __OPTIMIZE__ is not defined.
   X and Y are viewed as __m256d; C is viewed as __m256i (an integer vector)
   before the __v4di cast, matching the `__m256i __C' parameter of the
   inline-function form.  I is converted to int and passed to the builtin.  */
#define _mm256_permute2_pd(X, Y, C, I)					\
  ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)(__m256d)(X),	\
					   (__v4df)(__m256d)(Y),	\
					   (__v4di)(__m256i)(C),	\
					   (int)(I)))
825*38fd1498Szrj 
/* Macro form of _mm_permute2_ps, used when __OPTIMIZE__ is not defined.
   X and Y are viewed as __m128; C is viewed as __m128i (an integer vector)
   before the __v4si cast, matching the `__m128i __C' parameter of the
   inline-function form.  I is converted to int and passed to the builtin.  */
#define _mm_permute2_ps(X, Y, C, I)					\
  ((__m128) __builtin_ia32_vpermil2ps ((__v4sf)(__m128)(X),		\
				       (__v4sf)(__m128)(Y),		\
				       (__v4si)(__m128i)(C),		\
				       (int)(I)))
831*38fd1498Szrj 
/* Macro form of _mm256_permute2_ps, used when __OPTIMIZE__ is not defined.
   X and Y are viewed as __m256; C is viewed as __m256i (an integer vector)
   before the __v8si cast, matching the `__m256i __C' parameter of the
   inline-function form.  I is converted to int and passed to the builtin.  */
#define _mm256_permute2_ps(X, Y, C, I)					\
  ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X),		\
					  (__v8sf)(__m256)(Y),		\
					  (__v8si)(__m256i)(C),		\
					  (int)(I)))
837*38fd1498Szrj #endif /* __OPTIMIZE__ */
838*38fd1498Szrj 
839*38fd1498Szrj #ifdef __DISABLE_XOP__
840*38fd1498Szrj #undef __DISABLE_XOP__
841*38fd1498Szrj #pragma GCC pop_options
842*38fd1498Szrj #endif /* __DISABLE_XOP__ */
843*38fd1498Szrj 
844*38fd1498Szrj #endif /* _XOPMMINTRIN_H_INCLUDED */
845