1*38fd1498Szrj /* Copyright (C) 2011-2018 Free Software Foundation, Inc.
2*38fd1498Szrj
3*38fd1498Szrj This file is part of GCC.
4*38fd1498Szrj
5*38fd1498Szrj GCC is free software; you can redistribute it and/or modify
6*38fd1498Szrj it under the terms of the GNU General Public License as published by
7*38fd1498Szrj the Free Software Foundation; either version 3, or (at your option)
8*38fd1498Szrj any later version.
9*38fd1498Szrj
10*38fd1498Szrj GCC is distributed in the hope that it will be useful,
11*38fd1498Szrj but WITHOUT ANY WARRANTY; without even the implied warranty of
12*38fd1498Szrj MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13*38fd1498Szrj GNU General Public License for more details.
14*38fd1498Szrj
15*38fd1498Szrj Under Section 7 of GPL version 3, you are granted additional
16*38fd1498Szrj permissions described in the GCC Runtime Library Exception, version
17*38fd1498Szrj 3.1, as published by the Free Software Foundation.
18*38fd1498Szrj
19*38fd1498Szrj You should have received a copy of the GNU General Public License and
20*38fd1498Szrj a copy of the GCC Runtime Library Exception along with this program;
21*38fd1498Szrj see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22*38fd1498Szrj <http://www.gnu.org/licenses/>. */
23*38fd1498Szrj
24*38fd1498Szrj #ifndef _IMMINTRIN_H_INCLUDED
25*38fd1498Szrj # error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
26*38fd1498Szrj #endif
27*38fd1498Szrj
28*38fd1498Szrj #ifndef _AVX2INTRIN_H_INCLUDED
29*38fd1498Szrj #define _AVX2INTRIN_H_INCLUDED
30*38fd1498Szrj
31*38fd1498Szrj #ifndef __AVX2__
32*38fd1498Szrj #pragma GCC push_options
33*38fd1498Szrj #pragma GCC target("avx2")
34*38fd1498Szrj #define __DISABLE_AVX2__
35*38fd1498Szrj #endif /* __AVX2__ */
36*38fd1498Szrj
/* Sum of absolute differences over adjacent groups of four 8-bit
   integers taken from the first two operands.  The third (mask)
   operand determines the starting offsets within the operands.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
{
  __v32qi __lhs = (__v32qi) __X;
  __v32qi __rhs = (__v32qi) __Y;

  /* __M is handed to the builtin unchanged.  */
  return (__m256i) __builtin_ia32_mpsadbw256 (__lhs, __rhs, __M);
}
#else
/* Macro form of the same builtin call, used when __OPTIMIZE__ is not
   defined.  */
#define _mm256_mpsadbw_epu8(X, Y, M) \
  ((__m256i) __builtin_ia32_mpsadbw256 ( \
     (__v32qi)(__m256i)(X), \
     (__v32qi)(__m256i)(Y), \
     (int)(M)))
#endif
53*38fd1498Szrj
54*38fd1498Szrj extern __inline __m256i
55*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi8(__m256i __A)56*38fd1498Szrj _mm256_abs_epi8 (__m256i __A)
57*38fd1498Szrj {
58*38fd1498Szrj return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
59*38fd1498Szrj }
60*38fd1498Szrj
61*38fd1498Szrj extern __inline __m256i
62*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi16(__m256i __A)63*38fd1498Szrj _mm256_abs_epi16 (__m256i __A)
64*38fd1498Szrj {
65*38fd1498Szrj return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
66*38fd1498Szrj }
67*38fd1498Szrj
68*38fd1498Szrj extern __inline __m256i
69*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi32(__m256i __A)70*38fd1498Szrj _mm256_abs_epi32 (__m256i __A)
71*38fd1498Szrj {
72*38fd1498Szrj return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
73*38fd1498Szrj }
74*38fd1498Szrj
75*38fd1498Szrj extern __inline __m256i
76*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi32(__m256i __A,__m256i __B)77*38fd1498Szrj _mm256_packs_epi32 (__m256i __A, __m256i __B)
78*38fd1498Szrj {
79*38fd1498Szrj return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
80*38fd1498Szrj }
81*38fd1498Szrj
82*38fd1498Szrj extern __inline __m256i
83*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi16(__m256i __A,__m256i __B)84*38fd1498Szrj _mm256_packs_epi16 (__m256i __A, __m256i __B)
85*38fd1498Szrj {
86*38fd1498Szrj return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
87*38fd1498Szrj }
88*38fd1498Szrj
89*38fd1498Szrj extern __inline __m256i
90*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi32(__m256i __A,__m256i __B)91*38fd1498Szrj _mm256_packus_epi32 (__m256i __A, __m256i __B)
92*38fd1498Szrj {
93*38fd1498Szrj return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
94*38fd1498Szrj }
95*38fd1498Szrj
96*38fd1498Szrj extern __inline __m256i
97*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi16(__m256i __A,__m256i __B)98*38fd1498Szrj _mm256_packus_epi16 (__m256i __A, __m256i __B)
99*38fd1498Szrj {
100*38fd1498Szrj return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
101*38fd1498Szrj }
102*38fd1498Szrj
103*38fd1498Szrj extern __inline __m256i
104*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi8(__m256i __A,__m256i __B)105*38fd1498Szrj _mm256_add_epi8 (__m256i __A, __m256i __B)
106*38fd1498Szrj {
107*38fd1498Szrj return (__m256i) ((__v32qu)__A + (__v32qu)__B);
108*38fd1498Szrj }
109*38fd1498Szrj
110*38fd1498Szrj extern __inline __m256i
111*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi16(__m256i __A,__m256i __B)112*38fd1498Szrj _mm256_add_epi16 (__m256i __A, __m256i __B)
113*38fd1498Szrj {
114*38fd1498Szrj return (__m256i) ((__v16hu)__A + (__v16hu)__B);
115*38fd1498Szrj }
116*38fd1498Szrj
117*38fd1498Szrj extern __inline __m256i
118*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi32(__m256i __A,__m256i __B)119*38fd1498Szrj _mm256_add_epi32 (__m256i __A, __m256i __B)
120*38fd1498Szrj {
121*38fd1498Szrj return (__m256i) ((__v8su)__A + (__v8su)__B);
122*38fd1498Szrj }
123*38fd1498Szrj
124*38fd1498Szrj extern __inline __m256i
125*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi64(__m256i __A,__m256i __B)126*38fd1498Szrj _mm256_add_epi64 (__m256i __A, __m256i __B)
127*38fd1498Szrj {
128*38fd1498Szrj return (__m256i) ((__v4du)__A + (__v4du)__B);
129*38fd1498Szrj }
130*38fd1498Szrj
131*38fd1498Szrj extern __inline __m256i
132*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi8(__m256i __A,__m256i __B)133*38fd1498Szrj _mm256_adds_epi8 (__m256i __A, __m256i __B)
134*38fd1498Szrj {
135*38fd1498Szrj return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
136*38fd1498Szrj }
137*38fd1498Szrj
138*38fd1498Szrj extern __inline __m256i
139*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi16(__m256i __A,__m256i __B)140*38fd1498Szrj _mm256_adds_epi16 (__m256i __A, __m256i __B)
141*38fd1498Szrj {
142*38fd1498Szrj return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
143*38fd1498Szrj }
144*38fd1498Szrj
145*38fd1498Szrj extern __inline __m256i
146*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu8(__m256i __A,__m256i __B)147*38fd1498Szrj _mm256_adds_epu8 (__m256i __A, __m256i __B)
148*38fd1498Szrj {
149*38fd1498Szrj return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
150*38fd1498Szrj }
151*38fd1498Szrj
152*38fd1498Szrj extern __inline __m256i
153*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu16(__m256i __A,__m256i __B)154*38fd1498Szrj _mm256_adds_epu16 (__m256i __A, __m256i __B)
155*38fd1498Szrj {
156*38fd1498Szrj return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
157*38fd1498Szrj }
158*38fd1498Szrj
#ifdef __OPTIMIZE__
/* Apply __builtin_ia32_palignr256 to __A and __B (as __v4di vectors),
   passing __N * 8 as the third argument.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
{
  __v4di __lhs = (__v4di) __A;
  __v4di __rhs = (__v4di) __B;

  return (__m256i) __builtin_ia32_palignr256 (__lhs, __rhs, __N * 8);
}
#else
/* Without __OPTIMIZE__ the product (__N * 8) would end up in a virtual
   register and the insn would not be matched, so a macro is used
   instead of an inline function.  */
#define _mm256_alignr_epi8(A, B, N) \
  ((__m256i) __builtin_ia32_palignr256 ( \
     (__v4di)(__m256i)(A), \
     (__v4di)(__m256i)(B), \
     (int)(N) * 8))
#endif
176*38fd1498Szrj
177*38fd1498Szrj extern __inline __m256i
178*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_si256(__m256i __A,__m256i __B)179*38fd1498Szrj _mm256_and_si256 (__m256i __A, __m256i __B)
180*38fd1498Szrj {
181*38fd1498Szrj return (__m256i) ((__v4du)__A & (__v4du)__B);
182*38fd1498Szrj }
183*38fd1498Szrj
184*38fd1498Szrj extern __inline __m256i
185*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_si256(__m256i __A,__m256i __B)186*38fd1498Szrj _mm256_andnot_si256 (__m256i __A, __m256i __B)
187*38fd1498Szrj {
188*38fd1498Szrj return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
189*38fd1498Szrj }
190*38fd1498Szrj
191*38fd1498Szrj extern __inline __m256i
192*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu8(__m256i __A,__m256i __B)193*38fd1498Szrj _mm256_avg_epu8 (__m256i __A, __m256i __B)
194*38fd1498Szrj {
195*38fd1498Szrj return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
196*38fd1498Szrj }
197*38fd1498Szrj
198*38fd1498Szrj extern __inline __m256i
199*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu16(__m256i __A,__m256i __B)200*38fd1498Szrj _mm256_avg_epu16 (__m256i __A, __m256i __B)
201*38fd1498Szrj {
202*38fd1498Szrj return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
203*38fd1498Szrj }
204*38fd1498Szrj
205*38fd1498Szrj extern __inline __m256i
206*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_epi8(__m256i __X,__m256i __Y,__m256i __M)207*38fd1498Szrj _mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
208*38fd1498Szrj {
209*38fd1498Szrj return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
210*38fd1498Szrj (__v32qi)__Y,
211*38fd1498Szrj (__v32qi)__M);
212*38fd1498Szrj }
213*38fd1498Szrj
#ifdef __OPTIMIZE__
/* Apply __builtin_ia32_pblendw256 to __X and __Y (as __v16hi vectors),
   passing __M through unchanged as the third argument.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
{
  __v16hi __lhs = (__v16hi) __X;
  __v16hi __rhs = (__v16hi) __Y;

  return (__m256i) __builtin_ia32_pblendw256 (__lhs, __rhs, __M);
}
#else
/* Macro form of the same builtin call, used when __OPTIMIZE__ is not
   defined.  */
#define _mm256_blend_epi16(X, Y, M) \
  ((__m256i) __builtin_ia32_pblendw256 ( \
     (__v16hi)(__m256i)(X), \
     (__v16hi)(__m256i)(Y), \
     (int)(M)))
#endif
228*38fd1498Szrj
229*38fd1498Szrj extern __inline __m256i
230*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi8(__m256i __A,__m256i __B)231*38fd1498Szrj _mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
232*38fd1498Szrj {
233*38fd1498Szrj return (__m256i) ((__v32qi)__A == (__v32qi)__B);
234*38fd1498Szrj }
235*38fd1498Szrj
236*38fd1498Szrj extern __inline __m256i
237*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi16(__m256i __A,__m256i __B)238*38fd1498Szrj _mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
239*38fd1498Szrj {
240*38fd1498Szrj return (__m256i) ((__v16hi)__A == (__v16hi)__B);
241*38fd1498Szrj }
242*38fd1498Szrj
243*38fd1498Szrj extern __inline __m256i
244*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi32(__m256i __A,__m256i __B)245*38fd1498Szrj _mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
246*38fd1498Szrj {
247*38fd1498Szrj return (__m256i) ((__v8si)__A == (__v8si)__B);
248*38fd1498Szrj }
249*38fd1498Szrj
250*38fd1498Szrj extern __inline __m256i
251*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi64(__m256i __A,__m256i __B)252*38fd1498Szrj _mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
253*38fd1498Szrj {
254*38fd1498Szrj return (__m256i) ((__v4di)__A == (__v4di)__B);
255*38fd1498Szrj }
256*38fd1498Szrj
257*38fd1498Szrj extern __inline __m256i
258*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi8(__m256i __A,__m256i __B)259*38fd1498Szrj _mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
260*38fd1498Szrj {
261*38fd1498Szrj return (__m256i) ((__v32qi)__A > (__v32qi)__B);
262*38fd1498Szrj }
263*38fd1498Szrj
264*38fd1498Szrj extern __inline __m256i
265*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi16(__m256i __A,__m256i __B)266*38fd1498Szrj _mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
267*38fd1498Szrj {
268*38fd1498Szrj return (__m256i) ((__v16hi)__A > (__v16hi)__B);
269*38fd1498Szrj }
270*38fd1498Szrj
271*38fd1498Szrj extern __inline __m256i
272*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi32(__m256i __A,__m256i __B)273*38fd1498Szrj _mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
274*38fd1498Szrj {
275*38fd1498Szrj return (__m256i) ((__v8si)__A > (__v8si)__B);
276*38fd1498Szrj }
277*38fd1498Szrj
278*38fd1498Szrj extern __inline __m256i
279*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi64(__m256i __A,__m256i __B)280*38fd1498Szrj _mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
281*38fd1498Szrj {
282*38fd1498Szrj return (__m256i) ((__v4di)__A > (__v4di)__B);
283*38fd1498Szrj }
284*38fd1498Szrj
285*38fd1498Szrj extern __inline __m256i
286*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi16(__m256i __X,__m256i __Y)287*38fd1498Szrj _mm256_hadd_epi16 (__m256i __X, __m256i __Y)
288*38fd1498Szrj {
289*38fd1498Szrj return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
290*38fd1498Szrj (__v16hi)__Y);
291*38fd1498Szrj }
292*38fd1498Szrj
293*38fd1498Szrj extern __inline __m256i
294*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi32(__m256i __X,__m256i __Y)295*38fd1498Szrj _mm256_hadd_epi32 (__m256i __X, __m256i __Y)
296*38fd1498Szrj {
297*38fd1498Szrj return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
298*38fd1498Szrj }
299*38fd1498Szrj
300*38fd1498Szrj extern __inline __m256i
301*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadds_epi16(__m256i __X,__m256i __Y)302*38fd1498Szrj _mm256_hadds_epi16 (__m256i __X, __m256i __Y)
303*38fd1498Szrj {
304*38fd1498Szrj return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
305*38fd1498Szrj (__v16hi)__Y);
306*38fd1498Szrj }
307*38fd1498Szrj
308*38fd1498Szrj extern __inline __m256i
309*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi16(__m256i __X,__m256i __Y)310*38fd1498Szrj _mm256_hsub_epi16 (__m256i __X, __m256i __Y)
311*38fd1498Szrj {
312*38fd1498Szrj return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
313*38fd1498Szrj (__v16hi)__Y);
314*38fd1498Szrj }
315*38fd1498Szrj
316*38fd1498Szrj extern __inline __m256i
317*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi32(__m256i __X,__m256i __Y)318*38fd1498Szrj _mm256_hsub_epi32 (__m256i __X, __m256i __Y)
319*38fd1498Szrj {
320*38fd1498Szrj return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
321*38fd1498Szrj }
322*38fd1498Szrj
323*38fd1498Szrj extern __inline __m256i
324*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsubs_epi16(__m256i __X,__m256i __Y)325*38fd1498Szrj _mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
326*38fd1498Szrj {
327*38fd1498Szrj return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
328*38fd1498Szrj (__v16hi)__Y);
329*38fd1498Szrj }
330*38fd1498Szrj
331*38fd1498Szrj extern __inline __m256i
332*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddubs_epi16(__m256i __X,__m256i __Y)333*38fd1498Szrj _mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
334*38fd1498Szrj {
335*38fd1498Szrj return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
336*38fd1498Szrj (__v32qi)__Y);
337*38fd1498Szrj }
338*38fd1498Szrj
339*38fd1498Szrj extern __inline __m256i
340*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_madd_epi16(__m256i __A,__m256i __B)341*38fd1498Szrj _mm256_madd_epi16 (__m256i __A, __m256i __B)
342*38fd1498Szrj {
343*38fd1498Szrj return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
344*38fd1498Szrj (__v16hi)__B);
345*38fd1498Szrj }
346*38fd1498Szrj
347*38fd1498Szrj extern __inline __m256i
348*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi8(__m256i __A,__m256i __B)349*38fd1498Szrj _mm256_max_epi8 (__m256i __A, __m256i __B)
350*38fd1498Szrj {
351*38fd1498Szrj return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
352*38fd1498Szrj }
353*38fd1498Szrj
354*38fd1498Szrj extern __inline __m256i
355*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi16(__m256i __A,__m256i __B)356*38fd1498Szrj _mm256_max_epi16 (__m256i __A, __m256i __B)
357*38fd1498Szrj {
358*38fd1498Szrj return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
359*38fd1498Szrj }
360*38fd1498Szrj
361*38fd1498Szrj extern __inline __m256i
362*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi32(__m256i __A,__m256i __B)363*38fd1498Szrj _mm256_max_epi32 (__m256i __A, __m256i __B)
364*38fd1498Szrj {
365*38fd1498Szrj return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
366*38fd1498Szrj }
367*38fd1498Szrj
368*38fd1498Szrj extern __inline __m256i
369*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu8(__m256i __A,__m256i __B)370*38fd1498Szrj _mm256_max_epu8 (__m256i __A, __m256i __B)
371*38fd1498Szrj {
372*38fd1498Szrj return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
373*38fd1498Szrj }
374*38fd1498Szrj
375*38fd1498Szrj extern __inline __m256i
376*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu16(__m256i __A,__m256i __B)377*38fd1498Szrj _mm256_max_epu16 (__m256i __A, __m256i __B)
378*38fd1498Szrj {
379*38fd1498Szrj return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
380*38fd1498Szrj }
381*38fd1498Szrj
382*38fd1498Szrj extern __inline __m256i
383*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu32(__m256i __A,__m256i __B)384*38fd1498Szrj _mm256_max_epu32 (__m256i __A, __m256i __B)
385*38fd1498Szrj {
386*38fd1498Szrj return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
387*38fd1498Szrj }
388*38fd1498Szrj
389*38fd1498Szrj extern __inline __m256i
390*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi8(__m256i __A,__m256i __B)391*38fd1498Szrj _mm256_min_epi8 (__m256i __A, __m256i __B)
392*38fd1498Szrj {
393*38fd1498Szrj return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
394*38fd1498Szrj }
395*38fd1498Szrj
396*38fd1498Szrj extern __inline __m256i
397*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi16(__m256i __A,__m256i __B)398*38fd1498Szrj _mm256_min_epi16 (__m256i __A, __m256i __B)
399*38fd1498Szrj {
400*38fd1498Szrj return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
401*38fd1498Szrj }
402*38fd1498Szrj
403*38fd1498Szrj extern __inline __m256i
404*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi32(__m256i __A,__m256i __B)405*38fd1498Szrj _mm256_min_epi32 (__m256i __A, __m256i __B)
406*38fd1498Szrj {
407*38fd1498Szrj return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
408*38fd1498Szrj }
409*38fd1498Szrj
410*38fd1498Szrj extern __inline __m256i
411*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu8(__m256i __A,__m256i __B)412*38fd1498Szrj _mm256_min_epu8 (__m256i __A, __m256i __B)
413*38fd1498Szrj {
414*38fd1498Szrj return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
415*38fd1498Szrj }
416*38fd1498Szrj
417*38fd1498Szrj extern __inline __m256i
418*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu16(__m256i __A,__m256i __B)419*38fd1498Szrj _mm256_min_epu16 (__m256i __A, __m256i __B)
420*38fd1498Szrj {
421*38fd1498Szrj return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
422*38fd1498Szrj }
423*38fd1498Szrj
424*38fd1498Szrj extern __inline __m256i
425*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu32(__m256i __A,__m256i __B)426*38fd1498Szrj _mm256_min_epu32 (__m256i __A, __m256i __B)
427*38fd1498Szrj {
428*38fd1498Szrj return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
429*38fd1498Szrj }
430*38fd1498Szrj
431*38fd1498Szrj extern __inline int
432*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_epi8(__m256i __A)433*38fd1498Szrj _mm256_movemask_epi8 (__m256i __A)
434*38fd1498Szrj {
435*38fd1498Szrj return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
436*38fd1498Szrj }
437*38fd1498Szrj
438*38fd1498Szrj extern __inline __m256i
439*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi16(__m128i __X)440*38fd1498Szrj _mm256_cvtepi8_epi16 (__m128i __X)
441*38fd1498Szrj {
442*38fd1498Szrj return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
443*38fd1498Szrj }
444*38fd1498Szrj
445*38fd1498Szrj extern __inline __m256i
446*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi32(__m128i __X)447*38fd1498Szrj _mm256_cvtepi8_epi32 (__m128i __X)
448*38fd1498Szrj {
449*38fd1498Szrj return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
450*38fd1498Szrj }
451*38fd1498Szrj
452*38fd1498Szrj extern __inline __m256i
453*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi64(__m128i __X)454*38fd1498Szrj _mm256_cvtepi8_epi64 (__m128i __X)
455*38fd1498Szrj {
456*38fd1498Szrj return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
457*38fd1498Szrj }
458*38fd1498Szrj
459*38fd1498Szrj extern __inline __m256i
460*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi32(__m128i __X)461*38fd1498Szrj _mm256_cvtepi16_epi32 (__m128i __X)
462*38fd1498Szrj {
463*38fd1498Szrj return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
464*38fd1498Szrj }
465*38fd1498Szrj
466*38fd1498Szrj extern __inline __m256i
467*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi64(__m128i __X)468*38fd1498Szrj _mm256_cvtepi16_epi64 (__m128i __X)
469*38fd1498Szrj {
470*38fd1498Szrj return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
471*38fd1498Szrj }
472*38fd1498Szrj
473*38fd1498Szrj extern __inline __m256i
474*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_epi64(__m128i __X)475*38fd1498Szrj _mm256_cvtepi32_epi64 (__m128i __X)
476*38fd1498Szrj {
477*38fd1498Szrj return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
478*38fd1498Szrj }
479*38fd1498Szrj
480*38fd1498Szrj extern __inline __m256i
481*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi16(__m128i __X)482*38fd1498Szrj _mm256_cvtepu8_epi16 (__m128i __X)
483*38fd1498Szrj {
484*38fd1498Szrj return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
485*38fd1498Szrj }
486*38fd1498Szrj
487*38fd1498Szrj extern __inline __m256i
488*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi32(__m128i __X)489*38fd1498Szrj _mm256_cvtepu8_epi32 (__m128i __X)
490*38fd1498Szrj {
491*38fd1498Szrj return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
492*38fd1498Szrj }
493*38fd1498Szrj
494*38fd1498Szrj extern __inline __m256i
495*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi64(__m128i __X)496*38fd1498Szrj _mm256_cvtepu8_epi64 (__m128i __X)
497*38fd1498Szrj {
498*38fd1498Szrj return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
499*38fd1498Szrj }
500*38fd1498Szrj
501*38fd1498Szrj extern __inline __m256i
502*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi32(__m128i __X)503*38fd1498Szrj _mm256_cvtepu16_epi32 (__m128i __X)
504*38fd1498Szrj {
505*38fd1498Szrj return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
506*38fd1498Szrj }
507*38fd1498Szrj
508*38fd1498Szrj extern __inline __m256i
509*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi64(__m128i __X)510*38fd1498Szrj _mm256_cvtepu16_epi64 (__m128i __X)
511*38fd1498Szrj {
512*38fd1498Szrj return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
513*38fd1498Szrj }
514*38fd1498Szrj
515*38fd1498Szrj extern __inline __m256i
516*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu32_epi64(__m128i __X)517*38fd1498Szrj _mm256_cvtepu32_epi64 (__m128i __X)
518*38fd1498Szrj {
519*38fd1498Szrj return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
520*38fd1498Szrj }
521*38fd1498Szrj
522*38fd1498Szrj extern __inline __m256i
523*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epi32(__m256i __X,__m256i __Y)524*38fd1498Szrj _mm256_mul_epi32 (__m256i __X, __m256i __Y)
525*38fd1498Szrj {
526*38fd1498Szrj return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
527*38fd1498Szrj }
528*38fd1498Szrj
529*38fd1498Szrj extern __inline __m256i
530*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhrs_epi16(__m256i __X,__m256i __Y)531*38fd1498Szrj _mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
532*38fd1498Szrj {
533*38fd1498Szrj return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
534*38fd1498Szrj (__v16hi)__Y);
535*38fd1498Szrj }
536*38fd1498Szrj
537*38fd1498Szrj extern __inline __m256i
538*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epu16(__m256i __A,__m256i __B)539*38fd1498Szrj _mm256_mulhi_epu16 (__m256i __A, __m256i __B)
540*38fd1498Szrj {
541*38fd1498Szrj return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
542*38fd1498Szrj }
543*38fd1498Szrj
544*38fd1498Szrj extern __inline __m256i
545*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epi16(__m256i __A,__m256i __B)546*38fd1498Szrj _mm256_mulhi_epi16 (__m256i __A, __m256i __B)
547*38fd1498Szrj {
548*38fd1498Szrj return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
549*38fd1498Szrj }
550*38fd1498Szrj
551*38fd1498Szrj extern __inline __m256i
552*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi16(__m256i __A,__m256i __B)553*38fd1498Szrj _mm256_mullo_epi16 (__m256i __A, __m256i __B)
554*38fd1498Szrj {
555*38fd1498Szrj return (__m256i) ((__v16hu)__A * (__v16hu)__B);
556*38fd1498Szrj }
557*38fd1498Szrj
558*38fd1498Szrj extern __inline __m256i
559*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi32(__m256i __A,__m256i __B)560*38fd1498Szrj _mm256_mullo_epi32 (__m256i __A, __m256i __B)
561*38fd1498Szrj {
562*38fd1498Szrj return (__m256i) ((__v8su)__A * (__v8su)__B);
563*38fd1498Szrj }
564*38fd1498Szrj
565*38fd1498Szrj extern __inline __m256i
566*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epu32(__m256i __A,__m256i __B)567*38fd1498Szrj _mm256_mul_epu32 (__m256i __A, __m256i __B)
568*38fd1498Szrj {
569*38fd1498Szrj return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
570*38fd1498Szrj }
571*38fd1498Szrj
572*38fd1498Szrj extern __inline __m256i
573*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_si256(__m256i __A,__m256i __B)574*38fd1498Szrj _mm256_or_si256 (__m256i __A, __m256i __B)
575*38fd1498Szrj {
576*38fd1498Szrj return (__m256i) ((__v4du)__A | (__v4du)__B);
577*38fd1498Szrj }
578*38fd1498Szrj
579*38fd1498Szrj extern __inline __m256i
580*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sad_epu8(__m256i __A,__m256i __B)581*38fd1498Szrj _mm256_sad_epu8 (__m256i __A, __m256i __B)
582*38fd1498Szrj {
583*38fd1498Szrj return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
584*38fd1498Szrj }
585*38fd1498Szrj
586*38fd1498Szrj extern __inline __m256i
587*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi8(__m256i __X,__m256i __Y)588*38fd1498Szrj _mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
589*38fd1498Szrj {
590*38fd1498Szrj return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
591*38fd1498Szrj (__v32qi)__Y);
592*38fd1498Szrj }
593*38fd1498Szrj
#ifdef __OPTIMIZE__
/* Apply __builtin_ia32_pshufd256 to __A (as a __v8si vector), passing
   __mask through unchanged.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi32 (__m256i __A, const int __mask)
{
  __v8si __src = (__v8si) __A;

  return (__m256i) __builtin_ia32_pshufd256 (__src, __mask);
}

/* Apply __builtin_ia32_pshufhw256 to __A (as a __v16hi vector), passing
   __mask through unchanged.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
{
  __v16hi __src = (__v16hi) __A;

  return (__m256i) __builtin_ia32_pshufhw256 (__src, __mask);
}

/* Apply __builtin_ia32_pshuflw256 to __A (as a __v16hi vector), passing
   __mask through unchanged.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
{
  __v16hi __src = (__v16hi) __A;

  return (__m256i) __builtin_ia32_pshuflw256 (__src, __mask);
}
#else
/* Macro forms of the same three builtin calls, used when __OPTIMIZE__
   is not defined.  */
#define _mm256_shuffle_epi32(A, N) \
  ((__m256i) __builtin_ia32_pshufd256 ( \
     (__v8si)(__m256i)(A), \
     (int)(N)))
#define _mm256_shufflehi_epi16(A, N) \
  ((__m256i) __builtin_ia32_pshufhw256 ( \
     (__v16hi)(__m256i)(A), \
     (int)(N)))
#define _mm256_shufflelo_epi16(A, N) \
  ((__m256i) __builtin_ia32_pshuflw256 ( \
     (__v16hi)(__m256i)(A), \
     (int)(N)))
#endif
623*38fd1498Szrj
624*38fd1498Szrj extern __inline __m256i
625*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi8(__m256i __X,__m256i __Y)626*38fd1498Szrj _mm256_sign_epi8 (__m256i __X, __m256i __Y)
627*38fd1498Szrj {
628*38fd1498Szrj return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
629*38fd1498Szrj }
630*38fd1498Szrj
631*38fd1498Szrj extern __inline __m256i
632*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi16(__m256i __X,__m256i __Y)633*38fd1498Szrj _mm256_sign_epi16 (__m256i __X, __m256i __Y)
634*38fd1498Szrj {
635*38fd1498Szrj return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
636*38fd1498Szrj }
637*38fd1498Szrj
638*38fd1498Szrj extern __inline __m256i
639*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi32(__m256i __X,__m256i __Y)640*38fd1498Szrj _mm256_sign_epi32 (__m256i __X, __m256i __Y)
641*38fd1498Szrj {
642*38fd1498Szrj return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
643*38fd1498Szrj }
644*38fd1498Szrj
/* Two names for the same operation: both hand __A and __N * 8 to the
   pslldqi256 builtin (the scaling by 8 presumably converts a byte count
   to a bit count -- confirm).  Inline functions are used when
   __OPTIMIZE__ is defined, macros otherwise.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bslli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}
#else
#define _mm256_bslli_epi128(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_slli_si256(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#endif

666*38fd1498Szrj extern __inline __m256i
667*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi16(__m256i __A,int __B)668*38fd1498Szrj _mm256_slli_epi16 (__m256i __A, int __B)
669*38fd1498Szrj {
670*38fd1498Szrj return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
671*38fd1498Szrj }
672*38fd1498Szrj
673*38fd1498Szrj extern __inline __m256i
674*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi16(__m256i __A,__m128i __B)675*38fd1498Szrj _mm256_sll_epi16 (__m256i __A, __m128i __B)
676*38fd1498Szrj {
677*38fd1498Szrj return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
678*38fd1498Szrj }
679*38fd1498Szrj
680*38fd1498Szrj extern __inline __m256i
681*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi32(__m256i __A,int __B)682*38fd1498Szrj _mm256_slli_epi32 (__m256i __A, int __B)
683*38fd1498Szrj {
684*38fd1498Szrj return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
685*38fd1498Szrj }
686*38fd1498Szrj
687*38fd1498Szrj extern __inline __m256i
688*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi32(__m256i __A,__m128i __B)689*38fd1498Szrj _mm256_sll_epi32 (__m256i __A, __m128i __B)
690*38fd1498Szrj {
691*38fd1498Szrj return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
692*38fd1498Szrj }
693*38fd1498Szrj
694*38fd1498Szrj extern __inline __m256i
695*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi64(__m256i __A,int __B)696*38fd1498Szrj _mm256_slli_epi64 (__m256i __A, int __B)
697*38fd1498Szrj {
698*38fd1498Szrj return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
699*38fd1498Szrj }
700*38fd1498Szrj
701*38fd1498Szrj extern __inline __m256i
702*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi64(__m256i __A,__m128i __B)703*38fd1498Szrj _mm256_sll_epi64 (__m256i __A, __m128i __B)
704*38fd1498Szrj {
705*38fd1498Szrj return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
706*38fd1498Szrj }
707*38fd1498Szrj
708*38fd1498Szrj extern __inline __m256i
709*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi16(__m256i __A,int __B)710*38fd1498Szrj _mm256_srai_epi16 (__m256i __A, int __B)
711*38fd1498Szrj {
712*38fd1498Szrj return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
713*38fd1498Szrj }
714*38fd1498Szrj
715*38fd1498Szrj extern __inline __m256i
716*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi16(__m256i __A,__m128i __B)717*38fd1498Szrj _mm256_sra_epi16 (__m256i __A, __m128i __B)
718*38fd1498Szrj {
719*38fd1498Szrj return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
720*38fd1498Szrj }
721*38fd1498Szrj
722*38fd1498Szrj extern __inline __m256i
723*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi32(__m256i __A,int __B)724*38fd1498Szrj _mm256_srai_epi32 (__m256i __A, int __B)
725*38fd1498Szrj {
726*38fd1498Szrj return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
727*38fd1498Szrj }
728*38fd1498Szrj
729*38fd1498Szrj extern __inline __m256i
730*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi32(__m256i __A,__m128i __B)731*38fd1498Szrj _mm256_sra_epi32 (__m256i __A, __m128i __B)
732*38fd1498Szrj {
733*38fd1498Szrj return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
734*38fd1498Szrj }
735*38fd1498Szrj
/* Two names for the same operation: both hand __A and __N * 8 to the
   psrldqi256 builtin (the scaling by 8 presumably converts a byte count
   to a bit count -- confirm).  Inline functions are used when
   __OPTIMIZE__ is defined, macros otherwise.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bsrli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}
#else
#define _mm256_bsrli_epi128(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_srli_si256(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#endif

757*38fd1498Szrj extern __inline __m256i
758*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi16(__m256i __A,int __B)759*38fd1498Szrj _mm256_srli_epi16 (__m256i __A, int __B)
760*38fd1498Szrj {
761*38fd1498Szrj return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
762*38fd1498Szrj }
763*38fd1498Szrj
764*38fd1498Szrj extern __inline __m256i
765*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi16(__m256i __A,__m128i __B)766*38fd1498Szrj _mm256_srl_epi16 (__m256i __A, __m128i __B)
767*38fd1498Szrj {
768*38fd1498Szrj return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
769*38fd1498Szrj }
770*38fd1498Szrj
771*38fd1498Szrj extern __inline __m256i
772*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi32(__m256i __A,int __B)773*38fd1498Szrj _mm256_srli_epi32 (__m256i __A, int __B)
774*38fd1498Szrj {
775*38fd1498Szrj return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
776*38fd1498Szrj }
777*38fd1498Szrj
778*38fd1498Szrj extern __inline __m256i
779*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi32(__m256i __A,__m128i __B)780*38fd1498Szrj _mm256_srl_epi32 (__m256i __A, __m128i __B)
781*38fd1498Szrj {
782*38fd1498Szrj return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
783*38fd1498Szrj }
784*38fd1498Szrj
785*38fd1498Szrj extern __inline __m256i
786*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi64(__m256i __A,int __B)787*38fd1498Szrj _mm256_srli_epi64 (__m256i __A, int __B)
788*38fd1498Szrj {
789*38fd1498Szrj return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
790*38fd1498Szrj }
791*38fd1498Szrj
792*38fd1498Szrj extern __inline __m256i
793*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi64(__m256i __A,__m128i __B)794*38fd1498Szrj _mm256_srl_epi64 (__m256i __A, __m128i __B)
795*38fd1498Szrj {
796*38fd1498Szrj return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
797*38fd1498Szrj }
798*38fd1498Szrj
799*38fd1498Szrj extern __inline __m256i
800*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi8(__m256i __A,__m256i __B)801*38fd1498Szrj _mm256_sub_epi8 (__m256i __A, __m256i __B)
802*38fd1498Szrj {
803*38fd1498Szrj return (__m256i) ((__v32qu)__A - (__v32qu)__B);
804*38fd1498Szrj }
805*38fd1498Szrj
806*38fd1498Szrj extern __inline __m256i
807*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi16(__m256i __A,__m256i __B)808*38fd1498Szrj _mm256_sub_epi16 (__m256i __A, __m256i __B)
809*38fd1498Szrj {
810*38fd1498Szrj return (__m256i) ((__v16hu)__A - (__v16hu)__B);
811*38fd1498Szrj }
812*38fd1498Szrj
813*38fd1498Szrj extern __inline __m256i
814*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi32(__m256i __A,__m256i __B)815*38fd1498Szrj _mm256_sub_epi32 (__m256i __A, __m256i __B)
816*38fd1498Szrj {
817*38fd1498Szrj return (__m256i) ((__v8su)__A - (__v8su)__B);
818*38fd1498Szrj }
819*38fd1498Szrj
820*38fd1498Szrj extern __inline __m256i
821*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi64(__m256i __A,__m256i __B)822*38fd1498Szrj _mm256_sub_epi64 (__m256i __A, __m256i __B)
823*38fd1498Szrj {
824*38fd1498Szrj return (__m256i) ((__v4du)__A - (__v4du)__B);
825*38fd1498Szrj }
826*38fd1498Szrj
827*38fd1498Szrj extern __inline __m256i
828*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi8(__m256i __A,__m256i __B)829*38fd1498Szrj _mm256_subs_epi8 (__m256i __A, __m256i __B)
830*38fd1498Szrj {
831*38fd1498Szrj return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
832*38fd1498Szrj }
833*38fd1498Szrj
834*38fd1498Szrj extern __inline __m256i
835*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi16(__m256i __A,__m256i __B)836*38fd1498Szrj _mm256_subs_epi16 (__m256i __A, __m256i __B)
837*38fd1498Szrj {
838*38fd1498Szrj return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
839*38fd1498Szrj }
840*38fd1498Szrj
841*38fd1498Szrj extern __inline __m256i
842*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu8(__m256i __A,__m256i __B)843*38fd1498Szrj _mm256_subs_epu8 (__m256i __A, __m256i __B)
844*38fd1498Szrj {
845*38fd1498Szrj return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
846*38fd1498Szrj }
847*38fd1498Szrj
848*38fd1498Szrj extern __inline __m256i
849*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu16(__m256i __A,__m256i __B)850*38fd1498Szrj _mm256_subs_epu16 (__m256i __A, __m256i __B)
851*38fd1498Szrj {
852*38fd1498Szrj return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
853*38fd1498Szrj }
854*38fd1498Szrj
855*38fd1498Szrj extern __inline __m256i
856*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi8(__m256i __A,__m256i __B)857*38fd1498Szrj _mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
858*38fd1498Szrj {
859*38fd1498Szrj return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
860*38fd1498Szrj }
861*38fd1498Szrj
862*38fd1498Szrj extern __inline __m256i
863*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi16(__m256i __A,__m256i __B)864*38fd1498Szrj _mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
865*38fd1498Szrj {
866*38fd1498Szrj return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
867*38fd1498Szrj }
868*38fd1498Szrj
869*38fd1498Szrj extern __inline __m256i
870*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi32(__m256i __A,__m256i __B)871*38fd1498Szrj _mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
872*38fd1498Szrj {
873*38fd1498Szrj return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
874*38fd1498Szrj }
875*38fd1498Szrj
876*38fd1498Szrj extern __inline __m256i
877*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi64(__m256i __A,__m256i __B)878*38fd1498Szrj _mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
879*38fd1498Szrj {
880*38fd1498Szrj return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
881*38fd1498Szrj }
882*38fd1498Szrj
883*38fd1498Szrj extern __inline __m256i
884*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi8(__m256i __A,__m256i __B)885*38fd1498Szrj _mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
886*38fd1498Szrj {
887*38fd1498Szrj return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
888*38fd1498Szrj }
889*38fd1498Szrj
890*38fd1498Szrj extern __inline __m256i
891*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi16(__m256i __A,__m256i __B)892*38fd1498Szrj _mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
893*38fd1498Szrj {
894*38fd1498Szrj return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
895*38fd1498Szrj }
896*38fd1498Szrj
897*38fd1498Szrj extern __inline __m256i
898*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi32(__m256i __A,__m256i __B)899*38fd1498Szrj _mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
900*38fd1498Szrj {
901*38fd1498Szrj return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
902*38fd1498Szrj }
903*38fd1498Szrj
904*38fd1498Szrj extern __inline __m256i
905*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi64(__m256i __A,__m256i __B)906*38fd1498Szrj _mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
907*38fd1498Szrj {
908*38fd1498Szrj return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
909*38fd1498Szrj }
910*38fd1498Szrj
911*38fd1498Szrj extern __inline __m256i
912*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_si256(__m256i __A,__m256i __B)913*38fd1498Szrj _mm256_xor_si256 (__m256i __A, __m256i __B)
914*38fd1498Szrj {
915*38fd1498Szrj return (__m256i) ((__v4du)__A ^ (__v4du)__B);
916*38fd1498Szrj }
917*38fd1498Szrj
918*38fd1498Szrj extern __inline __m256i
919*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_load_si256(__m256i const * __X)920*38fd1498Szrj _mm256_stream_load_si256 (__m256i const *__X)
921*38fd1498Szrj {
922*38fd1498Szrj return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
923*38fd1498Szrj }
924*38fd1498Szrj
925*38fd1498Szrj extern __inline __m128
926*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastss_ps(__m128 __X)927*38fd1498Szrj _mm_broadcastss_ps (__m128 __X)
928*38fd1498Szrj {
929*38fd1498Szrj return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
930*38fd1498Szrj }
931*38fd1498Szrj
932*38fd1498Szrj extern __inline __m256
933*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastss_ps(__m128 __X)934*38fd1498Szrj _mm256_broadcastss_ps (__m128 __X)
935*38fd1498Szrj {
936*38fd1498Szrj return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
937*38fd1498Szrj }
938*38fd1498Szrj
939*38fd1498Szrj extern __inline __m256d
940*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsd_pd(__m128d __X)941*38fd1498Szrj _mm256_broadcastsd_pd (__m128d __X)
942*38fd1498Szrj {
943*38fd1498Szrj return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
944*38fd1498Szrj }
945*38fd1498Szrj
946*38fd1498Szrj extern __inline __m256i
947*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsi128_si256(__m128i __X)948*38fd1498Szrj _mm256_broadcastsi128_si256 (__m128i __X)
949*38fd1498Szrj {
950*38fd1498Szrj return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
951*38fd1498Szrj }
952*38fd1498Szrj
/* Hand __X and __Y, both viewed as __v4si, and the immediate __M to the
   pblendd128 builtin.  An inline function is used when __OPTIMIZE__ is
   defined, an equivalent macro otherwise.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
                                              (__v4si)__Y,
                                              __M);
}
#else
#define _mm_blend_epi32(X, Y, M) \
  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X), \
                                        (__v4si)(__m128i)(Y), (int)(M)))
#endif

/* Hand __X and __Y, both viewed as __v8si, and the immediate __M to the
   pblendd256 builtin.  An inline function is used when __OPTIMIZE__ is
   defined, an equivalent macro otherwise.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
                                              (__v8si)__Y,
                                              __M);
}
#else
#define _mm256_blend_epi32(X, Y, M) \
  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X), \
                                        (__v8si)(__m256i)(Y), (int)(M)))
#endif

983*38fd1498Szrj extern __inline __m256i
984*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastb_epi8(__m128i __X)985*38fd1498Szrj _mm256_broadcastb_epi8 (__m128i __X)
986*38fd1498Szrj {
987*38fd1498Szrj return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
988*38fd1498Szrj }
989*38fd1498Szrj
990*38fd1498Szrj extern __inline __m256i
991*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastw_epi16(__m128i __X)992*38fd1498Szrj _mm256_broadcastw_epi16 (__m128i __X)
993*38fd1498Szrj {
994*38fd1498Szrj return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
995*38fd1498Szrj }
996*38fd1498Szrj
997*38fd1498Szrj extern __inline __m256i
998*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastd_epi32(__m128i __X)999*38fd1498Szrj _mm256_broadcastd_epi32 (__m128i __X)
1000*38fd1498Szrj {
1001*38fd1498Szrj return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
1002*38fd1498Szrj }
1003*38fd1498Szrj
1004*38fd1498Szrj extern __inline __m256i
1005*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastq_epi64(__m128i __X)1006*38fd1498Szrj _mm256_broadcastq_epi64 (__m128i __X)
1007*38fd1498Szrj {
1008*38fd1498Szrj return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
1009*38fd1498Szrj }
1010*38fd1498Szrj
1011*38fd1498Szrj extern __inline __m128i
1012*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastb_epi8(__m128i __X)1013*38fd1498Szrj _mm_broadcastb_epi8 (__m128i __X)
1014*38fd1498Szrj {
1015*38fd1498Szrj return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
1016*38fd1498Szrj }
1017*38fd1498Szrj
1018*38fd1498Szrj extern __inline __m128i
1019*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastw_epi16(__m128i __X)1020*38fd1498Szrj _mm_broadcastw_epi16 (__m128i __X)
1021*38fd1498Szrj {
1022*38fd1498Szrj return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
1023*38fd1498Szrj }
1024*38fd1498Szrj
1025*38fd1498Szrj extern __inline __m128i
1026*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastd_epi32(__m128i __X)1027*38fd1498Szrj _mm_broadcastd_epi32 (__m128i __X)
1028*38fd1498Szrj {
1029*38fd1498Szrj return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
1030*38fd1498Szrj }
1031*38fd1498Szrj
1032*38fd1498Szrj extern __inline __m128i
1033*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastq_epi64(__m128i __X)1034*38fd1498Szrj _mm_broadcastq_epi64 (__m128i __X)
1035*38fd1498Szrj {
1036*38fd1498Szrj return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
1037*38fd1498Szrj }
1038*38fd1498Szrj
1039*38fd1498Szrj extern __inline __m256i
1040*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_epi32(__m256i __X,__m256i __Y)1041*38fd1498Szrj _mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
1042*38fd1498Szrj {
1043*38fd1498Szrj return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
1044*38fd1498Szrj }
1045*38fd1498Szrj
/* Hand __X, viewed as __v4df, and the immediate __M to the permdf256
   builtin.  An inline function is used when __OPTIMIZE__ is defined, an
   equivalent macro otherwise.  */
#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_pd (__m256d __X, const int __M)
{
  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
}
#else
#define _mm256_permute4x64_pd(X, M) \
  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
#endif

1058*38fd1498Szrj extern __inline __m256
1059*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_ps(__m256 __X,__m256i __Y)1060*38fd1498Szrj _mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
1061*38fd1498Szrj {
1062*38fd1498Szrj return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
1063*38fd1498Szrj }
1064*38fd1498Szrj
/* Hand __X, viewed as __v4di, and the immediate __M to the permdi256
   builtin.  An inline function is used when __OPTIMIZE__ is defined, an
   equivalent macro otherwise.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_epi64 (__m256i __X, const int __M)
{
  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
}
#else
#define _mm256_permute4x64_epi64(X, M) \
  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
#endif


/* Hand __X and __Y, both viewed as __v4di, and the immediate __M to the
   permti256 builtin.  An inline function is used when __OPTIMIZE__ is
   defined, an equivalent macro otherwise.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
}
#else
#define _mm256_permute2x128_si256(X, Y, M) \
  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
#endif

/* Hand __X, viewed as __v4di, and the immediate __M to the
   extract128i256 builtin; the result is 128 bits wide.  An inline
   function is used when __OPTIMIZE__ is defined, an equivalent macro
   otherwise.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
}
#else
#define _mm256_extracti128_si256(X, M) \
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

/* Hand the 256-bit __X (as __v4di), the 128-bit __Y (as __v2di) and the
   immediate __M to the insert128i256 builtin.  An inline function is
   used when __OPTIMIZE__ is defined, an equivalent macro otherwise.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M) \
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
                                           (__v2di)(__m128i)(Y), \
                                           (int)(M)))
#endif

1116*38fd1498Szrj extern __inline __m256i
1117*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi32(int const * __X,__m256i __M)1118*38fd1498Szrj _mm256_maskload_epi32 (int const *__X, __m256i __M )
1119*38fd1498Szrj {
1120*38fd1498Szrj return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
1121*38fd1498Szrj (__v8si)__M);
1122*38fd1498Szrj }
1123*38fd1498Szrj
1124*38fd1498Szrj extern __inline __m256i
1125*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi64(long long const * __X,__m256i __M)1126*38fd1498Szrj _mm256_maskload_epi64 (long long const *__X, __m256i __M )
1127*38fd1498Szrj {
1128*38fd1498Szrj return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
1129*38fd1498Szrj (__v4di)__M);
1130*38fd1498Szrj }
1131*38fd1498Szrj
1132*38fd1498Szrj extern __inline __m128i
1133*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi32(int const * __X,__m128i __M)1134*38fd1498Szrj _mm_maskload_epi32 (int const *__X, __m128i __M )
1135*38fd1498Szrj {
1136*38fd1498Szrj return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
1137*38fd1498Szrj (__v4si)__M);
1138*38fd1498Szrj }
1139*38fd1498Szrj
1140*38fd1498Szrj extern __inline __m128i
1141*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi64(long long const * __X,__m128i __M)1142*38fd1498Szrj _mm_maskload_epi64 (long long const *__X, __m128i __M )
1143*38fd1498Szrj {
1144*38fd1498Szrj return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
1145*38fd1498Szrj (__v2di)__M);
1146*38fd1498Szrj }
1147*38fd1498Szrj
1148*38fd1498Szrj extern __inline void
1149*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi32(int * __X,__m256i __M,__m256i __Y)1150*38fd1498Szrj _mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
1151*38fd1498Szrj {
1152*38fd1498Szrj __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
1153*38fd1498Szrj }
1154*38fd1498Szrj
1155*38fd1498Szrj extern __inline void
1156*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi64(long long * __X,__m256i __M,__m256i __Y)1157*38fd1498Szrj _mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
1158*38fd1498Szrj {
1159*38fd1498Szrj __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
1160*38fd1498Szrj }
1161*38fd1498Szrj
1162*38fd1498Szrj extern __inline void
1163*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi32(int * __X,__m128i __M,__m128i __Y)1164*38fd1498Szrj _mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
1165*38fd1498Szrj {
1166*38fd1498Szrj __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
1167*38fd1498Szrj }
1168*38fd1498Szrj
1169*38fd1498Szrj extern __inline void
1170*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi64(long long * __X,__m128i __M,__m128i __Y)1171*38fd1498Szrj _mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
1172*38fd1498Szrj {
1173*38fd1498Szrj __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
1174*38fd1498Szrj }
1175*38fd1498Szrj
1176*38fd1498Szrj extern __inline __m256i
1177*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi32(__m256i __X,__m256i __Y)1178*38fd1498Szrj _mm256_sllv_epi32 (__m256i __X, __m256i __Y)
1179*38fd1498Szrj {
1180*38fd1498Szrj return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
1181*38fd1498Szrj }
1182*38fd1498Szrj
1183*38fd1498Szrj extern __inline __m128i
1184*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi32(__m128i __X,__m128i __Y)1185*38fd1498Szrj _mm_sllv_epi32 (__m128i __X, __m128i __Y)
1186*38fd1498Szrj {
1187*38fd1498Szrj return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
1188*38fd1498Szrj }
1189*38fd1498Szrj
1190*38fd1498Szrj extern __inline __m256i
1191*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi64(__m256i __X,__m256i __Y)1192*38fd1498Szrj _mm256_sllv_epi64 (__m256i __X, __m256i __Y)
1193*38fd1498Szrj {
1194*38fd1498Szrj return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
1195*38fd1498Szrj }
1196*38fd1498Szrj
1197*38fd1498Szrj extern __inline __m128i
1198*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi64(__m128i __X,__m128i __Y)1199*38fd1498Szrj _mm_sllv_epi64 (__m128i __X, __m128i __Y)
1200*38fd1498Szrj {
1201*38fd1498Szrj return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
1202*38fd1498Szrj }
1203*38fd1498Szrj
1204*38fd1498Szrj extern __inline __m256i
1205*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srav_epi32(__m256i __X,__m256i __Y)1206*38fd1498Szrj _mm256_srav_epi32 (__m256i __X, __m256i __Y)
1207*38fd1498Szrj {
1208*38fd1498Szrj return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
1209*38fd1498Szrj }
1210*38fd1498Szrj
1211*38fd1498Szrj extern __inline __m128i
1212*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srav_epi32(__m128i __X,__m128i __Y)1213*38fd1498Szrj _mm_srav_epi32 (__m128i __X, __m128i __Y)
1214*38fd1498Szrj {
1215*38fd1498Szrj return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
1216*38fd1498Szrj }
1217*38fd1498Szrj
1218*38fd1498Szrj extern __inline __m256i
1219*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi32(__m256i __X,__m256i __Y)1220*38fd1498Szrj _mm256_srlv_epi32 (__m256i __X, __m256i __Y)
1221*38fd1498Szrj {
1222*38fd1498Szrj return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
1223*38fd1498Szrj }
1224*38fd1498Szrj
1225*38fd1498Szrj extern __inline __m128i
1226*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi32(__m128i __X,__m128i __Y)1227*38fd1498Szrj _mm_srlv_epi32 (__m128i __X, __m128i __Y)
1228*38fd1498Szrj {
1229*38fd1498Szrj return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
1230*38fd1498Szrj }
1231*38fd1498Szrj
1232*38fd1498Szrj extern __inline __m256i
1233*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi64(__m256i __X,__m256i __Y)1234*38fd1498Szrj _mm256_srlv_epi64 (__m256i __X, __m256i __Y)
1235*38fd1498Szrj {
1236*38fd1498Szrj return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
1237*38fd1498Szrj }
1238*38fd1498Szrj
1239*38fd1498Szrj extern __inline __m128i
1240*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi64(__m128i __X,__m128i __Y)1241*38fd1498Szrj _mm_srlv_epi64 (__m128i __X, __m128i __Y)
1242*38fd1498Szrj {
1243*38fd1498Szrj return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
1244*38fd1498Szrj }
1245*38fd1498Szrj
1246*38fd1498Szrj #ifdef __OPTIMIZE__
1247*38fd1498Szrj extern __inline __m128d
1248*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_pd(double const * __base,__m128i __index,const int __scale)1249*38fd1498Szrj _mm_i32gather_pd (double const *__base, __m128i __index, const int __scale)
1250*38fd1498Szrj {
1251*38fd1498Szrj __v2df __zero = _mm_setzero_pd ();
1252*38fd1498Szrj __v2df __mask = _mm_cmpeq_pd (__zero, __zero);
1253*38fd1498Szrj
1254*38fd1498Szrj return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
1255*38fd1498Szrj __base,
1256*38fd1498Szrj (__v4si)__index,
1257*38fd1498Szrj __mask,
1258*38fd1498Szrj __scale);
1259*38fd1498Szrj }
1260*38fd1498Szrj
1261*38fd1498Szrj extern __inline __m128d
1262*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_pd(__m128d __src,double const * __base,__m128i __index,__m128d __mask,const int __scale)1263*38fd1498Szrj _mm_mask_i32gather_pd (__m128d __src, double const *__base, __m128i __index,
1264*38fd1498Szrj __m128d __mask, const int __scale)
1265*38fd1498Szrj {
1266*38fd1498Szrj return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)__src,
1267*38fd1498Szrj __base,
1268*38fd1498Szrj (__v4si)__index,
1269*38fd1498Szrj (__v2df)__mask,
1270*38fd1498Szrj __scale);
1271*38fd1498Szrj }
1272*38fd1498Szrj
1273*38fd1498Szrj extern __inline __m256d
1274*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_pd(double const * __base,__m128i __index,const int __scale)1275*38fd1498Szrj _mm256_i32gather_pd (double const *__base, __m128i __index, const int __scale)
1276*38fd1498Szrj {
1277*38fd1498Szrj __v4df __zero = _mm256_setzero_pd ();
1278*38fd1498Szrj __v4df __mask = _mm256_cmp_pd (__zero, __zero, _CMP_EQ_OQ);
1279*38fd1498Szrj
1280*38fd1498Szrj return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
1281*38fd1498Szrj __base,
1282*38fd1498Szrj (__v4si)__index,
1283*38fd1498Szrj __mask,
1284*38fd1498Szrj __scale);
1285*38fd1498Szrj }
1286*38fd1498Szrj
1287*38fd1498Szrj extern __inline __m256d
1288*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_pd(__m256d __src,double const * __base,__m128i __index,__m256d __mask,const int __scale)1289*38fd1498Szrj _mm256_mask_i32gather_pd (__m256d __src, double const *__base,
1290*38fd1498Szrj __m128i __index, __m256d __mask, const int __scale)
1291*38fd1498Szrj {
1292*38fd1498Szrj return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)__src,
1293*38fd1498Szrj __base,
1294*38fd1498Szrj (__v4si)__index,
1295*38fd1498Szrj (__v4df)__mask,
1296*38fd1498Szrj __scale);
1297*38fd1498Szrj }
1298*38fd1498Szrj
1299*38fd1498Szrj extern __inline __m128d
1300*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_pd(double const * __base,__m128i __index,const int __scale)1301*38fd1498Szrj _mm_i64gather_pd (double const *__base, __m128i __index, const int __scale)
1302*38fd1498Szrj {
1303*38fd1498Szrj __v2df __src = _mm_setzero_pd ();
1304*38fd1498Szrj __v2df __mask = _mm_cmpeq_pd (__src, __src);
1305*38fd1498Szrj
1306*38fd1498Szrj return (__m128d) __builtin_ia32_gatherdiv2df (__src,
1307*38fd1498Szrj __base,
1308*38fd1498Szrj (__v2di)__index,
1309*38fd1498Szrj __mask,
1310*38fd1498Szrj __scale);
1311*38fd1498Szrj }
1312*38fd1498Szrj
1313*38fd1498Szrj extern __inline __m128d
1314*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_pd(__m128d __src,double const * __base,__m128i __index,__m128d __mask,const int __scale)1315*38fd1498Szrj _mm_mask_i64gather_pd (__m128d __src, double const *__base, __m128i __index,
1316*38fd1498Szrj __m128d __mask, const int __scale)
1317*38fd1498Szrj {
1318*38fd1498Szrj return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)__src,
1319*38fd1498Szrj __base,
1320*38fd1498Szrj (__v2di)__index,
1321*38fd1498Szrj (__v2df)__mask,
1322*38fd1498Szrj __scale);
1323*38fd1498Szrj }
1324*38fd1498Szrj
1325*38fd1498Szrj extern __inline __m256d
1326*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_pd(double const * __base,__m256i __index,const int __scale)1327*38fd1498Szrj _mm256_i64gather_pd (double const *__base, __m256i __index, const int __scale)
1328*38fd1498Szrj {
1329*38fd1498Szrj __v4df __src = _mm256_setzero_pd ();
1330*38fd1498Szrj __v4df __mask = _mm256_cmp_pd (__src, __src, _CMP_EQ_OQ);
1331*38fd1498Szrj
1332*38fd1498Szrj return (__m256d) __builtin_ia32_gatherdiv4df (__src,
1333*38fd1498Szrj __base,
1334*38fd1498Szrj (__v4di)__index,
1335*38fd1498Szrj __mask,
1336*38fd1498Szrj __scale);
1337*38fd1498Szrj }
1338*38fd1498Szrj
1339*38fd1498Szrj extern __inline __m256d
1340*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_pd(__m256d __src,double const * __base,__m256i __index,__m256d __mask,const int __scale)1341*38fd1498Szrj _mm256_mask_i64gather_pd (__m256d __src, double const *__base,
1342*38fd1498Szrj __m256i __index, __m256d __mask, const int __scale)
1343*38fd1498Szrj {
1344*38fd1498Szrj return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)__src,
1345*38fd1498Szrj __base,
1346*38fd1498Szrj (__v4di)__index,
1347*38fd1498Szrj (__v4df)__mask,
1348*38fd1498Szrj __scale);
1349*38fd1498Szrj }
1350*38fd1498Szrj
1351*38fd1498Szrj extern __inline __m128
1352*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_ps(float const * __base,__m128i __index,const int __scale)1353*38fd1498Szrj _mm_i32gather_ps (float const *__base, __m128i __index, const int __scale)
1354*38fd1498Szrj {
1355*38fd1498Szrj __v4sf __src = _mm_setzero_ps ();
1356*38fd1498Szrj __v4sf __mask = _mm_cmpeq_ps (__src, __src);
1357*38fd1498Szrj
1358*38fd1498Szrj return (__m128) __builtin_ia32_gathersiv4sf (__src,
1359*38fd1498Szrj __base,
1360*38fd1498Szrj (__v4si)__index,
1361*38fd1498Szrj __mask,
1362*38fd1498Szrj __scale);
1363*38fd1498Szrj }
1364*38fd1498Szrj
1365*38fd1498Szrj extern __inline __m128
1366*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_ps(__m128 __src,float const * __base,__m128i __index,__m128 __mask,const int __scale)1367*38fd1498Szrj _mm_mask_i32gather_ps (__m128 __src, float const *__base, __m128i __index,
1368*38fd1498Szrj __m128 __mask, const int __scale)
1369*38fd1498Szrj {
1370*38fd1498Szrj return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)__src,
1371*38fd1498Szrj __base,
1372*38fd1498Szrj (__v4si)__index,
1373*38fd1498Szrj (__v4sf)__mask,
1374*38fd1498Szrj __scale);
1375*38fd1498Szrj }
1376*38fd1498Szrj
1377*38fd1498Szrj extern __inline __m256
1378*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_ps(float const * __base,__m256i __index,const int __scale)1379*38fd1498Szrj _mm256_i32gather_ps (float const *__base, __m256i __index, const int __scale)
1380*38fd1498Szrj {
1381*38fd1498Szrj __v8sf __src = _mm256_setzero_ps ();
1382*38fd1498Szrj __v8sf __mask = _mm256_cmp_ps (__src, __src, _CMP_EQ_OQ);
1383*38fd1498Szrj
1384*38fd1498Szrj return (__m256) __builtin_ia32_gathersiv8sf (__src,
1385*38fd1498Szrj __base,
1386*38fd1498Szrj (__v8si)__index,
1387*38fd1498Szrj __mask,
1388*38fd1498Szrj __scale);
1389*38fd1498Szrj }
1390*38fd1498Szrj
1391*38fd1498Szrj extern __inline __m256
1392*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_ps(__m256 __src,float const * __base,__m256i __index,__m256 __mask,const int __scale)1393*38fd1498Szrj _mm256_mask_i32gather_ps (__m256 __src, float const *__base,
1394*38fd1498Szrj __m256i __index, __m256 __mask, const int __scale)
1395*38fd1498Szrj {
1396*38fd1498Szrj return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)__src,
1397*38fd1498Szrj __base,
1398*38fd1498Szrj (__v8si)__index,
1399*38fd1498Szrj (__v8sf)__mask,
1400*38fd1498Szrj __scale);
1401*38fd1498Szrj }
1402*38fd1498Szrj
1403*38fd1498Szrj extern __inline __m128
1404*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_ps(float const * __base,__m128i __index,const int __scale)1405*38fd1498Szrj _mm_i64gather_ps (float const *__base, __m128i __index, const int __scale)
1406*38fd1498Szrj {
1407*38fd1498Szrj __v4sf __src = _mm_setzero_ps ();
1408*38fd1498Szrj __v4sf __mask = _mm_cmpeq_ps (__src, __src);
1409*38fd1498Szrj
1410*38fd1498Szrj return (__m128) __builtin_ia32_gatherdiv4sf (__src,
1411*38fd1498Szrj __base,
1412*38fd1498Szrj (__v2di)__index,
1413*38fd1498Szrj __mask,
1414*38fd1498Szrj __scale);
1415*38fd1498Szrj }
1416*38fd1498Szrj
1417*38fd1498Szrj extern __inline __m128
1418*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_ps(__m128 __src,float const * __base,__m128i __index,__m128 __mask,const int __scale)1419*38fd1498Szrj _mm_mask_i64gather_ps (__m128 __src, float const *__base, __m128i __index,
1420*38fd1498Szrj __m128 __mask, const int __scale)
1421*38fd1498Szrj {
1422*38fd1498Szrj return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)__src,
1423*38fd1498Szrj __base,
1424*38fd1498Szrj (__v2di)__index,
1425*38fd1498Szrj (__v4sf)__mask,
1426*38fd1498Szrj __scale);
1427*38fd1498Szrj }
1428*38fd1498Szrj
1429*38fd1498Szrj extern __inline __m128
1430*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_ps(float const * __base,__m256i __index,const int __scale)1431*38fd1498Szrj _mm256_i64gather_ps (float const *__base, __m256i __index, const int __scale)
1432*38fd1498Szrj {
1433*38fd1498Szrj __v4sf __src = _mm_setzero_ps ();
1434*38fd1498Szrj __v4sf __mask = _mm_cmpeq_ps (__src, __src);
1435*38fd1498Szrj
1436*38fd1498Szrj return (__m128) __builtin_ia32_gatherdiv4sf256 (__src,
1437*38fd1498Szrj __base,
1438*38fd1498Szrj (__v4di)__index,
1439*38fd1498Szrj __mask,
1440*38fd1498Szrj __scale);
1441*38fd1498Szrj }
1442*38fd1498Szrj
1443*38fd1498Szrj extern __inline __m128
1444*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_ps(__m128 __src,float const * __base,__m256i __index,__m128 __mask,const int __scale)1445*38fd1498Szrj _mm256_mask_i64gather_ps (__m128 __src, float const *__base,
1446*38fd1498Szrj __m256i __index, __m128 __mask, const int __scale)
1447*38fd1498Szrj {
1448*38fd1498Szrj return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)__src,
1449*38fd1498Szrj __base,
1450*38fd1498Szrj (__v4di)__index,
1451*38fd1498Szrj (__v4sf)__mask,
1452*38fd1498Szrj __scale);
1453*38fd1498Szrj }
1454*38fd1498Szrj
1455*38fd1498Szrj extern __inline __m128i
1456*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi64(long long int const * __base,__m128i __index,const int __scale)1457*38fd1498Szrj _mm_i32gather_epi64 (long long int const *__base,
1458*38fd1498Szrj __m128i __index, const int __scale)
1459*38fd1498Szrj {
1460*38fd1498Szrj __v2di __src = __extension__ (__v2di){ 0, 0 };
1461*38fd1498Szrj __v2di __mask = __extension__ (__v2di){ ~0, ~0 };
1462*38fd1498Szrj
1463*38fd1498Szrj return (__m128i) __builtin_ia32_gathersiv2di (__src,
1464*38fd1498Szrj __base,
1465*38fd1498Szrj (__v4si)__index,
1466*38fd1498Szrj __mask,
1467*38fd1498Szrj __scale);
1468*38fd1498Szrj }
1469*38fd1498Szrj
1470*38fd1498Szrj extern __inline __m128i
1471*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi64(__m128i __src,long long int const * __base,__m128i __index,__m128i __mask,const int __scale)1472*38fd1498Szrj _mm_mask_i32gather_epi64 (__m128i __src, long long int const *__base,
1473*38fd1498Szrj __m128i __index, __m128i __mask, const int __scale)
1474*38fd1498Szrj {
1475*38fd1498Szrj return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)__src,
1476*38fd1498Szrj __base,
1477*38fd1498Szrj (__v4si)__index,
1478*38fd1498Szrj (__v2di)__mask,
1479*38fd1498Szrj __scale);
1480*38fd1498Szrj }
1481*38fd1498Szrj
1482*38fd1498Szrj extern __inline __m256i
1483*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi64(long long int const * __base,__m128i __index,const int __scale)1484*38fd1498Szrj _mm256_i32gather_epi64 (long long int const *__base,
1485*38fd1498Szrj __m128i __index, const int __scale)
1486*38fd1498Szrj {
1487*38fd1498Szrj __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
1488*38fd1498Szrj __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1489*38fd1498Szrj
1490*38fd1498Szrj return (__m256i) __builtin_ia32_gathersiv4di (__src,
1491*38fd1498Szrj __base,
1492*38fd1498Szrj (__v4si)__index,
1493*38fd1498Szrj __mask,
1494*38fd1498Szrj __scale);
1495*38fd1498Szrj }
1496*38fd1498Szrj
1497*38fd1498Szrj extern __inline __m256i
1498*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi64(__m256i __src,long long int const * __base,__m128i __index,__m256i __mask,const int __scale)1499*38fd1498Szrj _mm256_mask_i32gather_epi64 (__m256i __src, long long int const *__base,
1500*38fd1498Szrj __m128i __index, __m256i __mask,
1501*38fd1498Szrj const int __scale)
1502*38fd1498Szrj {
1503*38fd1498Szrj return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)__src,
1504*38fd1498Szrj __base,
1505*38fd1498Szrj (__v4si)__index,
1506*38fd1498Szrj (__v4di)__mask,
1507*38fd1498Szrj __scale);
1508*38fd1498Szrj }
1509*38fd1498Szrj
1510*38fd1498Szrj extern __inline __m128i
1511*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi64(long long int const * __base,__m128i __index,const int __scale)1512*38fd1498Szrj _mm_i64gather_epi64 (long long int const *__base,
1513*38fd1498Szrj __m128i __index, const int __scale)
1514*38fd1498Szrj {
1515*38fd1498Szrj __v2di __src = __extension__ (__v2di){ 0, 0 };
1516*38fd1498Szrj __v2di __mask = __extension__ (__v2di){ ~0, ~0 };
1517*38fd1498Szrj
1518*38fd1498Szrj return (__m128i) __builtin_ia32_gatherdiv2di (__src,
1519*38fd1498Szrj __base,
1520*38fd1498Szrj (__v2di)__index,
1521*38fd1498Szrj __mask,
1522*38fd1498Szrj __scale);
1523*38fd1498Szrj }
1524*38fd1498Szrj
1525*38fd1498Szrj extern __inline __m128i
1526*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi64(__m128i __src,long long int const * __base,__m128i __index,__m128i __mask,const int __scale)1527*38fd1498Szrj _mm_mask_i64gather_epi64 (__m128i __src, long long int const *__base,
1528*38fd1498Szrj __m128i __index, __m128i __mask, const int __scale)
1529*38fd1498Szrj {
1530*38fd1498Szrj return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)__src,
1531*38fd1498Szrj __base,
1532*38fd1498Szrj (__v2di)__index,
1533*38fd1498Szrj (__v2di)__mask,
1534*38fd1498Szrj __scale);
1535*38fd1498Szrj }
1536*38fd1498Szrj
1537*38fd1498Szrj extern __inline __m256i
1538*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi64(long long int const * __base,__m256i __index,const int __scale)1539*38fd1498Szrj _mm256_i64gather_epi64 (long long int const *__base,
1540*38fd1498Szrj __m256i __index, const int __scale)
1541*38fd1498Szrj {
1542*38fd1498Szrj __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
1543*38fd1498Szrj __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
1544*38fd1498Szrj
1545*38fd1498Szrj return (__m256i) __builtin_ia32_gatherdiv4di (__src,
1546*38fd1498Szrj __base,
1547*38fd1498Szrj (__v4di)__index,
1548*38fd1498Szrj __mask,
1549*38fd1498Szrj __scale);
1550*38fd1498Szrj }
1551*38fd1498Szrj
1552*38fd1498Szrj extern __inline __m256i
1553*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi64(__m256i __src,long long int const * __base,__m256i __index,__m256i __mask,const int __scale)1554*38fd1498Szrj _mm256_mask_i64gather_epi64 (__m256i __src, long long int const *__base,
1555*38fd1498Szrj __m256i __index, __m256i __mask,
1556*38fd1498Szrj const int __scale)
1557*38fd1498Szrj {
1558*38fd1498Szrj return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)__src,
1559*38fd1498Szrj __base,
1560*38fd1498Szrj (__v4di)__index,
1561*38fd1498Szrj (__v4di)__mask,
1562*38fd1498Szrj __scale);
1563*38fd1498Szrj }
1564*38fd1498Szrj
1565*38fd1498Szrj extern __inline __m128i
1566*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi32(int const * __base,__m128i __index,const int __scale)1567*38fd1498Szrj _mm_i32gather_epi32 (int const *__base, __m128i __index, const int __scale)
1568*38fd1498Szrj {
1569*38fd1498Szrj __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
1570*38fd1498Szrj __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1571*38fd1498Szrj
1572*38fd1498Szrj return (__m128i) __builtin_ia32_gathersiv4si (__src,
1573*38fd1498Szrj __base,
1574*38fd1498Szrj (__v4si)__index,
1575*38fd1498Szrj __mask,
1576*38fd1498Szrj __scale);
1577*38fd1498Szrj }
1578*38fd1498Szrj
1579*38fd1498Szrj extern __inline __m128i
1580*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi32(__m128i __src,int const * __base,__m128i __index,__m128i __mask,const int __scale)1581*38fd1498Szrj _mm_mask_i32gather_epi32 (__m128i __src, int const *__base, __m128i __index,
1582*38fd1498Szrj __m128i __mask, const int __scale)
1583*38fd1498Szrj {
1584*38fd1498Szrj return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)__src,
1585*38fd1498Szrj __base,
1586*38fd1498Szrj (__v4si)__index,
1587*38fd1498Szrj (__v4si)__mask,
1588*38fd1498Szrj __scale);
1589*38fd1498Szrj }
1590*38fd1498Szrj
1591*38fd1498Szrj extern __inline __m256i
1592*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi32(int const * __base,__m256i __index,const int __scale)1593*38fd1498Szrj _mm256_i32gather_epi32 (int const *__base, __m256i __index, const int __scale)
1594*38fd1498Szrj {
1595*38fd1498Szrj __v8si __src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
1596*38fd1498Szrj __v8si __mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
1597*38fd1498Szrj
1598*38fd1498Szrj return (__m256i) __builtin_ia32_gathersiv8si (__src,
1599*38fd1498Szrj __base,
1600*38fd1498Szrj (__v8si)__index,
1601*38fd1498Szrj __mask,
1602*38fd1498Szrj __scale);
1603*38fd1498Szrj }
1604*38fd1498Szrj
1605*38fd1498Szrj extern __inline __m256i
1606*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi32(__m256i __src,int const * __base,__m256i __index,__m256i __mask,const int __scale)1607*38fd1498Szrj _mm256_mask_i32gather_epi32 (__m256i __src, int const *__base,
1608*38fd1498Szrj __m256i __index, __m256i __mask,
1609*38fd1498Szrj const int __scale)
1610*38fd1498Szrj {
1611*38fd1498Szrj return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)__src,
1612*38fd1498Szrj __base,
1613*38fd1498Szrj (__v8si)__index,
1614*38fd1498Szrj (__v8si)__mask,
1615*38fd1498Szrj __scale);
1616*38fd1498Szrj }
1617*38fd1498Szrj
1618*38fd1498Szrj extern __inline __m128i
1619*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi32(int const * __base,__m128i __index,const int __scale)1620*38fd1498Szrj _mm_i64gather_epi32 (int const *__base, __m128i __index, const int __scale)
1621*38fd1498Szrj {
1622*38fd1498Szrj __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
1623*38fd1498Szrj __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1624*38fd1498Szrj
1625*38fd1498Szrj return (__m128i) __builtin_ia32_gatherdiv4si (__src,
1626*38fd1498Szrj __base,
1627*38fd1498Szrj (__v2di)__index,
1628*38fd1498Szrj __mask,
1629*38fd1498Szrj __scale);
1630*38fd1498Szrj }
1631*38fd1498Szrj
1632*38fd1498Szrj extern __inline __m128i
1633*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi32(__m128i __src,int const * __base,__m128i __index,__m128i __mask,const int __scale)1634*38fd1498Szrj _mm_mask_i64gather_epi32 (__m128i __src, int const *__base, __m128i __index,
1635*38fd1498Szrj __m128i __mask, const int __scale)
1636*38fd1498Szrj {
1637*38fd1498Szrj return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)__src,
1638*38fd1498Szrj __base,
1639*38fd1498Szrj (__v2di)__index,
1640*38fd1498Szrj (__v4si)__mask,
1641*38fd1498Szrj __scale);
1642*38fd1498Szrj }
1643*38fd1498Szrj
1644*38fd1498Szrj extern __inline __m128i
1645*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi32(int const * __base,__m256i __index,const int __scale)1646*38fd1498Szrj _mm256_i64gather_epi32 (int const *__base, __m256i __index, const int __scale)
1647*38fd1498Szrj {
1648*38fd1498Szrj __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
1649*38fd1498Szrj __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
1650*38fd1498Szrj
1651*38fd1498Szrj return (__m128i) __builtin_ia32_gatherdiv4si256 (__src,
1652*38fd1498Szrj __base,
1653*38fd1498Szrj (__v4di)__index,
1654*38fd1498Szrj __mask,
1655*38fd1498Szrj __scale);
1656*38fd1498Szrj }
1657*38fd1498Szrj
1658*38fd1498Szrj extern __inline __m128i
1659*38fd1498Szrj __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi32(__m128i __src,int const * __base,__m256i __index,__m128i __mask,const int __scale)1660*38fd1498Szrj _mm256_mask_i64gather_epi32 (__m128i __src, int const *__base,
1661*38fd1498Szrj __m256i __index, __m128i __mask,
1662*38fd1498Szrj const int __scale)
1663*38fd1498Szrj {
1664*38fd1498Szrj return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)__src,
1665*38fd1498Szrj __base,
1666*38fd1498Szrj (__v4di)__index,
1667*38fd1498Szrj (__v4si)__mask,
1668*38fd1498Szrj __scale);
1669*38fd1498Szrj }
1670*38fd1498Szrj #else /* __OPTIMIZE__ */
/* Macro forms of the double-precision gathers, used when __OPTIMIZE__
   is not defined.  Each expands to the same builtin as the inline
   function of the same name.  The unmasked forms pass a zero source
   vector and a mask holding -1.0 in every element.
   NOTE(review): presumably only the sign bit of each mask element is
   consulted by the instruction -- confirm against the ISA reference.

   Every macro argument is parenthesized before it is cast, so that an
   argument such as `p + 1' or `c ? a : b' is converted as a whole,
   and each expansion is wrapped in parentheses so that a postfix
   operator applied to the macro call binds to the final result.  */
#define _mm_i32gather_pd(BASE, INDEX, SCALE) \
  ((__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \
      (double const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v2df) _mm_set1_pd ((double)(long long int) -1), \
      (int) (SCALE)))

#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d) (SRC), \
      (double const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v2df)(__m128d) (MASK), \
      (int) (SCALE)))

#define _mm256_i32gather_pd(BASE, INDEX, SCALE) \
  ((__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \
      (double const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v4df) _mm256_set1_pd ((double)(long long int) -1), \
      (int) (SCALE)))

#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d) (SRC), \
      (double const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v4df)(__m256d) (MASK), \
      (int) (SCALE)))

#define _mm_i64gather_pd(BASE, INDEX, SCALE) \
  ((__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \
      (double const *) (BASE), \
      (__v2di)(__m128i) (INDEX), \
      (__v2df) _mm_set1_pd ((double)(long long int) -1), \
      (int) (SCALE)))

#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d) (SRC), \
      (double const *) (BASE), \
      (__v2di)(__m128i) (INDEX), \
      (__v2df)(__m128d) (MASK), \
      (int) (SCALE)))

#define _mm256_i64gather_pd(BASE, INDEX, SCALE) \
  ((__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \
      (double const *) (BASE), \
      (__v4di)(__m256i) (INDEX), \
      (__v4df) _mm256_set1_pd ((double)(long long int) -1), \
      (int) (SCALE)))

#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d) (SRC), \
      (double const *) (BASE), \
      (__v4di)(__m256i) (INDEX), \
      (__v4df)(__m256d) (MASK), \
      (int) (SCALE)))
1730*38fd1498Szrj
/* Macro forms of the single-precision gathers, used when __OPTIMIZE__
   is not defined.  Each expands to the same builtin as the inline
   function of the same name.  The unmasked forms pass a zero source
   vector and a mask holding -1.0f in every element.

   Every macro argument is parenthesized before it is cast and each
   expansion is wrapped in parentheses.  Intermediate casts name the
   float vector types (__m128 / __m256) rather than the double ones,
   and _mm_i64gather_ps builds its zero source with _mm_setzero_ps;
   these are same-size vector reinterpretations, so the bits handed to
   the builtin are unchanged.  */
#define _mm_i32gather_ps(BASE, INDEX, SCALE) \
  ((__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \
      (float const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v4sf) _mm_set1_ps ((float)(int) -1), \
      (int) (SCALE)))

#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128) (SRC), \
      (float const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v4sf)(__m128) (MASK), \
      (int) (SCALE)))

#define _mm256_i32gather_ps(BASE, INDEX, SCALE) \
  ((__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
      (float const *) (BASE), \
      (__v8si)(__m256i) (INDEX), \
      (__v8sf) _mm256_set1_ps ((float)(int) -1), \
      (int) (SCALE)))

#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256) (SRC), \
      (float const *) (BASE), \
      (__v8si)(__m256i) (INDEX), \
      (__v8sf)(__m256) (MASK), \
      (int) (SCALE)))

#define _mm_i64gather_ps(BASE, INDEX, SCALE) \
  ((__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (), \
      (float const *) (BASE), \
      (__v2di)(__m128i) (INDEX), \
      (__v4sf) _mm_set1_ps ((float)(int) -1), \
      (int) (SCALE)))

#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128) (SRC), \
      (float const *) (BASE), \
      (__v2di)(__m128i) (INDEX), \
      (__v4sf)(__m128) (MASK), \
      (int) (SCALE)))

#define _mm256_i64gather_ps(BASE, INDEX, SCALE) \
  ((__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \
      (float const *) (BASE), \
      (__v4di)(__m256i) (INDEX), \
      (__v4sf) _mm_set1_ps ((float)(int) -1), \
      (int) (SCALE)))

#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128) (SRC), \
      (float const *) (BASE), \
      (__v4di)(__m256i) (INDEX), \
      (__v4sf)(__m128) (MASK), \
      (int) (SCALE)))
1789*38fd1498Szrj
/* Macro forms of the 64-bit integer gathers, used when __OPTIMIZE__
   is not defined.  Each expands to the same builtin as the inline
   function of the same name.  The unmasked forms pass a zero source
   vector and a mask built with *_set1_epi64x (-1).

   Every macro argument is parenthesized before it is cast and each
   expansion is wrapped in parentheses, so compound argument
   expressions and postfix operators on the macro call group as a
   function call would.  */
#define _mm_i32gather_epi64(BASE, INDEX, SCALE) \
  ((__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
      (long long const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v2di) _mm_set1_epi64x (-1), \
      (int) (SCALE)))

#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i) (SRC), \
      (long long const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v2di)(__m128i) (MASK), \
      (int) (SCALE)))

#define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \
  ((__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
      (long long const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v4di) _mm256_set1_epi64x (-1), \
      (int) (SCALE)))

#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i) (SRC), \
      (long long const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v4di)(__m256i) (MASK), \
      (int) (SCALE)))

#define _mm_i64gather_epi64(BASE, INDEX, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
      (long long const *) (BASE), \
      (__v2di)(__m128i) (INDEX), \
      (__v2di) _mm_set1_epi64x (-1), \
      (int) (SCALE)))

#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i) (SRC), \
      (long long const *) (BASE), \
      (__v2di)(__m128i) (INDEX), \
      (__v2di)(__m128i) (MASK), \
      (int) (SCALE)))

#define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \
  ((__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
      (long long const *) (BASE), \
      (__v4di)(__m256i) (INDEX), \
      (__v4di) _mm256_set1_epi64x (-1), \
      (int) (SCALE)))

#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i) (SRC), \
      (long long const *) (BASE), \
      (__v4di)(__m256i) (INDEX), \
      (__v4di)(__m256i) (MASK), \
      (int) (SCALE)))
1845*38fd1498Szrj
/* Macro forms of the 32-bit integer gathers, used when __OPTIMIZE__
   is not defined.  Each expands to the same builtin as the inline
   function of the same name.  The unmasked forms pass a zero source
   vector and a mask built with *_set1_epi32 (-1).

   Every macro argument is parenthesized before it is cast and each
   expansion is wrapped in parentheses, so compound argument
   expressions and postfix operators on the macro call group as a
   function call would.  */
#define _mm_i32gather_epi32(BASE, INDEX, SCALE) \
  ((__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \
      (int const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v4si) _mm_set1_epi32 (-1), \
      (int) (SCALE)))

#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i) (SRC), \
      (int const *) (BASE), \
      (__v4si)(__m128i) (INDEX), \
      (__v4si)(__m128i) (MASK), \
      (int) (SCALE)))

#define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \
  ((__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
      (int const *) (BASE), \
      (__v8si)(__m256i) (INDEX), \
      (__v8si) _mm256_set1_epi32 (-1), \
      (int) (SCALE)))

#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i) (SRC), \
      (int const *) (BASE), \
      (__v8si)(__m256i) (INDEX), \
      (__v8si)(__m256i) (MASK), \
      (int) (SCALE)))

#define _mm_i64gather_epi32(BASE, INDEX, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \
      (int const *) (BASE), \
      (__v2di)(__m128i) (INDEX), \
      (__v4si) _mm_set1_epi32 (-1), \
      (int) (SCALE)))

#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i) (SRC), \
      (int const *) (BASE), \
      (__v2di)(__m128i) (INDEX), \
      (__v4si)(__m128i) (MASK), \
      (int) (SCALE)))

#define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
      (int const *) (BASE), \
      (__v4di)(__m256i) (INDEX), \
      (__v4si) _mm_set1_epi32 (-1), \
      (int) (SCALE)))

#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  ((__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i) (SRC), \
      (int const *) (BASE), \
      (__v4di)(__m256i) (INDEX), \
      (__v4si)(__m128i) (MASK), \
      (int) (SCALE)))
1901*38fd1498Szrj #endif /* __OPTIMIZE__ */
1902*38fd1498Szrj
1903*38fd1498Szrj #ifdef __DISABLE_AVX2__
1904*38fd1498Szrj #undef __DISABLE_AVX2__
1905*38fd1498Szrj #pragma GCC pop_options
1906*38fd1498Szrj #endif /* __DISABLE_AVX2__ */
1907*38fd1498Szrj
1908*38fd1498Szrj #endif /* _AVX2INTRIN_H_INCLUDED */
1909