/* Copyright (C) 2002-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
23*38fd1498Szrj
/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */
26*38fd1498Szrj
27*38fd1498Szrj #ifndef _MMINTRIN_H_INCLUDED
28*38fd1498Szrj #define _MMINTRIN_H_INCLUDED
29*38fd1498Szrj
30*38fd1498Szrj #if defined __x86_64__ && !defined __SSE__ || !defined __MMX__
31*38fd1498Szrj #pragma GCC push_options
32*38fd1498Szrj #ifdef __x86_64__
33*38fd1498Szrj #pragma GCC target("sse,mmx")
34*38fd1498Szrj #else
35*38fd1498Szrj #pragma GCC target("mmx")
36*38fd1498Szrj #endif
37*38fd1498Szrj #define __DISABLE_MMX__
38*38fd1498Szrj #endif /* __MMX__ */
39*38fd1498Szrj
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

/* Unaligned version of the same type: identical to __m64 except that it
   is declared with an alignment of 1.  */
typedef int __m64_u __attribute__ ((__vector_size__ (8), __may_alias__, __aligned__ (1)));

/* Internal data types for implementing the intrinsics.  Each is an 8-byte
   vector; the suffix names the element type (int, short, char, long long,
   float).  */
typedef int __v2si __attribute__ ((__vector_size__ (8)));
typedef short __v4hi __attribute__ ((__vector_size__ (8)));
typedef char __v8qi __attribute__ ((__vector_size__ (8)));
typedef long long __v1di __attribute__ ((__vector_size__ (8)));
typedef float __v2sf __attribute__ ((__vector_size__ (8)));
53*38fd1498Szrj
/* Empty the multimedia state.  Takes no arguments and returns nothing;
   the work is done by the emms builtin.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  __builtin_ia32_emms ();
}
60*38fd1498Szrj
/* Alternate name for _mm_empty; simply calls it.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  _mm_empty ();
}
66*38fd1498Szrj
67*38fd1498Szrj /* Convert I to a __m64 object. The integer is zero-extended to 64-bits. */
68*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64(int __i)69*38fd1498Szrj _mm_cvtsi32_si64 (int __i)
70*38fd1498Szrj {
71*38fd1498Szrj return (__m64) __builtin_ia32_vec_init_v2si (__i, 0);
72*38fd1498Szrj }
73*38fd1498Szrj
74*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int(int __i)75*38fd1498Szrj _m_from_int (int __i)
76*38fd1498Szrj {
77*38fd1498Szrj return _mm_cvtsi32_si64 (__i);
78*38fd1498Szrj }
79*38fd1498Szrj
80*38fd1498Szrj #ifdef __x86_64__
81*38fd1498Szrj /* Convert I to a __m64 object. */
82*38fd1498Szrj
83*38fd1498Szrj /* Intel intrinsic. */
84*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64(long long __i)85*38fd1498Szrj _m_from_int64 (long long __i)
86*38fd1498Szrj {
87*38fd1498Szrj return (__m64) __i;
88*38fd1498Szrj }
89*38fd1498Szrj
90*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64(long long __i)91*38fd1498Szrj _mm_cvtsi64_m64 (long long __i)
92*38fd1498Szrj {
93*38fd1498Szrj return (__m64) __i;
94*38fd1498Szrj }
95*38fd1498Szrj
96*38fd1498Szrj /* Microsoft intrinsic. */
97*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64(long long __i)98*38fd1498Szrj _mm_cvtsi64x_si64 (long long __i)
99*38fd1498Szrj {
100*38fd1498Szrj return (__m64) __i;
101*38fd1498Szrj }
102*38fd1498Szrj
103*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x(long long __i)104*38fd1498Szrj _mm_set_pi64x (long long __i)
105*38fd1498Szrj {
106*38fd1498Szrj return (__m64) __i;
107*38fd1498Szrj }
108*38fd1498Szrj #endif
109*38fd1498Szrj
110*38fd1498Szrj /* Convert the lower 32 bits of the __m64 object into an integer. */
111*38fd1498Szrj extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32(__m64 __i)112*38fd1498Szrj _mm_cvtsi64_si32 (__m64 __i)
113*38fd1498Szrj {
114*38fd1498Szrj return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0);
115*38fd1498Szrj }
116*38fd1498Szrj
117*38fd1498Szrj extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int(__m64 __i)118*38fd1498Szrj _m_to_int (__m64 __i)
119*38fd1498Szrj {
120*38fd1498Szrj return _mm_cvtsi64_si32 (__i);
121*38fd1498Szrj }
122*38fd1498Szrj
123*38fd1498Szrj #ifdef __x86_64__
124*38fd1498Szrj /* Convert the __m64 object to a 64bit integer. */
125*38fd1498Szrj
126*38fd1498Szrj /* Intel intrinsic. */
127*38fd1498Szrj extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64(__m64 __i)128*38fd1498Szrj _m_to_int64 (__m64 __i)
129*38fd1498Szrj {
130*38fd1498Szrj return (long long)__i;
131*38fd1498Szrj }
132*38fd1498Szrj
133*38fd1498Szrj extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64(__m64 __i)134*38fd1498Szrj _mm_cvtm64_si64 (__m64 __i)
135*38fd1498Szrj {
136*38fd1498Szrj return (long long)__i;
137*38fd1498Szrj }
138*38fd1498Szrj
139*38fd1498Szrj /* Microsoft intrinsic. */
140*38fd1498Szrj extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x(__m64 __i)141*38fd1498Szrj _mm_cvtsi64_si64x (__m64 __i)
142*38fd1498Szrj {
143*38fd1498Szrj return (long long)__i;
144*38fd1498Szrj }
145*38fd1498Szrj #endif
146*38fd1498Szrj
147*38fd1498Szrj /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
148*38fd1498Szrj the result, and the four 16-bit values from M2 into the upper four 8-bit
149*38fd1498Szrj values of the result, all with signed saturation. */
150*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16(__m64 __m1,__m64 __m2)151*38fd1498Szrj _mm_packs_pi16 (__m64 __m1, __m64 __m2)
152*38fd1498Szrj {
153*38fd1498Szrj return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
154*38fd1498Szrj }
155*38fd1498Szrj
156*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb(__m64 __m1,__m64 __m2)157*38fd1498Szrj _m_packsswb (__m64 __m1, __m64 __m2)
158*38fd1498Szrj {
159*38fd1498Szrj return _mm_packs_pi16 (__m1, __m2);
160*38fd1498Szrj }
161*38fd1498Szrj
162*38fd1498Szrj /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
163*38fd1498Szrj the result, and the two 32-bit values from M2 into the upper two 16-bit
164*38fd1498Szrj values of the result, all with signed saturation. */
165*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32(__m64 __m1,__m64 __m2)166*38fd1498Szrj _mm_packs_pi32 (__m64 __m1, __m64 __m2)
167*38fd1498Szrj {
168*38fd1498Szrj return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
169*38fd1498Szrj }
170*38fd1498Szrj
171*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw(__m64 __m1,__m64 __m2)172*38fd1498Szrj _m_packssdw (__m64 __m1, __m64 __m2)
173*38fd1498Szrj {
174*38fd1498Szrj return _mm_packs_pi32 (__m1, __m2);
175*38fd1498Szrj }
176*38fd1498Szrj
177*38fd1498Szrj /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
178*38fd1498Szrj the result, and the four 16-bit values from M2 into the upper four 8-bit
179*38fd1498Szrj values of the result, all with unsigned saturation. */
180*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16(__m64 __m1,__m64 __m2)181*38fd1498Szrj _mm_packs_pu16 (__m64 __m1, __m64 __m2)
182*38fd1498Szrj {
183*38fd1498Szrj return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
184*38fd1498Szrj }
185*38fd1498Szrj
186*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb(__m64 __m1,__m64 __m2)187*38fd1498Szrj _m_packuswb (__m64 __m1, __m64 __m2)
188*38fd1498Szrj {
189*38fd1498Szrj return _mm_packs_pu16 (__m1, __m2);
190*38fd1498Szrj }
191*38fd1498Szrj
192*38fd1498Szrj /* Interleave the four 8-bit values from the high half of M1 with the four
193*38fd1498Szrj 8-bit values from the high half of M2. */
194*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8(__m64 __m1,__m64 __m2)195*38fd1498Szrj _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
196*38fd1498Szrj {
197*38fd1498Szrj return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
198*38fd1498Szrj }
199*38fd1498Szrj
200*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw(__m64 __m1,__m64 __m2)201*38fd1498Szrj _m_punpckhbw (__m64 __m1, __m64 __m2)
202*38fd1498Szrj {
203*38fd1498Szrj return _mm_unpackhi_pi8 (__m1, __m2);
204*38fd1498Szrj }
205*38fd1498Szrj
206*38fd1498Szrj /* Interleave the two 16-bit values from the high half of M1 with the two
207*38fd1498Szrj 16-bit values from the high half of M2. */
208*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16(__m64 __m1,__m64 __m2)209*38fd1498Szrj _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
210*38fd1498Szrj {
211*38fd1498Szrj return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
212*38fd1498Szrj }
213*38fd1498Szrj
214*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd(__m64 __m1,__m64 __m2)215*38fd1498Szrj _m_punpckhwd (__m64 __m1, __m64 __m2)
216*38fd1498Szrj {
217*38fd1498Szrj return _mm_unpackhi_pi16 (__m1, __m2);
218*38fd1498Szrj }
219*38fd1498Szrj
220*38fd1498Szrj /* Interleave the 32-bit value from the high half of M1 with the 32-bit
221*38fd1498Szrj value from the high half of M2. */
222*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32(__m64 __m1,__m64 __m2)223*38fd1498Szrj _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
224*38fd1498Szrj {
225*38fd1498Szrj return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
226*38fd1498Szrj }
227*38fd1498Szrj
228*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq(__m64 __m1,__m64 __m2)229*38fd1498Szrj _m_punpckhdq (__m64 __m1, __m64 __m2)
230*38fd1498Szrj {
231*38fd1498Szrj return _mm_unpackhi_pi32 (__m1, __m2);
232*38fd1498Szrj }
233*38fd1498Szrj
234*38fd1498Szrj /* Interleave the four 8-bit values from the low half of M1 with the four
235*38fd1498Szrj 8-bit values from the low half of M2. */
236*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8(__m64 __m1,__m64 __m2)237*38fd1498Szrj _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
238*38fd1498Szrj {
239*38fd1498Szrj return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
240*38fd1498Szrj }
241*38fd1498Szrj
242*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw(__m64 __m1,__m64 __m2)243*38fd1498Szrj _m_punpcklbw (__m64 __m1, __m64 __m2)
244*38fd1498Szrj {
245*38fd1498Szrj return _mm_unpacklo_pi8 (__m1, __m2);
246*38fd1498Szrj }
247*38fd1498Szrj
248*38fd1498Szrj /* Interleave the two 16-bit values from the low half of M1 with the two
249*38fd1498Szrj 16-bit values from the low half of M2. */
250*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16(__m64 __m1,__m64 __m2)251*38fd1498Szrj _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
252*38fd1498Szrj {
253*38fd1498Szrj return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
254*38fd1498Szrj }
255*38fd1498Szrj
256*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd(__m64 __m1,__m64 __m2)257*38fd1498Szrj _m_punpcklwd (__m64 __m1, __m64 __m2)
258*38fd1498Szrj {
259*38fd1498Szrj return _mm_unpacklo_pi16 (__m1, __m2);
260*38fd1498Szrj }
261*38fd1498Szrj
262*38fd1498Szrj /* Interleave the 32-bit value from the low half of M1 with the 32-bit
263*38fd1498Szrj value from the low half of M2. */
264*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32(__m64 __m1,__m64 __m2)265*38fd1498Szrj _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
266*38fd1498Szrj {
267*38fd1498Szrj return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
268*38fd1498Szrj }
269*38fd1498Szrj
270*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq(__m64 __m1,__m64 __m2)271*38fd1498Szrj _m_punpckldq (__m64 __m1, __m64 __m2)
272*38fd1498Szrj {
273*38fd1498Szrj return _mm_unpacklo_pi32 (__m1, __m2);
274*38fd1498Szrj }
275*38fd1498Szrj
276*38fd1498Szrj /* Add the 8-bit values in M1 to the 8-bit values in M2. */
277*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8(__m64 __m1,__m64 __m2)278*38fd1498Szrj _mm_add_pi8 (__m64 __m1, __m64 __m2)
279*38fd1498Szrj {
280*38fd1498Szrj return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
281*38fd1498Szrj }
282*38fd1498Szrj
283*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb(__m64 __m1,__m64 __m2)284*38fd1498Szrj _m_paddb (__m64 __m1, __m64 __m2)
285*38fd1498Szrj {
286*38fd1498Szrj return _mm_add_pi8 (__m1, __m2);
287*38fd1498Szrj }
288*38fd1498Szrj
289*38fd1498Szrj /* Add the 16-bit values in M1 to the 16-bit values in M2. */
290*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16(__m64 __m1,__m64 __m2)291*38fd1498Szrj _mm_add_pi16 (__m64 __m1, __m64 __m2)
292*38fd1498Szrj {
293*38fd1498Szrj return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
294*38fd1498Szrj }
295*38fd1498Szrj
296*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw(__m64 __m1,__m64 __m2)297*38fd1498Szrj _m_paddw (__m64 __m1, __m64 __m2)
298*38fd1498Szrj {
299*38fd1498Szrj return _mm_add_pi16 (__m1, __m2);
300*38fd1498Szrj }
301*38fd1498Szrj
302*38fd1498Szrj /* Add the 32-bit values in M1 to the 32-bit values in M2. */
303*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32(__m64 __m1,__m64 __m2)304*38fd1498Szrj _mm_add_pi32 (__m64 __m1, __m64 __m2)
305*38fd1498Szrj {
306*38fd1498Szrj return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
307*38fd1498Szrj }
308*38fd1498Szrj
309*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd(__m64 __m1,__m64 __m2)310*38fd1498Szrj _m_paddd (__m64 __m1, __m64 __m2)
311*38fd1498Szrj {
312*38fd1498Szrj return _mm_add_pi32 (__m1, __m2);
313*38fd1498Szrj }
314*38fd1498Szrj
315*38fd1498Szrj /* Add the 64-bit values in M1 to the 64-bit values in M2. */
316*38fd1498Szrj #ifndef __SSE2__
317*38fd1498Szrj #pragma GCC push_options
318*38fd1498Szrj #pragma GCC target("sse2,mmx")
319*38fd1498Szrj #define __DISABLE_SSE2__
320*38fd1498Szrj #endif /* __SSE2__ */
321*38fd1498Szrj
322*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64(__m64 __m1,__m64 __m2)323*38fd1498Szrj _mm_add_si64 (__m64 __m1, __m64 __m2)
324*38fd1498Szrj {
325*38fd1498Szrj return (__m64) __builtin_ia32_paddq ((__v1di)__m1, (__v1di)__m2);
326*38fd1498Szrj }
327*38fd1498Szrj #ifdef __DISABLE_SSE2__
328*38fd1498Szrj #undef __DISABLE_SSE2__
329*38fd1498Szrj #pragma GCC pop_options
330*38fd1498Szrj #endif /* __DISABLE_SSE2__ */
331*38fd1498Szrj
332*38fd1498Szrj /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
333*38fd1498Szrj saturated arithmetic. */
334*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8(__m64 __m1,__m64 __m2)335*38fd1498Szrj _mm_adds_pi8 (__m64 __m1, __m64 __m2)
336*38fd1498Szrj {
337*38fd1498Szrj return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
338*38fd1498Szrj }
339*38fd1498Szrj
340*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb(__m64 __m1,__m64 __m2)341*38fd1498Szrj _m_paddsb (__m64 __m1, __m64 __m2)
342*38fd1498Szrj {
343*38fd1498Szrj return _mm_adds_pi8 (__m1, __m2);
344*38fd1498Szrj }
345*38fd1498Szrj
346*38fd1498Szrj /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
347*38fd1498Szrj saturated arithmetic. */
348*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16(__m64 __m1,__m64 __m2)349*38fd1498Szrj _mm_adds_pi16 (__m64 __m1, __m64 __m2)
350*38fd1498Szrj {
351*38fd1498Szrj return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
352*38fd1498Szrj }
353*38fd1498Szrj
354*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw(__m64 __m1,__m64 __m2)355*38fd1498Szrj _m_paddsw (__m64 __m1, __m64 __m2)
356*38fd1498Szrj {
357*38fd1498Szrj return _mm_adds_pi16 (__m1, __m2);
358*38fd1498Szrj }
359*38fd1498Szrj
360*38fd1498Szrj /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
361*38fd1498Szrj saturated arithmetic. */
362*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8(__m64 __m1,__m64 __m2)363*38fd1498Szrj _mm_adds_pu8 (__m64 __m1, __m64 __m2)
364*38fd1498Szrj {
365*38fd1498Szrj return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
366*38fd1498Szrj }
367*38fd1498Szrj
368*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb(__m64 __m1,__m64 __m2)369*38fd1498Szrj _m_paddusb (__m64 __m1, __m64 __m2)
370*38fd1498Szrj {
371*38fd1498Szrj return _mm_adds_pu8 (__m1, __m2);
372*38fd1498Szrj }
373*38fd1498Szrj
374*38fd1498Szrj /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
375*38fd1498Szrj saturated arithmetic. */
376*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16(__m64 __m1,__m64 __m2)377*38fd1498Szrj _mm_adds_pu16 (__m64 __m1, __m64 __m2)
378*38fd1498Szrj {
379*38fd1498Szrj return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
380*38fd1498Szrj }
381*38fd1498Szrj
382*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw(__m64 __m1,__m64 __m2)383*38fd1498Szrj _m_paddusw (__m64 __m1, __m64 __m2)
384*38fd1498Szrj {
385*38fd1498Szrj return _mm_adds_pu16 (__m1, __m2);
386*38fd1498Szrj }
387*38fd1498Szrj
388*38fd1498Szrj /* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
389*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8(__m64 __m1,__m64 __m2)390*38fd1498Szrj _mm_sub_pi8 (__m64 __m1, __m64 __m2)
391*38fd1498Szrj {
392*38fd1498Szrj return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
393*38fd1498Szrj }
394*38fd1498Szrj
395*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb(__m64 __m1,__m64 __m2)396*38fd1498Szrj _m_psubb (__m64 __m1, __m64 __m2)
397*38fd1498Szrj {
398*38fd1498Szrj return _mm_sub_pi8 (__m1, __m2);
399*38fd1498Szrj }
400*38fd1498Szrj
401*38fd1498Szrj /* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
402*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16(__m64 __m1,__m64 __m2)403*38fd1498Szrj _mm_sub_pi16 (__m64 __m1, __m64 __m2)
404*38fd1498Szrj {
405*38fd1498Szrj return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
406*38fd1498Szrj }
407*38fd1498Szrj
408*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw(__m64 __m1,__m64 __m2)409*38fd1498Szrj _m_psubw (__m64 __m1, __m64 __m2)
410*38fd1498Szrj {
411*38fd1498Szrj return _mm_sub_pi16 (__m1, __m2);
412*38fd1498Szrj }
413*38fd1498Szrj
414*38fd1498Szrj /* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
415*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32(__m64 __m1,__m64 __m2)416*38fd1498Szrj _mm_sub_pi32 (__m64 __m1, __m64 __m2)
417*38fd1498Szrj {
418*38fd1498Szrj return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
419*38fd1498Szrj }
420*38fd1498Szrj
421*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd(__m64 __m1,__m64 __m2)422*38fd1498Szrj _m_psubd (__m64 __m1, __m64 __m2)
423*38fd1498Szrj {
424*38fd1498Szrj return _mm_sub_pi32 (__m1, __m2);
425*38fd1498Szrj }
426*38fd1498Szrj
427*38fd1498Szrj /* Add the 64-bit values in M1 to the 64-bit values in M2. */
428*38fd1498Szrj #ifndef __SSE2__
429*38fd1498Szrj #pragma GCC push_options
430*38fd1498Szrj #pragma GCC target("sse2,mmx")
431*38fd1498Szrj #define __DISABLE_SSE2__
432*38fd1498Szrj #endif /* __SSE2__ */
433*38fd1498Szrj
434*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64(__m64 __m1,__m64 __m2)435*38fd1498Szrj _mm_sub_si64 (__m64 __m1, __m64 __m2)
436*38fd1498Szrj {
437*38fd1498Szrj return (__m64) __builtin_ia32_psubq ((__v1di)__m1, (__v1di)__m2);
438*38fd1498Szrj }
439*38fd1498Szrj #ifdef __DISABLE_SSE2__
440*38fd1498Szrj #undef __DISABLE_SSE2__
441*38fd1498Szrj #pragma GCC pop_options
442*38fd1498Szrj #endif /* __DISABLE_SSE2__ */
443*38fd1498Szrj
444*38fd1498Szrj /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
445*38fd1498Szrj saturating arithmetic. */
446*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8(__m64 __m1,__m64 __m2)447*38fd1498Szrj _mm_subs_pi8 (__m64 __m1, __m64 __m2)
448*38fd1498Szrj {
449*38fd1498Szrj return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
450*38fd1498Szrj }
451*38fd1498Szrj
452*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb(__m64 __m1,__m64 __m2)453*38fd1498Szrj _m_psubsb (__m64 __m1, __m64 __m2)
454*38fd1498Szrj {
455*38fd1498Szrj return _mm_subs_pi8 (__m1, __m2);
456*38fd1498Szrj }
457*38fd1498Szrj
458*38fd1498Szrj /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
459*38fd1498Szrj signed saturating arithmetic. */
460*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16(__m64 __m1,__m64 __m2)461*38fd1498Szrj _mm_subs_pi16 (__m64 __m1, __m64 __m2)
462*38fd1498Szrj {
463*38fd1498Szrj return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
464*38fd1498Szrj }
465*38fd1498Szrj
466*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw(__m64 __m1,__m64 __m2)467*38fd1498Szrj _m_psubsw (__m64 __m1, __m64 __m2)
468*38fd1498Szrj {
469*38fd1498Szrj return _mm_subs_pi16 (__m1, __m2);
470*38fd1498Szrj }
471*38fd1498Szrj
472*38fd1498Szrj /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
473*38fd1498Szrj unsigned saturating arithmetic. */
474*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8(__m64 __m1,__m64 __m2)475*38fd1498Szrj _mm_subs_pu8 (__m64 __m1, __m64 __m2)
476*38fd1498Szrj {
477*38fd1498Szrj return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
478*38fd1498Szrj }
479*38fd1498Szrj
480*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb(__m64 __m1,__m64 __m2)481*38fd1498Szrj _m_psubusb (__m64 __m1, __m64 __m2)
482*38fd1498Szrj {
483*38fd1498Szrj return _mm_subs_pu8 (__m1, __m2);
484*38fd1498Szrj }
485*38fd1498Szrj
486*38fd1498Szrj /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
487*38fd1498Szrj unsigned saturating arithmetic. */
488*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16(__m64 __m1,__m64 __m2)489*38fd1498Szrj _mm_subs_pu16 (__m64 __m1, __m64 __m2)
490*38fd1498Szrj {
491*38fd1498Szrj return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
492*38fd1498Szrj }
493*38fd1498Szrj
494*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw(__m64 __m1,__m64 __m2)495*38fd1498Szrj _m_psubusw (__m64 __m1, __m64 __m2)
496*38fd1498Szrj {
497*38fd1498Szrj return _mm_subs_pu16 (__m1, __m2);
498*38fd1498Szrj }
499*38fd1498Szrj
500*38fd1498Szrj /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
501*38fd1498Szrj four 32-bit intermediate results, which are then summed by pairs to
502*38fd1498Szrj produce two 32-bit results. */
503*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16(__m64 __m1,__m64 __m2)504*38fd1498Szrj _mm_madd_pi16 (__m64 __m1, __m64 __m2)
505*38fd1498Szrj {
506*38fd1498Szrj return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
507*38fd1498Szrj }
508*38fd1498Szrj
509*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd(__m64 __m1,__m64 __m2)510*38fd1498Szrj _m_pmaddwd (__m64 __m1, __m64 __m2)
511*38fd1498Szrj {
512*38fd1498Szrj return _mm_madd_pi16 (__m1, __m2);
513*38fd1498Szrj }
514*38fd1498Szrj
515*38fd1498Szrj /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
516*38fd1498Szrj M2 and produce the high 16 bits of the 32-bit results. */
517*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16(__m64 __m1,__m64 __m2)518*38fd1498Szrj _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
519*38fd1498Szrj {
520*38fd1498Szrj return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
521*38fd1498Szrj }
522*38fd1498Szrj
523*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw(__m64 __m1,__m64 __m2)524*38fd1498Szrj _m_pmulhw (__m64 __m1, __m64 __m2)
525*38fd1498Szrj {
526*38fd1498Szrj return _mm_mulhi_pi16 (__m1, __m2);
527*38fd1498Szrj }
528*38fd1498Szrj
529*38fd1498Szrj /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
530*38fd1498Szrj the low 16 bits of the results. */
531*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16(__m64 __m1,__m64 __m2)532*38fd1498Szrj _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
533*38fd1498Szrj {
534*38fd1498Szrj return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
535*38fd1498Szrj }
536*38fd1498Szrj
537*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw(__m64 __m1,__m64 __m2)538*38fd1498Szrj _m_pmullw (__m64 __m1, __m64 __m2)
539*38fd1498Szrj {
540*38fd1498Szrj return _mm_mullo_pi16 (__m1, __m2);
541*38fd1498Szrj }
542*38fd1498Szrj
543*38fd1498Szrj /* Shift four 16-bit values in M left by COUNT. */
544*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16(__m64 __m,__m64 __count)545*38fd1498Szrj _mm_sll_pi16 (__m64 __m, __m64 __count)
546*38fd1498Szrj {
547*38fd1498Szrj return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count);
548*38fd1498Szrj }
549*38fd1498Szrj
550*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw(__m64 __m,__m64 __count)551*38fd1498Szrj _m_psllw (__m64 __m, __m64 __count)
552*38fd1498Szrj {
553*38fd1498Szrj return _mm_sll_pi16 (__m, __count);
554*38fd1498Szrj }
555*38fd1498Szrj
556*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16(__m64 __m,int __count)557*38fd1498Szrj _mm_slli_pi16 (__m64 __m, int __count)
558*38fd1498Szrj {
559*38fd1498Szrj return (__m64) __builtin_ia32_psllwi ((__v4hi)__m, __count);
560*38fd1498Szrj }
561*38fd1498Szrj
562*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi(__m64 __m,int __count)563*38fd1498Szrj _m_psllwi (__m64 __m, int __count)
564*38fd1498Szrj {
565*38fd1498Szrj return _mm_slli_pi16 (__m, __count);
566*38fd1498Szrj }
567*38fd1498Szrj
568*38fd1498Szrj /* Shift two 32-bit values in M left by COUNT. */
569*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32(__m64 __m,__m64 __count)570*38fd1498Szrj _mm_sll_pi32 (__m64 __m, __m64 __count)
571*38fd1498Szrj {
572*38fd1498Szrj return (__m64) __builtin_ia32_pslld ((__v2si)__m, (__v2si)__count);
573*38fd1498Szrj }
574*38fd1498Szrj
575*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld(__m64 __m,__m64 __count)576*38fd1498Szrj _m_pslld (__m64 __m, __m64 __count)
577*38fd1498Szrj {
578*38fd1498Szrj return _mm_sll_pi32 (__m, __count);
579*38fd1498Szrj }
580*38fd1498Szrj
581*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32(__m64 __m,int __count)582*38fd1498Szrj _mm_slli_pi32 (__m64 __m, int __count)
583*38fd1498Szrj {
584*38fd1498Szrj return (__m64) __builtin_ia32_pslldi ((__v2si)__m, __count);
585*38fd1498Szrj }
586*38fd1498Szrj
587*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi(__m64 __m,int __count)588*38fd1498Szrj _m_pslldi (__m64 __m, int __count)
589*38fd1498Szrj {
590*38fd1498Szrj return _mm_slli_pi32 (__m, __count);
591*38fd1498Szrj }
592*38fd1498Szrj
593*38fd1498Szrj /* Shift the 64-bit value in M left by COUNT. */
594*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64(__m64 __m,__m64 __count)595*38fd1498Szrj _mm_sll_si64 (__m64 __m, __m64 __count)
596*38fd1498Szrj {
597*38fd1498Szrj return (__m64) __builtin_ia32_psllq ((__v1di)__m, (__v1di)__count);
598*38fd1498Szrj }
599*38fd1498Szrj
600*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq(__m64 __m,__m64 __count)601*38fd1498Szrj _m_psllq (__m64 __m, __m64 __count)
602*38fd1498Szrj {
603*38fd1498Szrj return _mm_sll_si64 (__m, __count);
604*38fd1498Szrj }
605*38fd1498Szrj
606*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64(__m64 __m,int __count)607*38fd1498Szrj _mm_slli_si64 (__m64 __m, int __count)
608*38fd1498Szrj {
609*38fd1498Szrj return (__m64) __builtin_ia32_psllqi ((__v1di)__m, __count);
610*38fd1498Szrj }
611*38fd1498Szrj
612*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi(__m64 __m,int __count)613*38fd1498Szrj _m_psllqi (__m64 __m, int __count)
614*38fd1498Szrj {
615*38fd1498Szrj return _mm_slli_si64 (__m, __count);
616*38fd1498Szrj }
617*38fd1498Szrj
618*38fd1498Szrj /* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
619*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16(__m64 __m,__m64 __count)620*38fd1498Szrj _mm_sra_pi16 (__m64 __m, __m64 __count)
621*38fd1498Szrj {
622*38fd1498Szrj return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (__v4hi)__count);
623*38fd1498Szrj }
624*38fd1498Szrj
625*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw(__m64 __m,__m64 __count)626*38fd1498Szrj _m_psraw (__m64 __m, __m64 __count)
627*38fd1498Szrj {
628*38fd1498Szrj return _mm_sra_pi16 (__m, __count);
629*38fd1498Szrj }
630*38fd1498Szrj
631*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16(__m64 __m,int __count)632*38fd1498Szrj _mm_srai_pi16 (__m64 __m, int __count)
633*38fd1498Szrj {
634*38fd1498Szrj return (__m64) __builtin_ia32_psrawi ((__v4hi)__m, __count);
635*38fd1498Szrj }
636*38fd1498Szrj
637*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi(__m64 __m,int __count)638*38fd1498Szrj _m_psrawi (__m64 __m, int __count)
639*38fd1498Szrj {
640*38fd1498Szrj return _mm_srai_pi16 (__m, __count);
641*38fd1498Szrj }
642*38fd1498Szrj
643*38fd1498Szrj /* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
644*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32(__m64 __m,__m64 __count)645*38fd1498Szrj _mm_sra_pi32 (__m64 __m, __m64 __count)
646*38fd1498Szrj {
647*38fd1498Szrj return (__m64) __builtin_ia32_psrad ((__v2si)__m, (__v2si)__count);
648*38fd1498Szrj }
649*38fd1498Szrj
650*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad(__m64 __m,__m64 __count)651*38fd1498Szrj _m_psrad (__m64 __m, __m64 __count)
652*38fd1498Szrj {
653*38fd1498Szrj return _mm_sra_pi32 (__m, __count);
654*38fd1498Szrj }
655*38fd1498Szrj
656*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32(__m64 __m,int __count)657*38fd1498Szrj _mm_srai_pi32 (__m64 __m, int __count)
658*38fd1498Szrj {
659*38fd1498Szrj return (__m64) __builtin_ia32_psradi ((__v2si)__m, __count);
660*38fd1498Szrj }
661*38fd1498Szrj
662*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi(__m64 __m,int __count)663*38fd1498Szrj _m_psradi (__m64 __m, int __count)
664*38fd1498Szrj {
665*38fd1498Szrj return _mm_srai_pi32 (__m, __count);
666*38fd1498Szrj }
667*38fd1498Szrj
668*38fd1498Szrj /* Shift four 16-bit values in M right by COUNT; shift in zeros. */
669*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16(__m64 __m,__m64 __count)670*38fd1498Szrj _mm_srl_pi16 (__m64 __m, __m64 __count)
671*38fd1498Szrj {
672*38fd1498Szrj return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (__v4hi)__count);
673*38fd1498Szrj }
674*38fd1498Szrj
675*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw(__m64 __m,__m64 __count)676*38fd1498Szrj _m_psrlw (__m64 __m, __m64 __count)
677*38fd1498Szrj {
678*38fd1498Szrj return _mm_srl_pi16 (__m, __count);
679*38fd1498Szrj }
680*38fd1498Szrj
681*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16(__m64 __m,int __count)682*38fd1498Szrj _mm_srli_pi16 (__m64 __m, int __count)
683*38fd1498Szrj {
684*38fd1498Szrj return (__m64) __builtin_ia32_psrlwi ((__v4hi)__m, __count);
685*38fd1498Szrj }
686*38fd1498Szrj
687*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi(__m64 __m,int __count)688*38fd1498Szrj _m_psrlwi (__m64 __m, int __count)
689*38fd1498Szrj {
690*38fd1498Szrj return _mm_srli_pi16 (__m, __count);
691*38fd1498Szrj }
692*38fd1498Szrj
693*38fd1498Szrj /* Shift two 32-bit values in M right by COUNT; shift in zeros. */
694*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32(__m64 __m,__m64 __count)695*38fd1498Szrj _mm_srl_pi32 (__m64 __m, __m64 __count)
696*38fd1498Szrj {
697*38fd1498Szrj return (__m64) __builtin_ia32_psrld ((__v2si)__m, (__v2si)__count);
698*38fd1498Szrj }
699*38fd1498Szrj
700*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld(__m64 __m,__m64 __count)701*38fd1498Szrj _m_psrld (__m64 __m, __m64 __count)
702*38fd1498Szrj {
703*38fd1498Szrj return _mm_srl_pi32 (__m, __count);
704*38fd1498Szrj }
705*38fd1498Szrj
706*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32(__m64 __m,int __count)707*38fd1498Szrj _mm_srli_pi32 (__m64 __m, int __count)
708*38fd1498Szrj {
709*38fd1498Szrj return (__m64) __builtin_ia32_psrldi ((__v2si)__m, __count);
710*38fd1498Szrj }
711*38fd1498Szrj
712*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi(__m64 __m,int __count)713*38fd1498Szrj _m_psrldi (__m64 __m, int __count)
714*38fd1498Szrj {
715*38fd1498Szrj return _mm_srli_pi32 (__m, __count);
716*38fd1498Szrj }
717*38fd1498Szrj
718*38fd1498Szrj /* Shift the 64-bit value in M left by COUNT; shift in zeros. */
719*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64(__m64 __m,__m64 __count)720*38fd1498Szrj _mm_srl_si64 (__m64 __m, __m64 __count)
721*38fd1498Szrj {
722*38fd1498Szrj return (__m64) __builtin_ia32_psrlq ((__v1di)__m, (__v1di)__count);
723*38fd1498Szrj }
724*38fd1498Szrj
725*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq(__m64 __m,__m64 __count)726*38fd1498Szrj _m_psrlq (__m64 __m, __m64 __count)
727*38fd1498Szrj {
728*38fd1498Szrj return _mm_srl_si64 (__m, __count);
729*38fd1498Szrj }
730*38fd1498Szrj
731*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64(__m64 __m,int __count)732*38fd1498Szrj _mm_srli_si64 (__m64 __m, int __count)
733*38fd1498Szrj {
734*38fd1498Szrj return (__m64) __builtin_ia32_psrlqi ((__v1di)__m, __count);
735*38fd1498Szrj }
736*38fd1498Szrj
737*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi(__m64 __m,int __count)738*38fd1498Szrj _m_psrlqi (__m64 __m, int __count)
739*38fd1498Szrj {
740*38fd1498Szrj return _mm_srli_si64 (__m, __count);
741*38fd1498Szrj }
742*38fd1498Szrj
743*38fd1498Szrj /* Bit-wise AND the 64-bit values in M1 and M2. */
744*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64(__m64 __m1,__m64 __m2)745*38fd1498Szrj _mm_and_si64 (__m64 __m1, __m64 __m2)
746*38fd1498Szrj {
747*38fd1498Szrj return __builtin_ia32_pand (__m1, __m2);
748*38fd1498Szrj }
749*38fd1498Szrj
750*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand(__m64 __m1,__m64 __m2)751*38fd1498Szrj _m_pand (__m64 __m1, __m64 __m2)
752*38fd1498Szrj {
753*38fd1498Szrj return _mm_and_si64 (__m1, __m2);
754*38fd1498Szrj }
755*38fd1498Szrj
756*38fd1498Szrj /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
757*38fd1498Szrj 64-bit value in M2. */
758*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64(__m64 __m1,__m64 __m2)759*38fd1498Szrj _mm_andnot_si64 (__m64 __m1, __m64 __m2)
760*38fd1498Szrj {
761*38fd1498Szrj return __builtin_ia32_pandn (__m1, __m2);
762*38fd1498Szrj }
763*38fd1498Szrj
764*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn(__m64 __m1,__m64 __m2)765*38fd1498Szrj _m_pandn (__m64 __m1, __m64 __m2)
766*38fd1498Szrj {
767*38fd1498Szrj return _mm_andnot_si64 (__m1, __m2);
768*38fd1498Szrj }
769*38fd1498Szrj
770*38fd1498Szrj /* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
771*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64(__m64 __m1,__m64 __m2)772*38fd1498Szrj _mm_or_si64 (__m64 __m1, __m64 __m2)
773*38fd1498Szrj {
774*38fd1498Szrj return __builtin_ia32_por (__m1, __m2);
775*38fd1498Szrj }
776*38fd1498Szrj
777*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por(__m64 __m1,__m64 __m2)778*38fd1498Szrj _m_por (__m64 __m1, __m64 __m2)
779*38fd1498Szrj {
780*38fd1498Szrj return _mm_or_si64 (__m1, __m2);
781*38fd1498Szrj }
782*38fd1498Szrj
783*38fd1498Szrj /* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
784*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64(__m64 __m1,__m64 __m2)785*38fd1498Szrj _mm_xor_si64 (__m64 __m1, __m64 __m2)
786*38fd1498Szrj {
787*38fd1498Szrj return __builtin_ia32_pxor (__m1, __m2);
788*38fd1498Szrj }
789*38fd1498Szrj
790*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor(__m64 __m1,__m64 __m2)791*38fd1498Szrj _m_pxor (__m64 __m1, __m64 __m2)
792*38fd1498Szrj {
793*38fd1498Szrj return _mm_xor_si64 (__m1, __m2);
794*38fd1498Szrj }
795*38fd1498Szrj
796*38fd1498Szrj /* Compare eight 8-bit values. The result of the comparison is 0xFF if the
797*38fd1498Szrj test is true and zero if false. */
798*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8(__m64 __m1,__m64 __m2)799*38fd1498Szrj _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
800*38fd1498Szrj {
801*38fd1498Szrj return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
802*38fd1498Szrj }
803*38fd1498Szrj
804*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb(__m64 __m1,__m64 __m2)805*38fd1498Szrj _m_pcmpeqb (__m64 __m1, __m64 __m2)
806*38fd1498Szrj {
807*38fd1498Szrj return _mm_cmpeq_pi8 (__m1, __m2);
808*38fd1498Szrj }
809*38fd1498Szrj
810*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8(__m64 __m1,__m64 __m2)811*38fd1498Szrj _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
812*38fd1498Szrj {
813*38fd1498Szrj return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
814*38fd1498Szrj }
815*38fd1498Szrj
816*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb(__m64 __m1,__m64 __m2)817*38fd1498Szrj _m_pcmpgtb (__m64 __m1, __m64 __m2)
818*38fd1498Szrj {
819*38fd1498Szrj return _mm_cmpgt_pi8 (__m1, __m2);
820*38fd1498Szrj }
821*38fd1498Szrj
822*38fd1498Szrj /* Compare four 16-bit values. The result of the comparison is 0xFFFF if
823*38fd1498Szrj the test is true and zero if false. */
824*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16(__m64 __m1,__m64 __m2)825*38fd1498Szrj _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
826*38fd1498Szrj {
827*38fd1498Szrj return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
828*38fd1498Szrj }
829*38fd1498Szrj
830*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw(__m64 __m1,__m64 __m2)831*38fd1498Szrj _m_pcmpeqw (__m64 __m1, __m64 __m2)
832*38fd1498Szrj {
833*38fd1498Szrj return _mm_cmpeq_pi16 (__m1, __m2);
834*38fd1498Szrj }
835*38fd1498Szrj
836*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16(__m64 __m1,__m64 __m2)837*38fd1498Szrj _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
838*38fd1498Szrj {
839*38fd1498Szrj return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
840*38fd1498Szrj }
841*38fd1498Szrj
842*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw(__m64 __m1,__m64 __m2)843*38fd1498Szrj _m_pcmpgtw (__m64 __m1, __m64 __m2)
844*38fd1498Szrj {
845*38fd1498Szrj return _mm_cmpgt_pi16 (__m1, __m2);
846*38fd1498Szrj }
847*38fd1498Szrj
848*38fd1498Szrj /* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
849*38fd1498Szrj the test is true and zero if false. */
850*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32(__m64 __m1,__m64 __m2)851*38fd1498Szrj _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
852*38fd1498Szrj {
853*38fd1498Szrj return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
854*38fd1498Szrj }
855*38fd1498Szrj
856*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd(__m64 __m1,__m64 __m2)857*38fd1498Szrj _m_pcmpeqd (__m64 __m1, __m64 __m2)
858*38fd1498Szrj {
859*38fd1498Szrj return _mm_cmpeq_pi32 (__m1, __m2);
860*38fd1498Szrj }
861*38fd1498Szrj
862*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32(__m64 __m1,__m64 __m2)863*38fd1498Szrj _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
864*38fd1498Szrj {
865*38fd1498Szrj return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
866*38fd1498Szrj }
867*38fd1498Szrj
868*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd(__m64 __m1,__m64 __m2)869*38fd1498Szrj _m_pcmpgtd (__m64 __m1, __m64 __m2)
870*38fd1498Szrj {
871*38fd1498Szrj return _mm_cmpgt_pi32 (__m1, __m2);
872*38fd1498Szrj }
873*38fd1498Szrj
874*38fd1498Szrj /* Creates a 64-bit zero. */
875*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64(void)876*38fd1498Szrj _mm_setzero_si64 (void)
877*38fd1498Szrj {
878*38fd1498Szrj return (__m64)0LL;
879*38fd1498Szrj }
880*38fd1498Szrj
881*38fd1498Szrj /* Creates a vector of two 32-bit values; I0 is least significant. */
882*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32(int __i1,int __i0)883*38fd1498Szrj _mm_set_pi32 (int __i1, int __i0)
884*38fd1498Szrj {
885*38fd1498Szrj return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
886*38fd1498Szrj }
887*38fd1498Szrj
888*38fd1498Szrj /* Creates a vector of four 16-bit values; W0 is least significant. */
889*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16(short __w3,short __w2,short __w1,short __w0)890*38fd1498Szrj _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
891*38fd1498Szrj {
892*38fd1498Szrj return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
893*38fd1498Szrj }
894*38fd1498Szrj
895*38fd1498Szrj /* Creates a vector of eight 8-bit values; B0 is least significant. */
896*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8(char __b7,char __b6,char __b5,char __b4,char __b3,char __b2,char __b1,char __b0)897*38fd1498Szrj _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
898*38fd1498Szrj char __b3, char __b2, char __b1, char __b0)
899*38fd1498Szrj {
900*38fd1498Szrj return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3,
901*38fd1498Szrj __b4, __b5, __b6, __b7);
902*38fd1498Szrj }
903*38fd1498Szrj
904*38fd1498Szrj /* Similar, but with the arguments in reverse order. */
905*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32(int __i0,int __i1)906*38fd1498Szrj _mm_setr_pi32 (int __i0, int __i1)
907*38fd1498Szrj {
908*38fd1498Szrj return _mm_set_pi32 (__i1, __i0);
909*38fd1498Szrj }
910*38fd1498Szrj
911*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16(short __w0,short __w1,short __w2,short __w3)912*38fd1498Szrj _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
913*38fd1498Szrj {
914*38fd1498Szrj return _mm_set_pi16 (__w3, __w2, __w1, __w0);
915*38fd1498Szrj }
916*38fd1498Szrj
917*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8(char __b0,char __b1,char __b2,char __b3,char __b4,char __b5,char __b6,char __b7)918*38fd1498Szrj _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
919*38fd1498Szrj char __b4, char __b5, char __b6, char __b7)
920*38fd1498Szrj {
921*38fd1498Szrj return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
922*38fd1498Szrj }
923*38fd1498Szrj
924*38fd1498Szrj /* Creates a vector of two 32-bit values, both elements containing I. */
925*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32(int __i)926*38fd1498Szrj _mm_set1_pi32 (int __i)
927*38fd1498Szrj {
928*38fd1498Szrj return _mm_set_pi32 (__i, __i);
929*38fd1498Szrj }
930*38fd1498Szrj
931*38fd1498Szrj /* Creates a vector of four 16-bit values, all elements containing W. */
932*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16(short __w)933*38fd1498Szrj _mm_set1_pi16 (short __w)
934*38fd1498Szrj {
935*38fd1498Szrj return _mm_set_pi16 (__w, __w, __w, __w);
936*38fd1498Szrj }
937*38fd1498Szrj
938*38fd1498Szrj /* Creates a vector of eight 8-bit values, all elements containing B. */
939*38fd1498Szrj extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8(char __b)940*38fd1498Szrj _mm_set1_pi8 (char __b)
941*38fd1498Szrj {
942*38fd1498Szrj return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b);
943*38fd1498Szrj }
944*38fd1498Szrj #ifdef __DISABLE_MMX__
945*38fd1498Szrj #undef __DISABLE_MMX__
946*38fd1498Szrj #pragma GCC pop_options
947*38fd1498Szrj #endif /* __DISABLE_MMX__ */
948*38fd1498Szrj
949*38fd1498Szrj #endif /* _MMINTRIN_H_INCLUDED */
950