// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s
// RUN: %clang_cc1 -x c -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s
// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s


#include <immintrin.h>
#include "builtin_test_helpers.h"

// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll

__m128 test_mm_add_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_add_ps
  // CHECK: fadd <4 x float>
  return _mm_add_ps(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_add_ps((__m128){+1.0f, +0.0f, +2.0f, +4.0f}, (__m128){+8.0f, +4.0f, +2.0f, +1.0f}), +9.0f, +4.0f, +4.0f, +5.0f));

__m128 test_mm_add_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_add_ss
  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
  // CHECK: fadd float
  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
  return _mm_add_ss(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_add_ss((__m128){+1.0f, +0.0f, +2.0f, +4.0f}, (__m128){+8.0f, +4.0f, +2.0f, +1.0f}), +9.0f, +0.0f, +2.0f, +4.0f));

__m128 test_mm_and_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_and_ps
  // CHECK: and <4 x i32>
  return _mm_and_ps(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_and_ps((__m128){-4.0f, -5.0f, +6.0f, +7.0f}, (__m128){+0.0f, -0.0f, -0.0f, +7.0f}), -0.0f, -0.0f, +0.0f, +7.0f));

__m128 test_mm_andnot_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_andnot_ps
  // CHECK: xor <4 x i32> %{{.*}}, splat (i32 -1)
  // CHECK: and <4 x i32>
  return _mm_andnot_ps(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_andnot_ps((__m128){-4.0f, -5.0f, +6.0f, +7.0f}, (__m128){+0.0f, -0.0f, -0.0f, +7.0f}), +0.0f, +0.0f, +0.0f, +0.0f));

__m128 test_mm_cmp_ps_eq_oq(__m128 a, __m128 b) {
  // CHECK-LABEL: test_mm_cmp_ps_eq_oq
  // CHECK: fcmp oeq <4 x float> %{{.*}}, %{{.*}}
  return _mm_cmp_ps(a, b, _CMP_EQ_OQ);
}

__m128 test_mm_cmp_ps_lt_os(__m128 a, __m128 b) {
  // CHECK-LABEL: test_mm_cmp_ps_lt_os
  // CHECK: fcmp olt <4 x float> %{{.*}}, %{{.*}}
  return _mm_cmp_ps(a, b, _CMP_LT_OS);
}

__m128 test_mm_cmp_ps_le_os(__m128 a, __m128 b) {
  // CHECK-LABEL: test_mm_cmp_ps_le_os
  // CHECK: fcmp ole <4 x float> %{{.*}}, %{{.*}}
  return _mm_cmp_ps(a, b, _CMP_LE_OS);
}

__m128 test_mm_cmp_ps_unord_q(__m128 a, __m128 b) {
  // CHECK-LABEL: test_mm_cmp_ps_unord_q
  // CHECK: fcmp uno <4 x float> %{{.*}}, %{{.*}}
  return _mm_cmp_ps(a, b, _CMP_UNORD_Q);
}

__m128 test_mm_cmp_ps_neq_uq(__m128 a, __m128 b) {
  // CHECK-LABEL: test_mm_cmp_ps_neq_uq
  // CHECK: fcmp une <4 x float> %{{.*}}, %{{.*}}
  return _mm_cmp_ps(a, b, _CMP_NEQ_UQ);
}

__m128 test_mm_cmp_ps_nlt_us(__m128 a, __m128 b) {
  // CHECK-LABEL: test_mm_cmp_ps_nlt_us
  // CHECK: fcmp uge <4 x float> %{{.*}}, %{{.*}}
  return _mm_cmp_ps(a, b, _CMP_NLT_US);
}

__m128 test_mm_cmp_ps_nle_us(__m128 a, __m128 b) {
  // CHECK-LABEL: test_mm_cmp_ps_nle_us
  // CHECK: fcmp ugt <4 x float> %{{.*}}, %{{.*}}
  return _mm_cmp_ps(a, b, _CMP_NLE_US);
}

__m128 test_mm_cmp_ps_ord_q(__m128 a, __m128 b) {
  // CHECK-LABEL: test_mm_cmp_ps_ord_q
  // CHECK: fcmp ord <4 x float> %{{.*}}, %{{.*}}
  return _mm_cmp_ps(a, b, _CMP_ORD_Q);
}

__m128 test_mm_cmp_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_cmp_ss
  // CHECK: call {{.*}}<4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 7)
  return _mm_cmp_ss(A, B, _CMP_ORD_Q);
}

__m128 test_mm_cmpeq_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpeq_ps
  // CHECK:         [[CMP:%.*]] = fcmp oeq <4 x float>
  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
  // CHECK-NEXT:    ret <4 x float> [[BC]]
  return _mm_cmpeq_ps(__a, __b);
}

__m128 test_mm_cmpeq_ss(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpeq_ss
  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 0)
  return _mm_cmpeq_ss(__a, __b);
}

__m128 test_mm_cmpge_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpge_ps
  // CHECK:         [[CMP:%.*]] = fcmp ole <4 x float>
  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
  // CHECK-NEXT:    ret <4 x float> [[BC]]
  return _mm_cmpge_ps(__a, __b);
}

__m128 test_mm_cmpge_ss(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpge_ss
  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  return _mm_cmpge_ss(__a, __b);
}

__m128 test_mm_cmpgt_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpgt_ps
  // CHECK:         [[CMP:%.*]] = fcmp olt <4 x float>
  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
  // CHECK-NEXT:    ret <4 x float> [[BC]]
  return _mm_cmpgt_ps(__a, __b);
}

__m128 test_mm_cmpgt_ss(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpgt_ss
  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 1)
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  return _mm_cmpgt_ss(__a, __b);
}

__m128 test_mm_cmple_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmple_ps
  // CHECK:         [[CMP:%.*]] = fcmp ole <4 x float>
  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
  // CHECK-NEXT:    ret <4 x float> [[BC]]
  return _mm_cmple_ps(__a, __b);
}

__m128 test_mm_cmple_ss(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmple_ss
  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
  return _mm_cmple_ss(__a, __b);
}

__m128 test_mm_cmplt_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmplt_ps
  // CHECK:         [[CMP:%.*]] = fcmp olt <4 x float>
  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
  // CHECK-NEXT:    ret <4 x float> [[BC]]
  return _mm_cmplt_ps(__a, __b);
}

__m128 test_mm_cmplt_ss(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmplt_ss
  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 1)
  return _mm_cmplt_ss(__a, __b);
}

__m128 test_mm_cmpneq_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpneq_ps
  // CHECK:         [[CMP:%.*]] = fcmp une <4 x float>
  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
  // CHECK-NEXT:    ret <4 x float> [[BC]]
  return _mm_cmpneq_ps(__a, __b);
}

__m128 test_mm_cmpneq_ss(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpneq_ss
  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 4)
  return _mm_cmpneq_ss(__a, __b);
}

__m128 test_mm_cmpnge_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpnge_ps
  // CHECK:         [[CMP:%.*]] = fcmp ugt <4 x float>
  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
  // CHECK-NEXT:    ret <4 x float> [[BC]]
  return _mm_cmpnge_ps(__a, __b);
}

__m128 test_mm_cmpnge_ss(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpnge_ss
  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 6)
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  return _mm_cmpnge_ss(__a, __b);
}

__m128 test_mm_cmpngt_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpngt_ps
  // CHECK:         [[CMP:%.*]] = fcmp uge <4 x float>
  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
  // CHECK-NEXT:    ret <4 x float> [[BC]]
  return _mm_cmpngt_ps(__a, __b);
}

__m128 test_mm_cmpngt_ss(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpngt_ss
  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 5)
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  return _mm_cmpngt_ss(__a, __b);
}

__m128 test_mm_cmpnle_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpnle_ps
  // CHECK:         [[CMP:%.*]] = fcmp ugt <4 x float>
  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
  // CHECK-NEXT:    ret <4 x float> [[BC]]
  return _mm_cmpnle_ps(__a, __b);
}

__m128 test_mm_cmpnle_ss(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpnle_ss
  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 6)
  return _mm_cmpnle_ss(__a, __b);
}

__m128 test_mm_cmpnlt_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpnlt_ps
  // CHECK:         [[CMP:%.*]] = fcmp uge <4 x float>
  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
  // CHECK-NEXT:    ret <4 x float> [[BC]]
  return _mm_cmpnlt_ps(__a, __b);
}

__m128 test_mm_cmpnlt_ss(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpnlt_ss
  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 5)
  return _mm_cmpnlt_ss(__a, __b);
}

__m128 test_mm_cmpord_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpord_ps
  // CHECK:         [[CMP:%.*]] = fcmp ord <4 x float>
  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
  // CHECK-NEXT:    ret <4 x float> [[BC]]
  return _mm_cmpord_ps(__a, __b);
}

__m128 test_mm_cmpord_ss(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpord_ss
  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 7)
  return _mm_cmpord_ss(__a, __b);
}

__m128 test_mm_cmpunord_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpunord_ps
  // CHECK:         [[CMP:%.*]] = fcmp uno <4 x float>
  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
  // CHECK-NEXT:    ret <4 x float> [[BC]]
  return _mm_cmpunord_ps(__a, __b);
}

__m128 test_mm_cmpunord_ss(__m128 __a, __m128 __b) {
  // CHECK-LABEL: test_mm_cmpunord_ss
  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 3)
  return _mm_cmpunord_ss(__a, __b);
}

int test_mm_comieq_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_comieq_ss
  // CHECK: call {{.*}}i32 @llvm.x86.sse.comieq.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_comieq_ss(A, B);
}

int test_mm_comige_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_comige_ss
  // CHECK: call {{.*}}i32 @llvm.x86.sse.comige.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_comige_ss(A, B);
}

int test_mm_comigt_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_comigt_ss
  // CHECK: call {{.*}}i32 @llvm.x86.sse.comigt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_comigt_ss(A, B);
}

int test_mm_comile_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_comile_ss
  // CHECK: call {{.*}}i32 @llvm.x86.sse.comile.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_comile_ss(A, B);
}

int test_mm_comilt_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_comilt_ss
  // CHECK: call {{.*}}i32 @llvm.x86.sse.comilt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_comilt_ss(A, B);
}

int test_mm_comineq_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_comineq_ss
  // CHECK: call {{.*}}i32 @llvm.x86.sse.comineq.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_comineq_ss(A, B);
}

int test_mm_cvt_ss2si(__m128 A) {
  // CHECK-LABEL: test_mm_cvt_ss2si
  // CHECK: call {{.*}}i32 @llvm.x86.sse.cvtss2si(<4 x float> %{{.*}})
  return _mm_cvt_ss2si(A);
}

__m128 test_mm_cvtsi32_ss(__m128 A, int B) {
  // CHECK-LABEL: test_mm_cvtsi32_ss
  // CHECK: sitofp i32 %{{.*}} to float
  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
  return _mm_cvtsi32_ss(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_cvtsi32_ss((__m128){+1.0f, +0.0f, +2.0f, +4.0f}, 42), +42.0f, +0.0f, +2.0f, +4.0f));

__m128 test_mm_cvt_si2ss(__m128 A, int B) {
  // CHECK-LABEL: test_mm_cvt_si2ss
  // CHECK: sitofp i32 %{{.*}} to float
  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
  return _mm_cvt_si2ss(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_cvt_si2ss((__m128){+4.0f, +2.0f, +0.0f, +4.0f}, -99), -99.0f, +2.0f, +0.0f, +4.0f));

#ifdef __x86_64__
__m128 test_mm_cvtsi64_ss(__m128 A, long long B) {
  // CHECK-LABEL: test_mm_cvtsi64_ss
  // CHECK: sitofp i64 %{{.*}} to float
  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
  return _mm_cvtsi64_ss(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_cvtsi64_ss((__m128){+1.0f, +0.0f, +2.0f, +4.0f}, 555), +555.0f, +0.0f, +2.0f, +4.0f));
#endif

float test_mm_cvtss_f32(__m128 A) {
  // CHECK-LABEL: test_mm_cvtss_f32
  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
  return _mm_cvtss_f32(A);
}
TEST_CONSTEXPR(_mm_cvtss_f32((__m128){+8.0f, +4.0f, +2.0f, +1.0f}) == +8.0f);

int test_mm_cvtss_si32(__m128 A) {
  // CHECK-LABEL: test_mm_cvtss_si32
  // CHECK: call {{.*}}i32 @llvm.x86.sse.cvtss2si(<4 x float> %{{.*}})
  return _mm_cvtss_si32(A);
}

#ifdef __x86_64__
long long test_mm_cvtss_si64(__m128 A) {
  // CHECK-LABEL: test_mm_cvtss_si64
  // CHECK: call {{.*}}i64 @llvm.x86.sse.cvtss2si64(<4 x float> %{{.*}})
  return _mm_cvtss_si64(A);
}
#endif

int test_mm_cvtt_ss2si(__m128 A) {
  // CHECK-LABEL: test_mm_cvtt_ss2si
  // CHECK: call {{.*}}i32 @llvm.x86.sse.cvttss2si(<4 x float> %{{.*}})
  return _mm_cvtt_ss2si(A);
}

int test_mm_cvttss_si32(__m128 A) {
  // CHECK-LABEL: test_mm_cvttss_si32
  // CHECK: call {{.*}}i32 @llvm.x86.sse.cvttss2si(<4 x float> %{{.*}})
  return _mm_cvttss_si32(A);
}

#ifdef __x86_64__
long long test_mm_cvttss_si64(__m128 A) {
  // CHECK-LABEL: test_mm_cvttss_si64
  // CHECK: call {{.*}}i64 @llvm.x86.sse.cvttss2si64(<4 x float> %{{.*}})
  return _mm_cvttss_si64(A);
}
#endif

__m128 test_mm_div_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_div_ps
  // CHECK: fdiv <4 x float>
  return _mm_div_ps(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_div_ps((__m128){+1.0f, +0.0f, +2.0f, +4.0f}, (__m128){+8.0f, +4.0f, +2.0f, +1.0f}), +0.125f, +0.0f, +1.0f, +4.0f));

__m128 test_mm_div_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_div_ss
  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
  // CHECK: fdiv float
  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
  return _mm_div_ss(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_div_ss((__m128){+1.0f, +5.0f, +2.0f, +4.0f}, (__m128){+8.0f, +4.0f, +2.0f, +1.0f}), +0.125f, +5.0f, +2.0f, +4.0f));

unsigned int test_MM_GET_EXCEPTION_MASK(void) {
  // CHECK-LABEL: test_MM_GET_EXCEPTION_MASK
  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr %{{.*}})
  // CHECK: and i32 %{{.*}}, 8064
  return _MM_GET_EXCEPTION_MASK();
}

unsigned int test_MM_GET_EXCEPTION_STATE(void) {
  // CHECK-LABEL: test_MM_GET_EXCEPTION_STATE
  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr %{{.*}})
  // CHECK: and i32 %{{.*}}, 63
  return _MM_GET_EXCEPTION_STATE();
}

unsigned int test_MM_GET_FLUSH_ZERO_MODE(void) {
  // CHECK-LABEL: test_MM_GET_FLUSH_ZERO_MODE
  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr %{{.*}})
  // CHECK: and i32 %{{.*}}, 32768
  return _MM_GET_FLUSH_ZERO_MODE();
}

unsigned int test_MM_GET_ROUNDING_MODE(void) {
  // CHECK-LABEL: test_MM_GET_ROUNDING_MODE
  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr %{{.*}})
  // CHECK: and i32 %{{.*}}, 24576
  return _MM_GET_ROUNDING_MODE();
}

unsigned int test_mm_getcsr(void) {
  // CHECK-LABEL: test_mm_getcsr
  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr %{{.*}})
  // CHECK: load i32
  return _mm_getcsr();
}

__m128 test_mm_load_ps(float* y) {
  // CHECK-LABEL: test_mm_load_ps
  // CHECK: load <4 x float>, ptr {{.*}}, align 16
  return _mm_load_ps(y);
}

__m128 test_mm_load_ps1(float* y) {
  // CHECK-LABEL: test_mm_load_ps1
  // CHECK: load float, ptr %{{.*}}, align 4
  // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0
  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 1
  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 2
  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 3
  return _mm_load_ps1(y);
}

__m128 test_mm_load_ss(float* y) {
  // CHECK-LABEL: test_mm_load_ss
  // CHECK: load float, ptr {{.*}}, align 1{{$}}
  // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0
  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1
  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2
  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3
  return _mm_load_ss(y);
}

__m128 test_mm_load1_ps(float* y) {
  // CHECK-LABEL: test_mm_load1_ps
  // CHECK: load float, ptr %{{.*}}, align 4
  // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0
  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 1
  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 2
  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 3
  return _mm_load1_ps(y);
}

__m128 test_mm_loadh_pi(__m128 x, __m64* y) {
  // CHECK-LABEL: test_mm_loadh_pi
  // CHECK: load <2 x float>, ptr {{.*}}, align 1{{$}}
  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1
  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  return _mm_loadh_pi(x,y);
}

__m128 test_mm_loadl_pi(__m128 x, __m64* y) {
  // CHECK-LABEL: test_mm_loadl_pi
  // CHECK: load <2 x float>, ptr {{.*}}, align 1{{$}}
  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1
  // CHECK: shufflevector {{.*}} <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  return _mm_loadl_pi(x,y);
}

__m128 test_mm_loadr_ps(float* A) {
  // CHECK-LABEL: test_mm_loadr_ps
  // CHECK: load <4 x float>, ptr %{{.*}}, align 16
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  return _mm_loadr_ps(A);
}

__m128 test_mm_loadu_ps(float* A) {
  // CHECK-LABEL: test_mm_loadu_ps
  // CHECK: load <4 x float>, ptr %{{.*}}, align 1{{$}}
  return _mm_loadu_ps(A);
}

__m128 test_mm_max_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_max_ps
  // CHECK: @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_max_ps(A, B);
}

__m128 test_mm_max_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_max_ss
  // CHECK: @llvm.x86.sse.max.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_max_ss(A, B);
}

__m128 test_mm_min_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_min_ps
  // CHECK: @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_min_ps(A, B);
}

__m128 test_mm_min_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_min_ss
  // CHECK: @llvm.x86.sse.min.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_min_ss(A, B);
}

__m128 test_mm_move_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_move_ss
  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
  return _mm_move_ss(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_move_ss((__m128){+1.0f, +0.0f, +2.0f, +4.0f}, (__m128){+8.0f, +4.0f, +2.0f, +1.0f}), +8.0f, +0.0f, +2.0f, +4.0f));

__m128 test_mm_movehl_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_movehl_ps
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  return _mm_movehl_ps(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_movehl_ps((__m128){+1.0f, +0.0f, +2.0f, +4.0f}, (__m128){+8.0f, +4.0f, +2.0f, +1.0f}), +2.0f, +1.0f, +2.0f, +4.0f));

__m128 test_mm_movelh_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_movelh_ps
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  return _mm_movelh_ps(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_movelh_ps((__m128){+1.0f, +0.0f, +2.0f, +4.0f}, (__m128){+8.0f, +4.0f, +2.0f, +1.0f}), +1.0f, +0.0f, +8.0f, +4.0f));

int test_mm_movemask_ps(__m128 A) {
  // CHECK-LABEL: test_mm_movemask_ps
  // CHECK: call {{.*}}i32 @llvm.x86.sse.movmsk.ps(<4 x float> %{{.*}})
  return _mm_movemask_ps(A);
}

__m128 test_mm_mul_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_mul_ps
  // CHECK: fmul <4 x float>
  return _mm_mul_ps(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_mul_ps((__m128){+1.0f, +0.0f, +2.0f, +4.0f}, (__m128){+8.0f, +4.0f, +2.0f, +1.0f}), +8.0f, +0.0f, +4.0f, +4.0f));

__m128 test_mm_mul_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_mul_ss
  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
  // CHECK: fmul float
  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
  return _mm_mul_ss(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_mul_ss((__m128){+1.0f, +0.0f, +2.0f, +4.0f}, (__m128){+8.0f, +4.0f, +2.0f, +1.0f}), +8.0f, +0.0f, +2.0f, +4.0f));

__m128 test_mm_or_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_or_ps
  // CHECK: or <4 x i32>
  return _mm_or_ps(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_or_ps((__m128){-4.0f, -5.0f, +6.0f, +7.0f}, (__m128){+0.0f, -0.0f, -0.0f, +7.0f}), -4.0f, -5.0f, -6.0f, +7.0f));

void test_mm_prefetch(char const* p) {
  // CHECK-LABEL: test_mm_prefetch
  // CHECK: call void @llvm.prefetch.p0(ptr {{.*}}, i32 0, i32 0, i32 1)
  _mm_prefetch(p, 0);
}

__m128 test_mm_rcp_ps(__m128 x) {
  // CHECK-LABEL: test_mm_rcp_ps
  // CHECK: call {{.*}}<4 x float> @llvm.x86.sse.rcp.ps(<4 x float> {{.*}})
  return _mm_rcp_ps(x);
}

__m128 test_mm_rcp_ss(__m128 x) {
  // CHECK-LABEL: test_mm_rcp_ss
  // CHECK: call {{.*}}<4 x float> @llvm.x86.sse.rcp.ss(<4 x float> {{.*}})
  return _mm_rcp_ss(x);
}

__m128 test_mm_rsqrt_ps(__m128 x) {
  // CHECK-LABEL: test_mm_rsqrt_ps
  // CHECK: call {{.*}}<4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> {{.*}})
  return _mm_rsqrt_ps(x);
}

__m128 test_mm_rsqrt_ss(__m128 x) {
  // CHECK-LABEL: test_mm_rsqrt_ss
  // CHECK: call {{.*}}<4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> {{.*}})
  return _mm_rsqrt_ss(x);
}

void test_MM_SET_EXCEPTION_MASK(unsigned int A) {
  // CHECK-LABEL: test_MM_SET_EXCEPTION_MASK
  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr {{.*}})
  // CHECK: load i32
  // CHECK: and i32 {{.*}}, -8065
  // CHECK: or i32
  // CHECK: store i32
  // CHECK: call void @llvm.x86.sse.ldmxcsr(ptr {{.*}})
  _MM_SET_EXCEPTION_MASK(A);
}

void test_MM_SET_EXCEPTION_STATE(unsigned int A) {
  // CHECK-LABEL: test_MM_SET_EXCEPTION_STATE
  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr {{.*}})
  // CHECK: load i32
  // CHECK: and i32 {{.*}}, -64
  // CHECK: or i32
  // CHECK: store i32
  // CHECK: call void @llvm.x86.sse.ldmxcsr(ptr {{.*}})
  _MM_SET_EXCEPTION_STATE(A);
}

void test_MM_SET_FLUSH_ZERO_MODE(unsigned int A) {
  // CHECK-LABEL: test_MM_SET_FLUSH_ZERO_MODE
  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr {{.*}})
  // CHECK: load i32
  // CHECK: and i32 {{.*}}, -32769
  // CHECK: or i32
  // CHECK: store i32
  // CHECK: call void @llvm.x86.sse.ldmxcsr(ptr {{.*}})
  _MM_SET_FLUSH_ZERO_MODE(A);
}

__m128 test_mm_set_ps(float A, float B, float C, float D) {
  // CHECK-LABEL: test_mm_set_ps
  // CHECK: insertelement <4 x float> poison, float {{.*}}, i32 0
  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
  return _mm_set_ps(A, B, C, D);
}
TEST_CONSTEXPR(match_m128(_mm_set_ps(+0.0f, +1.0f, +2.0f, +3.0f), +3.0f, +2.0f, +1.0f, +0.0f));

__m128 test_mm_set_ps1(float A) {
  // CHECK-LABEL: test_mm_set_ps1
  // CHECK: insertelement <4 x float> poison, float {{.*}}, i32 0
  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
  return _mm_set_ps1(A);
}
TEST_CONSTEXPR(match_m128(_mm_set_ps1(-2.0f), -2.0f, -2.0f, -2.0f, -2.0f));

void test_MM_SET_ROUNDING_MODE(unsigned int A) {
  // CHECK-LABEL: test_MM_SET_ROUNDING_MODE
  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr {{.*}})
  // CHECK: load i32
  // CHECK: and i32 {{.*}}, -24577
  // CHECK: or i32
  // CHECK: store i32
  // CHECK: call void @llvm.x86.sse.ldmxcsr(ptr {{.*}})
  _MM_SET_ROUNDING_MODE(A);
}

__m128 test_mm_set_ss(float A) {
  // CHECK-LABEL: test_mm_set_ss
  // CHECK: insertelement <4 x float> poison, float {{.*}}, i32 0
  // CHECK: insertelement <4 x float> {{.*}}, float 0.000000e+00, i32 1
  // CHECK: insertelement <4 x float> {{.*}}, float 0.000000e+00, i32 2
  // CHECK: insertelement <4 x float> {{.*}}, float 0.000000e+00, i32 3
  return _mm_set_ss(A);
}
TEST_CONSTEXPR(match_m128(_mm_set_ss(1.0f), +1.0f, +0.0f, +0.0f, +0.0f));

__m128 test_mm_set1_ps(float A) {
  // CHECK-LABEL: test_mm_set1_ps
  // CHECK: insertelement <4 x float> poison, float {{.*}}, i32 0
  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
  return _mm_set1_ps(A);
}
TEST_CONSTEXPR(match_m128(_mm_set1_ps(2.0f), +2.0f, +2.0f, +2.0f, +2.0f));

void test_mm_setcsr(unsigned int A) {
  // CHECK-LABEL: test_mm_setcsr
  // CHECK: store i32
  // CHECK: call void @llvm.x86.sse.ldmxcsr(ptr {{.*}})
  _mm_setcsr(A);
}

__m128 test_mm_setr_ps(float A, float B, float C, float D) {
  // CHECK-LABEL: test_mm_setr_ps
  // CHECK: insertelement <4 x float> poison, float {{.*}}, i32 0
  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
  return _mm_setr_ps(A, B, C, D);
}
TEST_CONSTEXPR(match_m128(_mm_setr_ps(+0.0f, +1.0f, +2.0f, +3.0f), +0.0f, +1.0f, +2.0f, +3.0f));

__m128 test_mm_setzero_ps(void) {
  // CHECK-LABEL: test_mm_setzero_ps
  // CHECK: store <4 x float> zeroinitializer
  return _mm_setzero_ps();
}
TEST_CONSTEXPR(match_m128(_mm_setzero_ps(), +0.0f, +0.0f, +0.0f, +0.0f));

void test_mm_sfence(void) {
  // CHECK-LABEL: test_mm_sfence
  // CHECK: call void @llvm.x86.sse.sfence()
  _mm_sfence();
}

__m128 test_mm_shuffle_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_shuffle_ps
  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
  return _mm_shuffle_ps(A, B, 0);
}

__m128 test_mm_sqrt_ps(__m128 x) {
  // CHECK-LABEL: test_mm_sqrt_ps
  // CHECK: call {{.*}}<4 x float> @llvm.sqrt.v4f32(<4 x float> {{.*}})
  return _mm_sqrt_ps(x);
}

__m128 test_mm_sqrt_ss(__m128 x) {
  // CHECK-LABEL: test_mm_sqrt_ss
  // CHECK: extractelement <4 x float> {{.*}}, i64 0
  // CHECK: call float @llvm.sqrt.f32(float {{.*}})
  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i64 0
  return _mm_sqrt_ss(x);
}

void test_mm_store_ps(float* x, __m128 y) {
  // CHECK-LABEL: test_mm_store_ps
  // CHECK: store <4 x float> %{{.*}}, ptr {{.*}}, align 16
  _mm_store_ps(x, y);
}

void test_mm_store_ps1(float* x, __m128 y) {
  // CHECK-LABEL: test_mm_store_ps1
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
  // CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 16
  _mm_store_ps1(x, y);
}

void test_mm_store_ss(float* x, __m128 y) {
  // CHECK-LABEL: test_mm_store_ss
  // CHECK: extractelement <4 x float> {{.*}}, i32 0
  // CHECK: store float %{{.*}}, ptr {{.*}}, align 1{{$}}
  _mm_store_ss(x, y);
}

void test_mm_store1_ps(float* x, __m128 y) {
  // CHECK-LABEL: test_mm_store1_ps
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
  // CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 16
  _mm_store1_ps(x, y);
}

void test_mm_storeh_pi(__m64* x,  __m128 y) {
  // CHECK-LABEL: test_mm_storeh_pi
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <2 x i32> <i32 2, i32 3>
  // CHECK: store <2 x float> %{{.*}}, ptr %{{.*}}, align 1{{$}}
  _mm_storeh_pi(x, y);
}

void test_mm_storel_pi(__m64* x,  __m128 y) {
  // CHECK-LABEL: test_mm_storel_pi
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <2 x i32> <i32 0, i32 1>
  // CHECK: store <2 x float> %{{.*}}, ptr %{{.*}}, align 1{{$}}
  _mm_storel_pi(x, y);
}

void test_mm_storer_ps(float* x,  __m128 y) {
  // CHECK-LABEL: test_mm_storer_ps
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  // CHECK: store <4 x float> %{{.*}}, ptr {{.*}}, align 16
  _mm_storer_ps(x, y);
}

void test_mm_storeu_ps(float* x,  __m128 y) {
  // CHECK-LABEL: test_mm_storeu_ps
  // CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 1{{$}}
  // CHECK-NEXT: ret void
  _mm_storeu_ps(x, y);
}

void test_mm_stream_ps(float*A, __m128 B) {
  // CHECK-LABEL: test_mm_stream_ps
  // CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
  _mm_stream_ps(A, B);
}

void test_mm_stream_ps_void(void *A, __m128 B) {
  // CHECK-LABEL: test_mm_stream_ps_void
  // CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
  _mm_stream_ps(A, B);
}

__m128 test_mm_sub_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_sub_ps
  // CHECK: fsub <4 x float>
  return _mm_sub_ps(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_sub_ps((__m128){+1.0f, +0.0f, +2.0f, +4.0f}, (__m128){+8.0f, +4.0f, +2.0f, +1.0f}), -7.0f, -4.0f, +0.0f, +3.0f));

__m128 test_mm_sub_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_sub_ss
  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
  // CHECK: fsub float
  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
  return _mm_sub_ss(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_sub_ss((__m128){+1.0f, +0.0f, +2.0f, +4.0f}, (__m128){+8.0f, +4.0f, +2.0f, +1.0f}), -7.0f, +0.0f, +2.0f, +4.0f));

void test_MM_TRANSPOSE4_PS(__m128 *A, __m128 *B, __m128 *C, __m128 *D) {
  // CHECK-LABEL: test_MM_TRANSPOSE4_PS
  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  _MM_TRANSPOSE4_PS(*A, *B, *C, *D);
}

int test_mm_ucomieq_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_ucomieq_ss
  // CHECK: call {{.*}}i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_ucomieq_ss(A, B);
}

int test_mm_ucomige_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_ucomige_ss
  // CHECK: call {{.*}}i32 @llvm.x86.sse.ucomige.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_ucomige_ss(A, B);
}

int test_mm_ucomigt_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_ucomigt_ss
  // CHECK: call {{.*}}i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_ucomigt_ss(A, B);
}

int test_mm_ucomile_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_ucomile_ss
  // CHECK: call {{.*}}i32 @llvm.x86.sse.ucomile.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_ucomile_ss(A, B);
}

int test_mm_ucomilt_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_ucomilt_ss
  // CHECK: call {{.*}}i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_ucomilt_ss(A, B);
}

int test_mm_ucomineq_ss(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_ucomineq_ss
  // CHECK: call {{.*}}i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  return _mm_ucomineq_ss(A, B);
}

__m128 test_mm_undefined_ps(void) {
  // CHECK-LABEL: test_mm_undefined_ps
  // CHECK: ret <4 x float> zeroinitializer
  return _mm_undefined_ps();
}

__m128 test_mm_unpackhi_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_unpackhi_ps
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  return _mm_unpackhi_ps(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_unpackhi_ps((__m128){+1.0f, +0.0f, +2.0f, +4.0f}, (__m128){+8.0f, +4.0f, +2.0f, +1.0f}), +2.0f, +2.0f, +4.0f, +1.0f));

__m128 test_mm_unpacklo_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_unpacklo_ps
  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  return _mm_unpacklo_ps(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_unpacklo_ps((__m128){+1.0f, +0.0f, +2.0f, +4.0f}, (__m128){+8.0f, +4.0f, +2.0f, +1.0f}), +1.0f, +8.0f, +0.0f, +4.0f));

__m128 test_mm_xor_ps(__m128 A, __m128 B) {
  // CHECK-LABEL: test_mm_xor_ps
  // CHECK: xor <4 x i32>
  return _mm_xor_ps(A, B);
}
TEST_CONSTEXPR(match_m128(_mm_xor_ps((__m128){-4.0f, -5.0f, +6.0f, +7.0f}, (__m128){+0.0f, -0.0f, -0.0f, +7.0f}), -4.0f, +5.0f, -6.0f, +0.0f));
907