1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X64
3; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X86
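; This file exercises AVX512-FP16 move patterns: vpbroadcastw broadcasts of
; half elements, vmovw/vmovsh scalar moves, half<->i16 bitcasts, aligned,
; unaligned and masked vector loads/stores, register moves with blend/zeroing
; masks, and element extracts/inserts for 128/256/512-bit vectors on both
; x86-64 and i686. The CHECK lines can be regenerated by rerunning
; update_llc_test_checks.py on this file after a codegen change.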
4
5define <8 x half> @broadcastph128(ptr %x) {
6; X64-LABEL: broadcastph128:
7; X64:       # %bb.0:
8; X64-NEXT:    vpbroadcastw (%rdi), %xmm0
9; X64-NEXT:    retq
10;
11; X86-LABEL: broadcastph128:
12; X86:       # %bb.0:
13; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
14; X86-NEXT:    vpbroadcastw (%eax), %xmm0
15; X86-NEXT:    retl
16  %l1 = load half, ptr %x, align 2
17  %vec = insertelement <8 x half> undef, half %l1, i32 0
18  %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer
19  ret <8 x half> %res
20}
21
22define <16 x half> @broadcastph256(ptr %x) {
23; X64-LABEL: broadcastph256:
24; X64:       # %bb.0:
25; X64-NEXT:    vpbroadcastw (%rdi), %ymm0
26; X64-NEXT:    retq
27;
28; X86-LABEL: broadcastph256:
29; X86:       # %bb.0:
30; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
31; X86-NEXT:    vpbroadcastw (%eax), %ymm0
32; X86-NEXT:    retl
33  %l1 = load half, ptr %x, align 2
34  %vec = insertelement <16 x half> undef, half %l1, i32 0
35  %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer
36  ret <16 x half> %res
37}
38
39define <32 x half> @broadcastph512(ptr %x) {
40; X64-LABEL: broadcastph512:
41; X64:       # %bb.0:
42; X64-NEXT:    vpbroadcastw (%rdi), %zmm0
43; X64-NEXT:    retq
44;
45; X86-LABEL: broadcastph512:
46; X86:       # %bb.0:
47; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
48; X86-NEXT:    vpbroadcastw (%eax), %zmm0
49; X86-NEXT:    retl
50  %l1 = load half, ptr %x, align 2
51  %vec = insertelement <32 x half> undef, half %l1, i32 0
52  %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer
53  ret <32 x half> %res
54}
55
56define <8 x half> @broadcastph128_scalar(half %x) {
57; X64-LABEL: broadcastph128_scalar:
58; X64:       # %bb.0:
59; X64-NEXT:    vpbroadcastw %xmm0, %xmm0
60; X64-NEXT:    retq
61;
62; X86-LABEL: broadcastph128_scalar:
63; X86:       # %bb.0:
64; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %xmm0
65; X86-NEXT:    retl
66  %vec = insertelement <8 x half> undef, half %x, i32 0
67  %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer
68  ret <8 x half> %res
69}
70
71define <16 x half> @broadcastph256_scalar(half %x) {
72; X64-LABEL: broadcastph256_scalar:
73; X64:       # %bb.0:
74; X64-NEXT:    vpbroadcastw %xmm0, %ymm0
75; X64-NEXT:    retq
76;
77; X86-LABEL: broadcastph256_scalar:
78; X86:       # %bb.0:
79; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %ymm0
80; X86-NEXT:    retl
81  %vec = insertelement <16 x half> undef, half %x, i32 0
82  %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer
83  ret <16 x half> %res
84}
85
86define <32 x half> @broadcastph512_scalar(half %x) {
87; X64-LABEL: broadcastph512_scalar:
88; X64:       # %bb.0:
89; X64-NEXT:    vpbroadcastw %xmm0, %zmm0
90; X64-NEXT:    retq
91;
92; X86-LABEL: broadcastph512_scalar:
93; X86:       # %bb.0:
94; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %zmm0
95; X86-NEXT:    retl
96  %vec = insertelement <32 x half> undef, half %x, i32 0
97  %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer
98  ret <32 x half> %res
99}
100
101define <8 x half> @broadcastph128_reg(<8 x half> %x) {
102; CHECK-LABEL: broadcastph128_reg:
103; CHECK:       # %bb.0:
104; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
105; CHECK-NEXT:    ret{{[l|q]}}
106  %res = shufflevector <8 x half> %x, <8 x half> undef, <8 x i32> zeroinitializer
107  ret <8 x half> %res
108}
109
110define <16 x half> @broadcastph256_reg(<16 x half> %x) {
111; CHECK-LABEL: broadcastph256_reg:
112; CHECK:       # %bb.0:
113; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
114; CHECK-NEXT:    ret{{[l|q]}}
115  %res = shufflevector <16 x half> %x, <16 x half> undef, <16 x i32> zeroinitializer
116  ret <16 x half> %res
117}
118
119define <32 x half> @broadcastph512_reg(<32 x half> %x) {
120; CHECK-LABEL: broadcastph512_reg:
121; CHECK:       # %bb.0:
122; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0
123; CHECK-NEXT:    ret{{[l|q]}}
124  %res = shufflevector <32 x half> %x, <32 x half> undef, <32 x i32> zeroinitializer
125  ret <32 x half> %res
126}
127
128define i16 @test1(half %x) {
129; X64-LABEL: test1:
130; X64:       # %bb.0:
131; X64-NEXT:    vmovw %xmm0, %eax
132; X64-NEXT:    # kill: def $ax killed $ax killed $eax
133; X64-NEXT:    retq
134;
135; X86-LABEL: test1:
136; X86:       # %bb.0:
137; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
138; X86-NEXT:    retl
139   %res = bitcast half %x to i16
140   ret i16 %res
141}
142
143define <8 x i16> @test2(i16 %x) {
144; X64-LABEL: test2:
145; X64:       # %bb.0:
146; X64-NEXT:    vmovw %edi, %xmm0
147; X64-NEXT:    retq
148;
149; X86-LABEL: test2:
150; X86:       # %bb.0:
151; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %xmm0
152; X86-NEXT:    retl
153   %res = insertelement <8 x i16>undef, i16 %x, i32 0
154   ret <8 x i16>%res
155}
156
157define <8 x i16> @test4(ptr %x) {
158; X64-LABEL: test4:
159; X64:       # %bb.0:
160; X64-NEXT:    vpbroadcastw (%rdi), %xmm0
161; X64-NEXT:    retq
162;
163; X86-LABEL: test4:
164; X86:       # %bb.0:
165; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
166; X86-NEXT:    vpbroadcastw (%eax), %xmm0
167; X86-NEXT:    retl
168   %y = load i16, ptr %x
169   %res = insertelement <8 x i16>undef, i16 %y, i32 0
170   ret <8 x i16>%res
171}
172
173define void @test5(half %x, ptr %y) {
174; X64-LABEL: test5:
175; X64:       # %bb.0:
176; X64-NEXT:    vmovsh %xmm0, (%rdi)
177; X64-NEXT:    retq
178;
179; X86-LABEL: test5:
180; X86:       # %bb.0:
181; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
182; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
183; X86-NEXT:    vmovsh %xmm0, (%eax)
184; X86-NEXT:    retl
185   store half %x, ptr %y, align 2
186   ret void
187}
188
189define half @test7(ptr %x) {
190; X64-LABEL: test7:
191; X64:       # %bb.0:
192; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
193; X64-NEXT:    retq
194;
195; X86-LABEL: test7:
196; X86:       # %bb.0:
197; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
198; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
199; X86-NEXT:    retl
200   %y = load i16, ptr %x
201   %res = bitcast i16 %y to half
202   ret half %res
203}
204
205define <8 x i16> @test10(ptr %x) {
206; X64-LABEL: test10:
207; X64:       # %bb.0:
208; X64-NEXT:    vmovw (%rdi), %xmm0
209; X64-NEXT:    retq
210;
211; X86-LABEL: test10:
212; X86:       # %bb.0:
213; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
214; X86-NEXT:    vmovw (%eax), %xmm0
215; X86-NEXT:    retl
216   %y = load i16, ptr %x, align 2
217   %res = insertelement <8 x i16>zeroinitializer, i16 %y, i32 0
218   ret <8 x i16>%res
219}
220
221define <16 x i16> @test10b(ptr %x) {
222; X64-LABEL: test10b:
223; X64:       # %bb.0:
224; X64-NEXT:    vmovw (%rdi), %xmm0
225; X64-NEXT:    retq
226;
227; X86-LABEL: test10b:
228; X86:       # %bb.0:
229; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
230; X86-NEXT:    vmovw (%eax), %xmm0
231; X86-NEXT:    retl
232   %y = load i16, ptr %x, align 2
233   %res = insertelement <16 x i16>zeroinitializer, i16 %y, i32 0
234   ret <16 x i16>%res
235}
236
237define <32 x i16> @test10c(ptr %x) {
238; X64-LABEL: test10c:
239; X64:       # %bb.0:
240; X64-NEXT:    vmovw (%rdi), %xmm0
241; X64-NEXT:    retq
242;
243; X86-LABEL: test10c:
244; X86:       # %bb.0:
245; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
246; X86-NEXT:    vmovw (%eax), %xmm0
247; X86-NEXT:    retl
248   %y = load i16, ptr %x, align 2
249   %res = insertelement <32 x i16>zeroinitializer, i16 %y, i32 0
250   ret <32 x i16>%res
251}
252
253define <8 x half> @test11(ptr %x) {
254; X64-LABEL: test11:
255; X64:       # %bb.0:
256; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
257; X64-NEXT:    retq
258;
259; X86-LABEL: test11:
260; X86:       # %bb.0:
261; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
262; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
263; X86-NEXT:    retl
264   %y = load half, ptr %x, align 2
265   %res = insertelement <8 x half>zeroinitializer, half %y, i32 0
266   ret <8 x half>%res
267}
268
269define <16 x half> @test11b(ptr %x) {
270; X64-LABEL: test11b:
271; X64:       # %bb.0:
272; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
273; X64-NEXT:    retq
274;
275; X86-LABEL: test11b:
276; X86:       # %bb.0:
277; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
278; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
279; X86-NEXT:    retl
280   %y = load half, ptr %x, align 2
281   %res = insertelement <16 x half>zeroinitializer, half %y, i32 0
282   ret <16 x half>%res
283}
284
285define <32 x half> @test11c(ptr %x) {
286; X64-LABEL: test11c:
287; X64:       # %bb.0:
288; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
289; X64-NEXT:    retq
290;
291; X86-LABEL: test11c:
292; X86:       # %bb.0:
293; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
294; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
295; X86-NEXT:    retl
296   %y = load half, ptr %x, align 2
297   %res = insertelement <32 x half>zeroinitializer, half %y, i32 0
298   ret <32 x half>%res
299}
300
301define <8 x half> @test14(half %x) {
302; X64-LABEL: test14:
303; X64:       # %bb.0:
304; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
305; X64-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
306; X64-NEXT:    retq
307;
308; X86-LABEL: test14:
309; X86:       # %bb.0:
310; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
311; X86-NEXT:    retl
312   %res = insertelement <8 x half>zeroinitializer, half %x, i32 0
313   ret <8 x half>%res
314}
315
316define <16 x half> @test14b(half %x) {
317; X64-LABEL: test14b:
318; X64:       # %bb.0:
319; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
320; X64-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
321; X64-NEXT:    retq
322;
323; X86-LABEL: test14b:
324; X86:       # %bb.0:
325; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
326; X86-NEXT:    retl
327   %res = insertelement <16 x half>zeroinitializer, half %x, i32 0
328   ret <16 x half>%res
329}
330
331define <32 x half> @test14c(half %x) {
332; X64-LABEL: test14c:
333; X64:       # %bb.0:
334; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
335; X64-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
336; X64-NEXT:    retq
337;
338; X86-LABEL: test14c:
339; X86:       # %bb.0:
340; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
341; X86-NEXT:    retl
342   %res = insertelement <32 x half>zeroinitializer, half %x, i32 0
343   ret <32 x half>%res
344}
345
346define <8 x i16> @test15(i16 %x) {
347; X64-LABEL: test15:
348; X64:       # %bb.0:
349; X64-NEXT:    vmovw %edi, %xmm0
350; X64-NEXT:    retq
351;
352; X86-LABEL: test15:
353; X86:       # %bb.0:
354; X86-NEXT:    vmovw {{[0-9]+}}(%esp), %xmm0
355; X86-NEXT:    retl
356   %res = insertelement <8 x i16>zeroinitializer, i16 %x, i32 0
357   ret <8 x i16>%res
358}
359
360define <16 x i16> @test16(i16 %x) {
361; X64-LABEL: test16:
362; X64:       # %bb.0:
363; X64-NEXT:    vmovw %edi, %xmm0
364; X64-NEXT:    retq
365;
366; X86-LABEL: test16:
367; X86:       # %bb.0:
368; X86-NEXT:    vmovw {{[0-9]+}}(%esp), %xmm0
369; X86-NEXT:    retl
370   %res = insertelement <16 x i16>zeroinitializer, i16 %x, i32 0
371   ret <16 x i16>%res
372}
373
374define <32 x i16> @test17(i16 %x) {
375; X64-LABEL: test17:
376; X64:       # %bb.0:
377; X64-NEXT:    vmovw %edi, %xmm0
378; X64-NEXT:    retq
379;
380; X86-LABEL: test17:
381; X86:       # %bb.0:
382; X86-NEXT:    vmovw {{[0-9]+}}(%esp), %xmm0
383; X86-NEXT:    retl
384   %res = insertelement <32 x i16>zeroinitializer, i16 %x, i32 0
385   ret <32 x i16>%res
386}
387
388define <8 x i16> @test18(i16 %x) {
389; X64-LABEL: test18:
390; X64:       # %bb.0:
391; X64-NEXT:    vmovw %edi, %xmm0
392; X64-NEXT:    retq
393;
394; X86-LABEL: test18:
395; X86:       # %bb.0:
396; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %xmm0
397; X86-NEXT:    retl
398   %res = insertelement <8 x i16> undef, i16 %x, i32 0
399   ret <8 x i16>%res
400}
401
402define <16 x i16> @test19(i16 %x) {
403; X64-LABEL: test19:
404; X64:       # %bb.0:
405; X64-NEXT:    vmovw %edi, %xmm0
406; X64-NEXT:    retq
407;
408; X86-LABEL: test19:
409; X86:       # %bb.0:
410; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %ymm0
411; X86-NEXT:    retl
412   %res = insertelement <16 x i16> undef, i16 %x, i32 0
413   ret <16 x i16>%res
414}
415
416define <32 x i16> @test20(i16 %x) {
417; X64-LABEL: test20:
418; X64:       # %bb.0:
419; X64-NEXT:    vmovw %edi, %xmm0
420; X64-NEXT:    retq
421;
422; X86-LABEL: test20:
423; X86:       # %bb.0:
424; X86-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %zmm0
425; X86-NEXT:    retl
426   %res = insertelement <32 x i16> undef, i16 %x, i32 0
427   ret <32 x i16>%res
428}
429
430@g8f16 = external global <8 x half>
431@g8f16u = external global <8 x half>, align 8
432@g16f16 = external global <16 x half>
433@g16f16u = external global <16 x half>, align 8
434@g32f16 = external global <32 x half>
435@g32f16u = external global <32 x half>, align 8
436
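; The functions below check aligned (vmovaps), unaligned (vmovups) and
; mask/maskz (vmovdqu16) load and store lowering for 512-, 256- and 128-bit
; half vectors; the store tests write to the globals declared above.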
437define <32 x half> @load32f16(ptr %a) {
438; X64-LABEL: load32f16:
439; X64:       # %bb.0:
440; X64-NEXT:    vmovaps (%rdi), %zmm0
441; X64-NEXT:    retq
442;
443; X86-LABEL: load32f16:
444; X86:       # %bb.0:
445; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
446; X86-NEXT:    vmovaps (%eax), %zmm0
447; X86-NEXT:    retl
448  %res = load <32 x half>, ptr %a
449  ret <32 x half> %res
450}
451
452define <32 x half> @load32f16mask(ptr %a, <32 x half> %b, i32 %c) {
453; X64-LABEL: load32f16mask:
454; X64:       # %bb.0:
455; X64-NEXT:    kmovd %esi, %k1
456; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1}
457; X64-NEXT:    retq
458;
459; X86-LABEL: load32f16mask:
460; X86:       # %bb.0:
461; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
462; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
463; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1}
464; X86-NEXT:    retl
465  %msk = bitcast i32 %c to <32 x i1>
466  %res0 = load <32 x half>, ptr %a
467  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b
468  ret <32 x half> %res
469}
470
471define <32 x half> @load32f16maskz(ptr %a, i32 %c) {
472; X64-LABEL: load32f16maskz:
473; X64:       # %bb.0:
474; X64-NEXT:    kmovd %esi, %k1
475; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
476; X64-NEXT:    retq
477;
478; X86-LABEL: load32f16maskz:
479; X86:       # %bb.0:
480; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
481; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
482; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1} {z}
483; X86-NEXT:    retl
484  %msk = bitcast i32 %c to <32 x i1>
485  %res0 = load <32 x half>, ptr %a
486  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer
487  ret <32 x half> %res
488}
489
490define <32 x half> @loadu32f16(ptr %a) {
491; X64-LABEL: loadu32f16:
492; X64:       # %bb.0:
493; X64-NEXT:    vmovups (%rdi), %zmm0
494; X64-NEXT:    retq
495;
496; X86-LABEL: loadu32f16:
497; X86:       # %bb.0:
498; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
499; X86-NEXT:    vmovups (%eax), %zmm0
500; X86-NEXT:    retl
501  %res = load <32 x half>, ptr %a, align 8
502  ret <32 x half> %res
503}
504
505define <32 x half> @loadu32f16mask(ptr %a, <32 x half> %b, i32 %c) {
506; X64-LABEL: loadu32f16mask:
507; X64:       # %bb.0:
508; X64-NEXT:    kmovd %esi, %k1
509; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1}
510; X64-NEXT:    retq
511;
512; X86-LABEL: loadu32f16mask:
513; X86:       # %bb.0:
514; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
515; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
516; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1}
517; X86-NEXT:    retl
518  %msk = bitcast i32 %c to <32 x i1>
519  %res0 = load <32 x half>, ptr %a, align 8
520  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b
521  ret <32 x half> %res
522}
523
524define <32 x half> @loadu32f16maskz(ptr %a, i32 %c) {
525; X64-LABEL: loadu32f16maskz:
526; X64:       # %bb.0:
527; X64-NEXT:    kmovd %esi, %k1
528; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
529; X64-NEXT:    retq
530;
531; X86-LABEL: loadu32f16maskz:
532; X86:       # %bb.0:
533; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
534; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
535; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1} {z}
536; X86-NEXT:    retl
537  %msk = bitcast i32 %c to <32 x i1>
538  %res0 = load <32 x half>, ptr %a, align 8
539  %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer
540  ret <32 x half> %res
541}
542
543define void @store32f16(<32 x half> %a) {
544; X64-LABEL: store32f16:
545; X64:       # %bb.0:
546; X64-NEXT:    movq g32f16@GOTPCREL(%rip), %rax
547; X64-NEXT:    vmovaps %zmm0, (%rax)
548; X64-NEXT:    vzeroupper
549; X64-NEXT:    retq
550;
551; X86-LABEL: store32f16:
552; X86:       # %bb.0:
553; X86-NEXT:    vmovaps %zmm0, g32f16
554; X86-NEXT:    vzeroupper
555; X86-NEXT:    retl
556  store <32 x half> %a, ptr @g32f16
557  ret void
558}
559
560define void @storeu32f16(<32 x half> %a) {
561; X64-LABEL: storeu32f16:
562; X64:       # %bb.0:
563; X64-NEXT:    movq g32f16u@GOTPCREL(%rip), %rax
564; X64-NEXT:    vmovups %zmm0, (%rax)
565; X64-NEXT:    vzeroupper
566; X64-NEXT:    retq
567;
568; X86-LABEL: storeu32f16:
569; X86:       # %bb.0:
570; X86-NEXT:    vmovups %zmm0, g32f16u
571; X86-NEXT:    vzeroupper
572; X86-NEXT:    retl
573  store <32 x half> %a, ptr @g32f16u, align 8
574  ret void
575}
576
577declare void @llvm.masked.store.v32f16.p0(<32 x half>, ptr, i32, <32 x i1>)
578declare <32 x half> @llvm.masked.load.v32f16.p0(ptr, i32,  <32 x i1>, <32 x half>)
579
580define void @storeu32f16mask(<32 x i1> %mask, ptr %addr, <32 x half> %val) {
581; X64-LABEL: storeu32f16mask:
582; X64:       # %bb.0:
583; X64-NEXT:    vpsllw $7, %ymm0, %ymm0
584; X64-NEXT:    vpmovb2m %ymm0, %k1
585; X64-NEXT:    vmovdqu16 %zmm1, (%rdi) {%k1}
586; X64-NEXT:    vzeroupper
587; X64-NEXT:    retq
588;
589; X86-LABEL: storeu32f16mask:
590; X86:       # %bb.0:
591; X86-NEXT:    vpsllw $7, %ymm0, %ymm0
592; X86-NEXT:    vpmovb2m %ymm0, %k1
593; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
594; X86-NEXT:    vmovdqu16 %zmm1, (%eax) {%k1}
595; X86-NEXT:    vzeroupper
596; X86-NEXT:    retl
597  call void @llvm.masked.store.v32f16.p0(<32 x half> %val, ptr %addr, i32 4, <32 x i1>%mask)
598  ret void
599}
600
601define <32 x half> @maskloadu32f16(ptr %addr, <32 x half> %val, <32 x i1> %mask) {
602; X64-LABEL: maskloadu32f16:
603; X64:       # %bb.0:
604; X64-NEXT:    vpsllw $7, %ymm1, %ymm1
605; X64-NEXT:    vpmovb2m %ymm1, %k1
606; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1}
607; X64-NEXT:    retq
608;
609; X86-LABEL: maskloadu32f16:
610; X86:       # %bb.0:
611; X86-NEXT:    vpsllw $7, %ymm1, %ymm1
612; X86-NEXT:    vpmovb2m %ymm1, %k1
613; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
614; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1}
615; X86-NEXT:    retl
616  %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> %val)
617  ret <32 x half> %res
618}
619
620define <32 x half> @maskuloadu32f16(ptr %addr, <32 x i1> %mask) {
621; X64-LABEL: maskuloadu32f16:
622; X64:       # %bb.0:
623; X64-NEXT:    vpsllw $7, %ymm0, %ymm0
624; X64-NEXT:    vpmovb2m %ymm0, %k1
625; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
626; X64-NEXT:    retq
627;
628; X86-LABEL: maskuloadu32f16:
629; X86:       # %bb.0:
630; X86-NEXT:    vpsllw $7, %ymm0, %ymm0
631; X86-NEXT:    vpmovb2m %ymm0, %k1
632; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
633; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1} {z}
634; X86-NEXT:    retl
635  %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> undef)
636  ret <32 x half> %res
637}
638
639define <32 x half> @maskzloadu32f16(ptr %addr, <32 x i1> %mask) {
640; X64-LABEL: maskzloadu32f16:
641; X64:       # %bb.0:
642; X64-NEXT:    vpsllw $7, %ymm0, %ymm0
643; X64-NEXT:    vpmovb2m %ymm0, %k1
644; X64-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
645; X64-NEXT:    retq
646;
647; X86-LABEL: maskzloadu32f16:
648; X86:       # %bb.0:
649; X86-NEXT:    vpsllw $7, %ymm0, %ymm0
650; X86-NEXT:    vpmovb2m %ymm0, %k1
651; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
652; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1} {z}
653; X86-NEXT:    retl
654  %res = call <32 x half> @llvm.masked.load.v32f16.p0(ptr %addr, i32 4, <32 x i1> %mask, <32 x half> zeroinitializer)
655  ret <32 x half> %res
656}
657
658define <32 x half> @movrr32f16(<32 x half> %a, <32 x half> %b) {
659; CHECK-LABEL: movrr32f16:
660; CHECK:       # %bb.0:
661; CHECK-NEXT:    vmovaps %zmm1, %zmm0
662; CHECK-NEXT:    ret{{[l|q]}}
663  ret <32 x half> %b
664}
665
666define <32 x half> @movrrk32f16(<32 x half> %a, <32 x half> %b, i32 %msk) {
667; X64-LABEL: movrrk32f16:
668; X64:       # %bb.0:
669; X64-NEXT:    kmovd %edi, %k1
670; X64-NEXT:    vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
671; X64-NEXT:    retq
672;
673; X86-LABEL: movrrk32f16:
674; X86:       # %bb.0:
675; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
676; X86-NEXT:    vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
677; X86-NEXT:    retl
678  %mask = bitcast i32 %msk to <32 x i1>
679  %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> %b
680  ret <32 x half> %res
681}
682
683define <32 x half> @movrrkz32f16(<32 x half> %a, i32 %msk) {
684; X64-LABEL: movrrkz32f16:
685; X64:       # %bb.0:
686; X64-NEXT:    kmovd %edi, %k1
687; X64-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
688; X64-NEXT:    retq
689;
690; X86-LABEL: movrrkz32f16:
691; X86:       # %bb.0:
692; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
693; X86-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
694; X86-NEXT:    retl
695  %mask = bitcast i32 %msk to <32 x i1>
696  %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> zeroinitializer
697  ret <32 x half> %res
698}
699
700define <16 x half> @load16f16(ptr %a) {
701; X64-LABEL: load16f16:
702; X64:       # %bb.0:
703; X64-NEXT:    vmovaps (%rdi), %ymm0
704; X64-NEXT:    retq
705;
706; X86-LABEL: load16f16:
707; X86:       # %bb.0:
708; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
709; X86-NEXT:    vmovaps (%eax), %ymm0
710; X86-NEXT:    retl
711  %res = load <16 x half>, ptr %a
712  ret <16 x half> %res
713}
714
715define <16 x half> @load16f16mask(ptr %a, <16 x half> %b, i16 %c) {
716; X64-LABEL: load16f16mask:
717; X64:       # %bb.0:
718; X64-NEXT:    kmovd %esi, %k1
719; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1}
720; X64-NEXT:    retq
721;
722; X86-LABEL: load16f16mask:
723; X86:       # %bb.0:
724; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
725; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
726; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1}
727; X86-NEXT:    retl
728  %msk = bitcast i16 %c to <16 x i1>
729  %res0 = load <16 x half>, ptr %a
730  %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b
731  ret <16 x half> %res
732}
733
734define <16 x half> @load16f16maskz(ptr %a, i16 %c) {
735; X64-LABEL: load16f16maskz:
736; X64:       # %bb.0:
737; X64-NEXT:    kmovd %esi, %k1
738; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z}
739; X64-NEXT:    retq
740;
741; X86-LABEL: load16f16maskz:
742; X86:       # %bb.0:
743; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
744; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
745; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1} {z}
746; X86-NEXT:    retl
747  %msk = bitcast i16 %c to <16 x i1>
748  %res0 = load <16 x half>, ptr %a
749  %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer
750  ret <16 x half> %res
751}
752
753define <16 x half> @loadu16f16(ptr %a) {
754; X64-LABEL: loadu16f16:
755; X64:       # %bb.0:
756; X64-NEXT:    vmovups (%rdi), %ymm0
757; X64-NEXT:    retq
758;
759; X86-LABEL: loadu16f16:
760; X86:       # %bb.0:
761; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
762; X86-NEXT:    vmovups (%eax), %ymm0
763; X86-NEXT:    retl
764  %res = load <16 x half>, ptr %a, align 8
765  ret <16 x half> %res
766}
767
768define <16 x half> @loadu16f16mask(ptr %a, <16 x half> %b, i16 %c) {
769; X64-LABEL: loadu16f16mask:
770; X64:       # %bb.0:
771; X64-NEXT:    kmovd %esi, %k1
772; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1}
773; X64-NEXT:    retq
774;
775; X86-LABEL: loadu16f16mask:
776; X86:       # %bb.0:
777; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
778; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
779; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1}
780; X86-NEXT:    retl
781  %msk = bitcast i16 %c to <16 x i1>
782  %res0 = load <16 x half>, ptr %a, align 8
783  %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b
784  ret <16 x half> %res
785}
786
787define <16 x half> @loadu16f16maskz(ptr %a, i16 %c) {
788; X64-LABEL: loadu16f16maskz:
789; X64:       # %bb.0:
790; X64-NEXT:    kmovd %esi, %k1
791; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z}
792; X64-NEXT:    retq
793;
794; X86-LABEL: loadu16f16maskz:
795; X86:       # %bb.0:
796; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
797; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
798; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1} {z}
799; X86-NEXT:    retl
800  %msk = bitcast i16 %c to <16 x i1>
801  %res0 = load <16 x half>, ptr %a, align 8
802  %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer
803  ret <16 x half> %res
804}
805
806define void @store16f16(<16 x half> %a) {
807; X64-LABEL: store16f16:
808; X64:       # %bb.0:
809; X64-NEXT:    movq g16f16@GOTPCREL(%rip), %rax
810; X64-NEXT:    vmovaps %ymm0, (%rax)
811; X64-NEXT:    vzeroupper
812; X64-NEXT:    retq
813;
814; X86-LABEL: store16f16:
815; X86:       # %bb.0:
816; X86-NEXT:    vmovaps %ymm0, g16f16
817; X86-NEXT:    vzeroupper
818; X86-NEXT:    retl
819  store <16 x half> %a, ptr @g16f16
820  ret void
821}
822
823define void @storeu16f16(<16 x half> %a) {
824; X64-LABEL: storeu16f16:
825; X64:       # %bb.0:
826; X64-NEXT:    movq g16f16u@GOTPCREL(%rip), %rax
827; X64-NEXT:    vmovups %ymm0, (%rax)
828; X64-NEXT:    vzeroupper
829; X64-NEXT:    retq
830;
831; X86-LABEL: storeu16f16:
832; X86:       # %bb.0:
833; X86-NEXT:    vmovups %ymm0, g16f16u
834; X86-NEXT:    vzeroupper
835; X86-NEXT:    retl
836  store <16 x half> %a, ptr @g16f16u, align 8
837  ret void
838}
839
840declare void @llvm.masked.store.v16f16.p0(<16 x half>, ptr, i32, <16 x i1>)
841declare <16 x half> @llvm.masked.load.v16f16.p0(ptr, i32,  <16 x i1>, <16 x half>)
842
843define void @storeu16f16mask(<16 x i1> %mask, ptr %addr, <16 x half> %val) {
844; X64-LABEL: storeu16f16mask:
845; X64:       # %bb.0:
846; X64-NEXT:    vpsllw $7, %xmm0, %xmm0
847; X64-NEXT:    vpmovb2m %xmm0, %k1
848; X64-NEXT:    vmovdqu16 %ymm1, (%rdi) {%k1}
849; X64-NEXT:    vzeroupper
850; X64-NEXT:    retq
851;
852; X86-LABEL: storeu16f16mask:
853; X86:       # %bb.0:
854; X86-NEXT:    vpsllw $7, %xmm0, %xmm0
855; X86-NEXT:    vpmovb2m %xmm0, %k1
856; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
857; X86-NEXT:    vmovdqu16 %ymm1, (%eax) {%k1}
858; X86-NEXT:    vzeroupper
859; X86-NEXT:    retl
860  call void @llvm.masked.store.v16f16.p0(<16 x half> %val, ptr %addr, i32 4, <16 x i1>%mask)
861  ret void
862}
863
864define <16 x half> @maskloadu16f16(ptr %addr, <16 x half> %val, <16 x i1> %mask) {
865; X64-LABEL: maskloadu16f16:
866; X64:       # %bb.0:
867; X64-NEXT:    vpsllw $7, %xmm1, %xmm1
868; X64-NEXT:    vpmovb2m %xmm1, %k1
869; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1}
870; X64-NEXT:    retq
871;
872; X86-LABEL: maskloadu16f16:
873; X86:       # %bb.0:
874; X86-NEXT:    vpsllw $7, %xmm1, %xmm1
875; X86-NEXT:    vpmovb2m %xmm1, %k1
876; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
877; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1}
878; X86-NEXT:    retl
879  %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> %val)
880  ret <16 x half> %res
881}
882
883define <16 x half> @maskuloadu16f16(ptr %addr, <16 x i1> %mask) {
884; X64-LABEL: maskuloadu16f16:
885; X64:       # %bb.0:
886; X64-NEXT:    vpsllw $7, %xmm0, %xmm0
887; X64-NEXT:    vpmovb2m %xmm0, %k1
888; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z}
889; X64-NEXT:    retq
890;
891; X86-LABEL: maskuloadu16f16:
892; X86:       # %bb.0:
893; X86-NEXT:    vpsllw $7, %xmm0, %xmm0
894; X86-NEXT:    vpmovb2m %xmm0, %k1
895; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
896; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1} {z}
897; X86-NEXT:    retl
898  %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> undef)
899  ret <16 x half> %res
900}
901
902define <16 x half> @maskzloadu16f16(ptr %addr, <16 x i1> %mask) {
903; X64-LABEL: maskzloadu16f16:
904; X64:       # %bb.0:
905; X64-NEXT:    vpsllw $7, %xmm0, %xmm0
906; X64-NEXT:    vpmovb2m %xmm0, %k1
907; X64-NEXT:    vmovdqu16 (%rdi), %ymm0 {%k1} {z}
908; X64-NEXT:    retq
909;
910; X86-LABEL: maskzloadu16f16:
911; X86:       # %bb.0:
912; X86-NEXT:    vpsllw $7, %xmm0, %xmm0
913; X86-NEXT:    vpmovb2m %xmm0, %k1
914; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
915; X86-NEXT:    vmovdqu16 (%eax), %ymm0 {%k1} {z}
916; X86-NEXT:    retl
917  %res = call <16 x half> @llvm.masked.load.v16f16.p0(ptr %addr, i32 4, <16 x i1> %mask, <16 x half> zeroinitializer)
918  ret <16 x half> %res
919}
920
921define <16 x half> @movrr16f16(<16 x half> %a, <16 x half> %b) {
922; CHECK-LABEL: movrr16f16:
923; CHECK:       # %bb.0:
924; CHECK-NEXT:    vmovaps %ymm1, %ymm0
925; CHECK-NEXT:    ret{{[l|q]}}
926  ret <16 x half> %b
927}
928
929define <16 x half> @movrrk16f16(<16 x half> %a, <16 x half> %b, i16 %msk) {
930; X64-LABEL: movrrk16f16:
931; X64:       # %bb.0:
932; X64-NEXT:    kmovd %edi, %k1
933; X64-NEXT:    vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
934; X64-NEXT:    retq
935;
936; X86-LABEL: movrrk16f16:
937; X86:       # %bb.0:
938; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
939; X86-NEXT:    vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
940; X86-NEXT:    retl
941  %mask = bitcast i16 %msk to <16 x i1>
942  %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> %b
943  ret <16 x half> %res
944}
945
946define <16 x half> @movrrkz16f16(<16 x half> %a, i16 %msk) {
947; X64-LABEL: movrrkz16f16:
948; X64:       # %bb.0:
949; X64-NEXT:    kmovd %edi, %k1
950; X64-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
951; X64-NEXT:    retq
952;
953; X86-LABEL: movrrkz16f16:
954; X86:       # %bb.0:
955; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
956; X86-NEXT:    vmovdqu16 %ymm0, %ymm0 {%k1} {z}
957; X86-NEXT:    retl
958  %mask = bitcast i16 %msk to <16 x i1>
959  %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> zeroinitializer
960  ret <16 x half> %res
961}
962
963define <8 x half> @load8f16(ptr %a) {
964; X64-LABEL: load8f16:
965; X64:       # %bb.0:
966; X64-NEXT:    vmovaps (%rdi), %xmm0
967; X64-NEXT:    retq
968;
969; X86-LABEL: load8f16:
970; X86:       # %bb.0:
971; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
972; X86-NEXT:    vmovaps (%eax), %xmm0
973; X86-NEXT:    retl
974  %res = load <8 x half>, ptr %a
975  ret <8 x half> %res
976}
977
978define <8 x half> @load8f16mask(ptr %a, <8 x half> %b, i8 %c) {
979; X64-LABEL: load8f16mask:
980; X64:       # %bb.0:
981; X64-NEXT:    kmovd %esi, %k1
982; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1}
983; X64-NEXT:    retq
984;
985; X86-LABEL: load8f16mask:
986; X86:       # %bb.0:
987; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
988; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
989; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1}
990; X86-NEXT:    retl
991  %msk = bitcast i8 %c to <8 x i1>
992  %res0 = load <8 x half>, ptr %a
993  %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
994  ret <8 x half> %res
995}
996
997define <8 x half> @load8f16maskz(ptr %a, i8 %c) {
998; X64-LABEL: load8f16maskz:
999; X64:       # %bb.0:
1000; X64-NEXT:    kmovd %esi, %k1
1001; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z}
1002; X64-NEXT:    retq
1003;
1004; X86-LABEL: load8f16maskz:
1005; X86:       # %bb.0:
1006; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1007; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
1008; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1} {z}
1009; X86-NEXT:    retl
1010  %msk = bitcast i8 %c to <8 x i1>
1011  %res0 = load <8 x half>, ptr %a
1012  %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
1013  ret <8 x half> %res
1014}
1015
1016define <8 x half> @loadu8f16(ptr %a) {
1017; X64-LABEL: loadu8f16:
1018; X64:       # %bb.0:
1019; X64-NEXT:    vmovups (%rdi), %xmm0
1020; X64-NEXT:    retq
1021;
1022; X86-LABEL: loadu8f16:
1023; X86:       # %bb.0:
1024; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1025; X86-NEXT:    vmovups (%eax), %xmm0
1026; X86-NEXT:    retl
1027  %res = load <8 x half>, ptr %a, align 8
1028  ret <8 x half> %res
1029}
1030
1031define <8 x half> @loadu8f16mask(ptr %a, <8 x half> %b, i8 %c) {
1032; X64-LABEL: loadu8f16mask:
1033; X64:       # %bb.0:
1034; X64-NEXT:    kmovd %esi, %k1
1035; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1}
1036; X64-NEXT:    retq
1037;
1038; X86-LABEL: loadu8f16mask:
1039; X86:       # %bb.0:
1040; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1041; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
1042; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1}
1043; X86-NEXT:    retl
1044  %msk = bitcast i8 %c to <8 x i1>
1045  %res0 = load <8 x half>, ptr %a, align 8
1046  %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
1047  ret <8 x half> %res
1048}
1049
1050define <8 x half> @loadu8f16maskz(ptr %a, i8 %c) {
1051; X64-LABEL: loadu8f16maskz:
1052; X64:       # %bb.0:
1053; X64-NEXT:    kmovd %esi, %k1
1054; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z}
1055; X64-NEXT:    retq
1056;
1057; X86-LABEL: loadu8f16maskz:
1058; X86:       # %bb.0:
1059; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1060; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
1061; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1} {z}
1062; X86-NEXT:    retl
1063  %msk = bitcast i8 %c to <8 x i1>
1064  %res0 = load <8 x half>, ptr %a, align 8
1065  %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
1066  ret <8 x half> %res
1067}
1068
1069define void @store8f16(<8 x half> %a) {
1070; X64-LABEL: store8f16:
1071; X64:       # %bb.0:
1072; X64-NEXT:    movq g8f16@GOTPCREL(%rip), %rax
1073; X64-NEXT:    vmovaps %xmm0, (%rax)
1074; X64-NEXT:    retq
1075;
1076; X86-LABEL: store8f16:
1077; X86:       # %bb.0:
1078; X86-NEXT:    vmovaps %xmm0, g8f16
1079; X86-NEXT:    retl
1080  store <8 x half> %a, ptr @g8f16
1081  ret void
1082}
1083
1084define void @storeu8f16(<8 x half> %a) {
1085; X64-LABEL: storeu8f16:
1086; X64:       # %bb.0:
1087; X64-NEXT:    movq g8f16u@GOTPCREL(%rip), %rax
1088; X64-NEXT:    vmovups %xmm0, (%rax)
1089; X64-NEXT:    retq
1090;
1091; X86-LABEL: storeu8f16:
1092; X86:       # %bb.0:
1093; X86-NEXT:    vmovups %xmm0, g8f16u
1094; X86-NEXT:    retl
1095  store <8 x half> %a, ptr @g8f16u, align 8
1096  ret void
1097}
1098
1099declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32, <8 x i1>)
1100declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32,  <8 x i1>, <8 x half>)
1101
1102define void @storeu8f16mask(<8 x i1> %mask, ptr %addr, <8 x half> %val) {
1103; X64-LABEL: storeu8f16mask:
1104; X64:       # %bb.0:
1105; X64-NEXT:    vpsllw $15, %xmm0, %xmm0
1106; X64-NEXT:    vpmovw2m %xmm0, %k1
1107; X64-NEXT:    vmovdqu16 %xmm1, (%rdi) {%k1}
1108; X64-NEXT:    retq
1109;
1110; X86-LABEL: storeu8f16mask:
1111; X86:       # %bb.0:
1112; X86-NEXT:    vpsllw $15, %xmm0, %xmm0
1113; X86-NEXT:    vpmovw2m %xmm0, %k1
1114; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1115; X86-NEXT:    vmovdqu16 %xmm1, (%eax) {%k1}
1116; X86-NEXT:    retl
1117  call void @llvm.masked.store.v8f16.p0(<8 x half> %val, ptr %addr, i32 4, <8 x i1>%mask)
1118  ret void
1119}
1120
1121define <8 x half> @maskloadu8f16(ptr %addr, <8 x half> %val, <8 x i1> %mask) {
1122; X64-LABEL: maskloadu8f16:
1123; X64:       # %bb.0:
1124; X64-NEXT:    vpsllw $15, %xmm1, %xmm1
1125; X64-NEXT:    vpmovw2m %xmm1, %k1
1126; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1}
1127; X64-NEXT:    retq
1128;
1129; X86-LABEL: maskloadu8f16:
1130; X86:       # %bb.0:
1131; X86-NEXT:    vpsllw $15, %xmm1, %xmm1
1132; X86-NEXT:    vpmovw2m %xmm1, %k1
1133; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1134; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1}
1135; X86-NEXT:    retl
1136  %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> %val)
1137  ret <8 x half> %res
1138}
1139
1140define <8 x half> @maskuloadu8f16(ptr %addr, <8 x i1> %mask) {
1141; X64-LABEL: maskuloadu8f16:
1142; X64:       # %bb.0:
1143; X64-NEXT:    vpsllw $15, %xmm0, %xmm0
1144; X64-NEXT:    vpmovw2m %xmm0, %k1
1145; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z}
1146; X64-NEXT:    retq
1147;
1148; X86-LABEL: maskuloadu8f16:
1149; X86:       # %bb.0:
1150; X86-NEXT:    vpsllw $15, %xmm0, %xmm0
1151; X86-NEXT:    vpmovw2m %xmm0, %k1
1152; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1153; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1} {z}
1154; X86-NEXT:    retl
1155  %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> undef)
1156  ret <8 x half> %res
1157}
1158
1159define <8 x half> @maskzloadu8f16(ptr %addr, <8 x i1> %mask) {
1160; X64-LABEL: maskzloadu8f16:
1161; X64:       # %bb.0:
1162; X64-NEXT:    vpsllw $15, %xmm0, %xmm0
1163; X64-NEXT:    vpmovw2m %xmm0, %k1
1164; X64-NEXT:    vmovdqu16 (%rdi), %xmm0 {%k1} {z}
1165; X64-NEXT:    retq
1166;
1167; X86-LABEL: maskzloadu8f16:
1168; X86:       # %bb.0:
1169; X86-NEXT:    vpsllw $15, %xmm0, %xmm0
1170; X86-NEXT:    vpmovw2m %xmm0, %k1
1171; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1172; X86-NEXT:    vmovdqu16 (%eax), %xmm0 {%k1} {z}
1173; X86-NEXT:    retl
1174  %res = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %addr, i32 4, <8 x i1> %mask, <8 x half> zeroinitializer)
1175  ret <8 x half> %res
1176}
1177
1178define <8 x half> @movrr8f16(<8 x half> %a, <8 x half> %b) {
1179; CHECK-LABEL: movrr8f16:
1180; CHECK:       # %bb.0:
1181; CHECK-NEXT:    vmovaps %xmm1, %xmm0
1182; CHECK-NEXT:    ret{{[l|q]}}
1183  ret <8 x half> %b
1184}
1185
1186define <8 x half> @movrrk8f16(<8 x half> %a, <8 x half> %b, i8 %msk) {
1187; X64-LABEL: movrrk8f16:
1188; X64:       # %bb.0:
1189; X64-NEXT:    kmovd %edi, %k1
1190; X64-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
1191; X64-NEXT:    retq
1192;
1193; X86-LABEL: movrrk8f16:
1194; X86:       # %bb.0:
1195; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
1196; X86-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
1197; X86-NEXT:    retl
1198  %mask = bitcast i8 %msk to <8 x i1>
1199  %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> %b
1200  ret <8 x half> %res
1201}
1202
1203define <8 x half> @movrrkz8f16(<8 x half> %a, i8 %msk) {
1204; X64-LABEL: movrrkz8f16:
1205; X64:       # %bb.0:
1206; X64-NEXT:    kmovd %edi, %k1
1207; X64-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
1208; X64-NEXT:    retq
1209;
1210; X86-LABEL: movrrkz8f16:
1211; X86:       # %bb.0:
1212; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
1213; X86-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
1214; X86-NEXT:    retl
1215  %mask = bitcast i8 %msk to <8 x i1>
1216  %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> zeroinitializer
1217  ret <8 x half> %res
1218}
1219
1220define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) {
1221; CHECK-LABEL: movsh:
1222; CHECK:       # %bb.0:
1223; CHECK-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
1224; CHECK-NEXT:    vmovsh %xmm0, %xmm1, %xmm0
1225; CHECK-NEXT:    vaddph %xmm0, %xmm2, %xmm0
1226; CHECK-NEXT:    ret{{[l|q]}}
1227  %res1 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
1228  %res2 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1229  %res = fadd <8 x half> %res1, %res2
1230  ret <8 x half> %res
1231}
1232
1233define i16 @test_movw(half %x) {
1234; X64-LABEL: test_movw:
1235; X64:       # %bb.0:
1236; X64-NEXT:    vmovw %xmm0, %eax
1237; X64-NEXT:    # kill: def $ax killed $ax killed $eax
1238; X64-NEXT:    retq
1239;
1240; X86-LABEL: test_movw:
1241; X86:       # %bb.0:
1242; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1243; X86-NEXT:    retl
1244  %res = bitcast half %x to i16
1245  ret i16 %res
1246}
1247
1248define half @test_movw2(i16 %x) {
1249; X64-LABEL: test_movw2:
1250; X64:       # %bb.0:
1251; X64-NEXT:    vmovw %edi, %xmm0
1252; X64-NEXT:    retq
1253;
1254; X86-LABEL: test_movw2:
1255; X86:       # %bb.0:
1256; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
1257; X86-NEXT:    retl
1258  %res = bitcast i16 %x to half
1259  ret half %res
1260}
1261
1262; sext avoids having a truncate in front of the bitcast input due to calling
1263; convention or i16 op promotion.
1264define half @test_movw3(i8 %x) {
1265; X64-LABEL: test_movw3:
1266; X64:       # %bb.0:
1267; X64-NEXT:    movsbl %dil, %eax
1268; X64-NEXT:    vmovw %eax, %xmm0
1269; X64-NEXT:    retq
1270;
1271; X86-LABEL: test_movw3:
1272; X86:       # %bb.0:
1273; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
1274; X86-NEXT:    vmovw %eax, %xmm0
1275; X86-NEXT:    retl
1276  %z = sext i8 %x to i16
1277  %a = bitcast i16 %z to half
1278  ret half %a
1279}
1280
1281define half @extract_f16_0(<8 x half> %x) {
1282; CHECK-LABEL: extract_f16_0:
1283; CHECK:       # %bb.0:
1284; CHECK-NEXT:    ret{{[l|q]}}
1285   %res = extractelement <8 x half> %x, i32 0
1286   ret half %res
1287}
1288
1289define half @extract_f16_1(<8 x half> %x) {
1290; CHECK-LABEL: extract_f16_1:
1291; CHECK:       # %bb.0:
1292; CHECK-NEXT:    vpsrld $16, %xmm0, %xmm0
1293; CHECK-NEXT:    ret{{[l|q]}}
1294   %res = extractelement <8 x half> %x, i32 1
1295   ret half %res
1296}
1297
1298define half @extract_f16_2(<8 x half> %x) {
1299; CHECK-LABEL: extract_f16_2:
1300; CHECK:       # %bb.0:
1301; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1302; CHECK-NEXT:    ret{{[l|q]}}
1303   %res = extractelement <8 x half> %x, i32 2
1304   ret half %res
1305}
1306
1307define half @extract_f16_3(<8 x half> %x) {
1308; CHECK-LABEL: extract_f16_3:
1309; CHECK:       # %bb.0:
1310; CHECK-NEXT:    vpsrlq $48, %xmm0, %xmm0
1311; CHECK-NEXT:    ret{{[l|q]}}
1312   %res = extractelement <8 x half> %x, i32 3
1313   ret half %res
1314}
1315
1316define half @extract_f16_4(<8 x half> %x) {
1317; CHECK-LABEL: extract_f16_4:
1318; CHECK:       # %bb.0:
1319; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
1320; CHECK-NEXT:    ret{{[l|q]}}
1321   %res = extractelement <8 x half> %x, i32 4
1322   ret half %res
1323}
1324
1325define half @extract_f16_5(<8 x half> %x) {
1326; CHECK-LABEL: extract_f16_5:
1327; CHECK:       # %bb.0:
1328; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1329; CHECK-NEXT:    ret{{[l|q]}}
1330   %res = extractelement <8 x half> %x, i32 5
1331   ret half %res
1332}
1333
1334define half @extract_f16_6(<8 x half> %x) {
1335; CHECK-LABEL: extract_f16_6:
1336; CHECK:       # %bb.0:
1337; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1338; CHECK-NEXT:    ret{{[l|q]}}
1339   %res = extractelement <8 x half> %x, i32 6
1340   ret half %res
1341}
1342
1343define half @extract_f16_7(<8 x half> %x) {
1344; CHECK-LABEL: extract_f16_7:
1345; CHECK:       # %bb.0:
1346; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1347; CHECK-NEXT:    ret{{[l|q]}}
1348   %res = extractelement <8 x half> %x, i32 7
1349   ret half %res
1350}
1351
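; Variable-index extracts cannot be lowered to a fixed shuffle, so the vector
; is spilled to an aligned stack slot and the element is reloaded with vmovsh.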
1352define half @extract_f16_8(<32 x half> %x, i64 %idx) nounwind {
1353; X64-LABEL: extract_f16_8:
1354; X64:       # %bb.0:
1355; X64-NEXT:    pushq %rbp
1356; X64-NEXT:    movq %rsp, %rbp
1357; X64-NEXT:    andq $-64, %rsp
1358; X64-NEXT:    subq $128, %rsp
1359; X64-NEXT:    andl $31, %edi
1360; X64-NEXT:    vmovaps %zmm0, (%rsp)
1361; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
1362; X64-NEXT:    movq %rbp, %rsp
1363; X64-NEXT:    popq %rbp
1364; X64-NEXT:    vzeroupper
1365; X64-NEXT:    retq
1366;
1367; X86-LABEL: extract_f16_8:
1368; X86:       # %bb.0:
1369; X86-NEXT:    pushl %ebp
1370; X86-NEXT:    movl %esp, %ebp
1371; X86-NEXT:    andl $-64, %esp
1372; X86-NEXT:    subl $128, %esp
1373; X86-NEXT:    movl 8(%ebp), %eax
1374; X86-NEXT:    andl $31, %eax
1375; X86-NEXT:    vmovaps %zmm0, (%esp)
1376; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
1377; X86-NEXT:    movl %ebp, %esp
1378; X86-NEXT:    popl %ebp
1379; X86-NEXT:    vzeroupper
1380; X86-NEXT:    retl
1381   %res = extractelement <32 x half> %x, i64 %idx
1382   ret half %res
1383}
1384
1385define half @extract_f16_9(<64 x half> %x, i64 %idx) nounwind {
1386; X64-LABEL: extract_f16_9:
1387; X64:       # %bb.0:
1388; X64-NEXT:    pushq %rbp
1389; X64-NEXT:    movq %rsp, %rbp
1390; X64-NEXT:    andq $-64, %rsp
1391; X64-NEXT:    subq $192, %rsp
1392; X64-NEXT:    andl $63, %edi
1393; X64-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%rsp)
1394; X64-NEXT:    vmovaps %zmm0, (%rsp)
1395; X64-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
1396; X64-NEXT:    movq %rbp, %rsp
1397; X64-NEXT:    popq %rbp
1398; X64-NEXT:    vzeroupper
1399; X64-NEXT:    retq
1400;
1401; X86-LABEL: extract_f16_9:
1402; X86:       # %bb.0:
1403; X86-NEXT:    pushl %ebp
1404; X86-NEXT:    movl %esp, %ebp
1405; X86-NEXT:    andl $-64, %esp
1406; X86-NEXT:    subl $192, %esp
1407; X86-NEXT:    movl 8(%ebp), %eax
1408; X86-NEXT:    andl $63, %eax
1409; X86-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%esp)
1410; X86-NEXT:    vmovaps %zmm0, (%esp)
1411; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
1412; X86-NEXT:    movl %ebp, %esp
1413; X86-NEXT:    popl %ebp
1414; X86-NEXT:    vzeroupper
1415; X86-NEXT:    retl
1416   %res = extractelement <64 x half> %x, i64 %idx
1417   ret half %res
1418}
1419
1420define i16 @extract_i16_0(<8 x i16> %x) {
1421; CHECK-LABEL: extract_i16_0:
1422; CHECK:       # %bb.0:
1423; CHECK-NEXT:    vmovw %xmm0, %eax
1424; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
1425; CHECK-NEXT:    ret{{[l|q]}}
1426   %res = extractelement <8 x i16> %x, i32 0
1427   ret i16 %res
1428}
1429
1430define i16 @extract_i16_1(<8 x i16> %x) {
1431; CHECK-LABEL: extract_i16_1:
1432; CHECK:       # %bb.0:
1433; CHECK-NEXT:    vpextrw $1, %xmm0, %eax
1434; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
1435; CHECK-NEXT:    ret{{[l|q]}}
1436   %res = extractelement <8 x i16> %x, i32 1
1437   ret i16 %res
1438}
1439
1440define i16 @extract_i16_2(<8 x i16> %x) {
1441; CHECK-LABEL: extract_i16_2:
1442; CHECK:       # %bb.0:
1443; CHECK-NEXT:    vpextrw $2, %xmm0, %eax
1444; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
1445; CHECK-NEXT:    ret{{[l|q]}}
1446   %res = extractelement <8 x i16> %x, i32 2
1447   ret i16 %res
1448}
1449
1450define i16 @extract_i16_3(<8 x i16> %x) {
1451; CHECK-LABEL: extract_i16_3:
1452; CHECK:       # %bb.0:
1453; CHECK-NEXT:    vpextrw $3, %xmm0, %eax
1454; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
1455; CHECK-NEXT:    ret{{[l|q]}}
1456   %res = extractelement <8 x i16> %x, i32 3
1457   ret i16 %res
1458}
1459
1460define i16 @extract_i16_4(<8 x i16> %x) {
1461; CHECK-LABEL: extract_i16_4:
1462; CHECK:       # %bb.0:
1463; CHECK-NEXT:    vpextrw $4, %xmm0, %eax
1464; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
1465; CHECK-NEXT:    ret{{[l|q]}}
1466   %res = extractelement <8 x i16> %x, i32 4
1467   ret i16 %res
1468}
1469
1470define i16 @extract_i16_5(<8 x i16> %x) {
1471; CHECK-LABEL: extract_i16_5:
1472; CHECK:       # %bb.0:
1473; CHECK-NEXT:    vpextrw $5, %xmm0, %eax
1474; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
1475; CHECK-NEXT:    ret{{[l|q]}}
1476   %res = extractelement <8 x i16> %x, i32 5
1477   ret i16 %res
1478}
1479
1480define i16 @extract_i16_6(<8 x i16> %x) {
1481; CHECK-LABEL: extract_i16_6:
1482; CHECK:       # %bb.0:
1483; CHECK-NEXT:    vpextrw $6, %xmm0, %eax
1484; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
1485; CHECK-NEXT:    ret{{[l|q]}}
1486   %res = extractelement <8 x i16> %x, i32 6
1487   ret i16 %res
1488}
1489
1490define i16 @extract_i16_7(<8 x i16> %x) {
1491; CHECK-LABEL: extract_i16_7:
1492; CHECK:       # %bb.0:
1493; CHECK-NEXT:    vpextrw $7, %xmm0, %eax
1494; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
1495; CHECK-NEXT:    ret{{[l|q]}}
1496   %res = extractelement <8 x i16> %x, i32 7
1497   ret i16 %res
1498}
1499
1500define void @extract_store_f16_0(<8 x half> %x, ptr %y) {
1501; X64-LABEL: extract_store_f16_0:
1502; X64:       # %bb.0:
1503; X64-NEXT:    vmovsh %xmm0, (%rdi)
1504; X64-NEXT:    retq
1505;
1506; X86-LABEL: extract_store_f16_0:
1507; X86:       # %bb.0:
1508; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1509; X86-NEXT:    vmovsh %xmm0, (%eax)
1510; X86-NEXT:    retl
1511   %res = extractelement <8 x half> %x, i32 0
1512   store half %res, ptr %y
1513   ret void
1514}
1515
1516define void @extract_store_f16_1(<8 x half> %x, ptr %y) {
1517; X64-LABEL: extract_store_f16_1:
1518; X64:       # %bb.0:
1519; X64-NEXT:    vpsrld $16, %xmm0, %xmm0
1520; X64-NEXT:    vmovsh %xmm0, (%rdi)
1521; X64-NEXT:    retq
1522;
1523; X86-LABEL: extract_store_f16_1:
1524; X86:       # %bb.0:
1525; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1526; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
1527; X86-NEXT:    vmovsh %xmm0, (%eax)
1528; X86-NEXT:    retl
1529   %res = extractelement <8 x half> %x, i32 1
1530   store half %res, ptr %y
1531   ret void
1532}
1533
1534define void @extract_store_f16_2(<8 x half> %x, ptr %y) {
1535; X64-LABEL: extract_store_f16_2:
1536; X64:       # %bb.0:
1537; X64-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1538; X64-NEXT:    vmovsh %xmm0, (%rdi)
1539; X64-NEXT:    retq
1540;
1541; X86-LABEL: extract_store_f16_2:
1542; X86:       # %bb.0:
1543; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1544; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1545; X86-NEXT:    vmovsh %xmm0, (%eax)
1546; X86-NEXT:    retl
1547   %res = extractelement <8 x half> %x, i32 2
1548   store half %res, ptr %y
1549   ret void
1550}
1551
1552define void @extract_store_f16_3(<8 x half> %x, ptr %y) {
1553; X64-LABEL: extract_store_f16_3:
1554; X64:       # %bb.0:
1555; X64-NEXT:    vpsrlq $48, %xmm0, %xmm0
1556; X64-NEXT:    vmovsh %xmm0, (%rdi)
1557; X64-NEXT:    retq
1558;
1559; X86-LABEL: extract_store_f16_3:
1560; X86:       # %bb.0:
1561; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1562; X86-NEXT:    vpsrlq $48, %xmm0, %xmm0
1563; X86-NEXT:    vmovsh %xmm0, (%eax)
1564; X86-NEXT:    retl
1565   %res = extractelement <8 x half> %x, i32 3
1566   store half %res, ptr %y
1567   ret void
1568}
1569
1570define void @extract_store_f16_4(<8 x half> %x, ptr %y) {
1571; X64-LABEL: extract_store_f16_4:
1572; X64:       # %bb.0:
1573; X64-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
1574; X64-NEXT:    vmovsh %xmm0, (%rdi)
1575; X64-NEXT:    retq
1576;
1577; X86-LABEL: extract_store_f16_4:
1578; X86:       # %bb.0:
1579; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1580; X86-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
1581; X86-NEXT:    vmovsh %xmm0, (%eax)
1582; X86-NEXT:    retl
1583   %res = extractelement <8 x half> %x, i32 4
1584   store half %res, ptr %y
1585   ret void
1586}
1587
1588define void @extract_store_f16_5(<8 x half> %x, ptr %y) {
1589; X64-LABEL: extract_store_f16_5:
1590; X64:       # %bb.0:
1591; X64-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1592; X64-NEXT:    vmovsh %xmm0, (%rdi)
1593; X64-NEXT:    retq
1594;
1595; X86-LABEL: extract_store_f16_5:
1596; X86:       # %bb.0:
1597; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1598; X86-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1599; X86-NEXT:    vmovsh %xmm0, (%eax)
1600; X86-NEXT:    retl
1601   %res = extractelement <8 x half> %x, i32 5
1602   store half %res, ptr %y
1603   ret void
1604}
1605
1606define void @extract_store_f16_6(<8 x half> %x, ptr %y) {
1607; X64-LABEL: extract_store_f16_6:
1608; X64:       # %bb.0:
1609; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1610; X64-NEXT:    vmovsh %xmm0, (%rdi)
1611; X64-NEXT:    retq
1612;
1613; X86-LABEL: extract_store_f16_6:
1614; X86:       # %bb.0:
1615; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1616; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1617; X86-NEXT:    vmovsh %xmm0, (%eax)
1618; X86-NEXT:    retl
1619   %res = extractelement <8 x half> %x, i32 6
1620   store half %res, ptr %y
1621   ret void
1622}
1623
1624define void @extract_store_f16_7(<8 x half> %x, ptr %y) {
1625; X64-LABEL: extract_store_f16_7:
1626; X64:       # %bb.0:
1627; X64-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1628; X64-NEXT:    vmovsh %xmm0, (%rdi)
1629; X64-NEXT:    retq
1630;
1631; X86-LABEL: extract_store_f16_7:
1632; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x half> %x, i32 7
   store half %res, ptr %y
   ret void
}

define void @extract_store_i16_0(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_0:
; X64:       # %bb.0:
; X64-NEXT:    vpextrw $0, %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_i16_0:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpextrw $0, %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x i16> %x, i32 0
   store i16 %res, ptr %y
   ret void
}

define void @extract_store_i16_1(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_1:
; X64:       # %bb.0:
; X64-NEXT:    vpextrw $1, %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_i16_1:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpextrw $1, %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x i16> %x, i32 1
   store i16 %res, ptr %y
   ret void
}

define void @extract_store_i16_2(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_2:
; X64:       # %bb.0:
; X64-NEXT:    vpextrw $2, %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_i16_2:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpextrw $2, %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x i16> %x, i32 2
   store i16 %res, ptr %y
   ret void
}

define void @extract_store_i16_3(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_3:
; X64:       # %bb.0:
; X64-NEXT:    vpextrw $3, %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_i16_3:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpextrw $3, %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x i16> %x, i32 3
   store i16 %res, ptr %y
   ret void
}

define void @extract_store_i16_4(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_4:
; X64:       # %bb.0:
; X64-NEXT:    vpextrw $4, %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_i16_4:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpextrw $4, %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x i16> %x, i32 4
   store i16 %res, ptr %y
   ret void
}

define void @extract_store_i16_5(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_5:
; X64:       # %bb.0:
; X64-NEXT:    vpextrw $5, %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_i16_5:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpextrw $5, %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x i16> %x, i32 5
   store i16 %res, ptr %y
   ret void
}

define void @extract_store_i16_6(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_6:
; X64:       # %bb.0:
; X64-NEXT:    vpextrw $6, %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_i16_6:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpextrw $6, %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x i16> %x, i32 6
   store i16 %res, ptr %y
   ret void
}

define void @extract_store_i16_7(<8 x i16> %x, ptr %y) {
; X64-LABEL: extract_store_i16_7:
; X64:       # %bb.0:
; X64-NEXT:    vpextrw $7, %xmm0, (%rdi)
; X64-NEXT:    retq
;
; X86-LABEL: extract_store_i16_7:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpextrw $7, %xmm0, (%eax)
; X86-NEXT:    retl
   %res = extractelement <8 x i16> %x, i32 7
   store i16 %res, ptr %y
   ret void
}

define i32 @extract_zext_i16_0(<8 x i16> %x) {
; CHECK-LABEL: extract_zext_i16_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpextrw $0, %xmm0, %eax
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x i16> %x, i32 0
   %res2 = zext i16 %res to i32
   ret i32 %res2
}

define i32 @extract_zext_i16_1(<8 x i16> %x) {
; CHECK-LABEL: extract_zext_i16_1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpextrw $1, %xmm0, %eax
; CHECK-NEXT:    ret{{[l|q]}}
   %res = extractelement <8 x i16> %x, i32 1
   %res2 = zext i16 %res to i32
   ret i32 %res2
}

define <8 x half> @build_vector_xxxxuuuu(half %a0, half %a1, half %a2, half %a3) {
; X64-LABEL: build_vector_xxxxuuuu:
; X64:       # %bb.0:
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
; X64-NEXT:    retq
;
; X86-LABEL: build_vector_xxxxuuuu:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X86-NEXT:    retl
  %a = insertelement <8 x half> undef, half %a0, i32 0
  %b = insertelement <8 x half> %a, half %a1, i32 1
  %c = insertelement <8 x half> %b, half %a2, i32 2
  %d = insertelement <8 x half> %c, half %a3, i32 3
  ret <8 x half> %d
}

define <8 x half> @build_vector_uuuuxxxx(half %a0, half %a1, half %a2, half %a3) {
; X64-LABEL: build_vector_uuuuxxxx:
; X64:       # %bb.0:
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: build_vector_uuuuxxxx:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT:    vpbroadcastq %xmm0, %xmm0
; X86-NEXT:    retl
  %a = insertelement <8 x half> undef, half %a0, i32 4
  %b = insertelement <8 x half> %a, half %a1, i32 5
  %c = insertelement <8 x half> %b, half %a2, i32 6
  %d = insertelement <8 x half> %c, half %a3, i32 7
  ret <8 x half> %d
}

define <8 x half> @build_vector_xxxxxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) {
; X64-LABEL: build_vector_xxxxxxxx:
; X64:       # %bb.0:
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; X64-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; X64-NEXT:    retq
;
; X86-LABEL: build_vector_xxxxxxxx:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT:    retl
  %a = insertelement <8 x half> undef, half %a0, i32 0
  %b = insertelement <8 x half> %a, half %a1, i32 1
  %c = insertelement <8 x half> %b, half %a2, i32 2
  %d = insertelement <8 x half> %c, half %a3, i32 3
  %e = insertelement <8 x half> %d, half %a4, i32 4
  %f = insertelement <8 x half> %e, half %a5, i32 5
  %g = insertelement <8 x half> %f, half %a6, i32 6
  %h = insertelement <8 x half> %g, half %a7, i32 7
  ret <8 x half> %h
}

define <16 x half> @build_vector_xxxxuuuuuuuuxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) {
; X64-LABEL: build_vector_xxxxuuuuuuuuxxxx:
; X64:       # %bb.0:
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; X64-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    vpbroadcastq %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
;
; X86-LABEL: build_vector_xxxxuuuuuuuuxxxx:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT:    vmovsh {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
; X86-NEXT:    vpbroadcastq %xmm0, %xmm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
  %a = insertelement <16 x half> undef, half %a0, i32 0
  %b = insertelement <16 x half> %a, half %a1, i32 1
  %c = insertelement <16 x half> %b, half %a2, i32 2
  %d = insertelement <16 x half> %c, half %a3, i32 3
  %e = insertelement <16 x half> %d, half %a4, i32 12
  %f = insertelement <16 x half> %e, half %a5, i32 13
  %g = insertelement <16 x half> %f, half %a6, i32 14
  %h = insertelement <16 x half> %g, half %a7, i32 15
  ret <16 x half> %h
}

define <8 x half> @regression1(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: regression1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
  ret <8 x half> %res
}

define <4 x float> @regression2(ptr addrspace(1) %0, <4 x i32> %1, <4 x i32> %2, <4 x float> %3, ptr %4) {
; X64-LABEL: regression2:
; X64:       # %bb.0:
; X64-NEXT:    vmovw (%rsi), %xmm0
; X64-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0
; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
; X64-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: regression2:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovw (%eax), %xmm0
; X86-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0
; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
; X86-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
; X86-NEXT:    retl
  %6 = load i8, ptr %4, align 1
  %7 = getelementptr i8, ptr %4, i64 1
  %8 = addrspacecast ptr %7 to ptr addrspace(4)
  %9 = load i8, ptr addrspace(4) %8, align 1
  %10 = insertelement <2 x i8> poison, i8 %6, i32 0
  %11 = insertelement <2 x i8> %10, i8 %9, i32 1
  %12 = uitofp <2 x i8> %11 to <2 x float>
  %13 = shufflevector <2 x float> %12, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %14 = shufflevector <4 x float> %13, <4 x float> <float poison, float poison, float 0.000000e+00, float 2.550000e+02>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  %15 = fmul contract <4 x float> %14, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000>
  ret <4 x float> %15
}

; Make sure load/stores of v4f16 are handled well on 32-bit targets where
; default widening legalization can't use i64.
define void @load_store_v4f16(ptr %x, ptr %y, ptr %z) {
; X64-LABEL: load_store_v4f16:
; X64:       # %bb.0:
; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT:    vaddph %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovlps %xmm0, (%rdx)
; X64-NEXT:    retq
;
; X86-LABEL: load_store_v4f16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vaddph %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlps %xmm0, (%eax)
; X86-NEXT:    retl
  %a = load <4 x half>, ptr %x
  %b = load <4 x half>, ptr %y
  %c = fadd <4 x half> %a, %b
  store <4 x half> %c, ptr %z
  ret void
}

define <8 x half> @test21(half %a, half %b, half %c) nounwind {
; X64-LABEL: test21:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; X64-NEXT:    vmovsh %xmm2, %xmm3, %xmm2
; X64-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpbroadcastw %xmm1, %xmm1
; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X64-NEXT:    retq
;
; X86-LABEL: test21:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpbroadcastw %xmm1, %xmm1
; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X86-NEXT:    retl
  %1 = insertelement <8 x half> <half poison, half poison, half poison, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000>, half %a, i32 0
  %2 = insertelement <8 x half> %1, half %b, i32 1
  %3 = insertelement <8 x half> %2, half %c, i32 2
  ret <8 x half> %3
}

define <16 x i16> @test22(ptr %mem) nounwind {
; X64-LABEL: test22:
; X64:       # %bb.0:
; X64-NEXT:    movzwl 0, %eax
; X64-NEXT:    andw (%rdi), %ax
; X64-NEXT:    vmovw %eax, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: test22:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzwl 0, %ecx
; X86-NEXT:    andw (%eax), %cx
; X86-NEXT:    vmovw %ecx, %xmm0
; X86-NEXT:    retl
  %1 = load i16, ptr null, align 2
  %2 = load i16, ptr %mem, align 2
  %3 = and i16 %1, %2
  %4 = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %3, i32 0
  ret <16 x i16> %4
}

define void @pr52560(i8 %0, <2 x i16> %1, ptr %c) nounwind {
; X64-LABEL: pr52560:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movsbl %dil, %eax
; X64-NEXT:    vmovw %eax, %xmm1
; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpcmpgtw %xmm2, %xmm1, %k1
; X64-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    vmovw %xmm0, %eax
; X64-NEXT:    testw %ax, %ax
; X64-NEXT:    je .LBB123_2
; X64-NEXT:  # %bb.1: # %for.body.preheader
; X64-NEXT:    movb $0, (%rsi)
; X64-NEXT:  .LBB123_2: # %for.end
; X64-NEXT:    retq
;
; X86-LABEL: pr52560:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovw %eax, %xmm1
; X86-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpcmpgtw %xmm2, %xmm1, %k1
; X86-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    testw %ax, %ax
; X86-NEXT:    je .LBB123_2
; X86-NEXT:  # %bb.1: # %for.body.preheader
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movb $0, (%eax)
; X86-NEXT:  .LBB123_2: # %for.end
; X86-NEXT:    retl
entry:
  %conv = sext i8 %0 to i16
  %2 = insertelement <2 x i16> <i16 poison, i16 0>, i16 %conv, i32 0
  %3 = icmp sgt <2 x i16> %2, zeroinitializer
  %4 = select <2 x i1> %3, <2 x i16> %1, <2 x i16> <i16 0, i16 poison>
  %5 = extractelement <2 x i16> %4, i32 0
  %tobool.not14 = icmp eq i16 %5, 0
  br i1 %tobool.not14, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  store i8 0, ptr %c, align 1
  br label %for.end

for.end:                                          ; preds = %for.body.preheader, %entry
  ret void
}

define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width"="256" "prefer-vector-width"="256" nounwind {
; X64-LABEL: pr52561:
; X64:       # %bb.0:
; X64-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
; X64-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
; X64-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; X64-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT:    vmovsh %xmm0, %xmm2, %xmm0
; X64-NEXT:    retq
;
; X86-LABEL: pr52561:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-32, %esp
; X86-NEXT:    subl $32, %esp
; X86-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpaddd 8(%ebp), %ymm1, %ymm1
; X86-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112]
; X86-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
; X86-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
; X86-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X86-NEXT:    vmovsh %xmm0, %xmm2, %xmm0
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
  %1 = add <16 x i32> %a, <i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112>
  %2 = add <16 x i32> %1, %b
  %3 = and <16 x i32> %2, <i32 65535, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 65535>
  ret <16 x i32> %3
}

define <8 x i16> @pr59628_xmm(i16 %arg) {
; X64-LABEL: pr59628_xmm:
; X64:       # %bb.0:
; X64-NEXT:    vmovw %edi, %xmm0
; X64-NEXT:    vpbroadcastw %edi, %xmm1
; X64-NEXT:    vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
; X64-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
;
; X86-LABEL: pr59628_xmm:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vpbroadcastw %eax, %xmm1
; X86-NEXT:    vmovsh %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpcmpneqw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %k1
; X86-NEXT:    vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
  %I1 = insertelement <8 x i16> zeroinitializer, i16 %arg, i16 0
  %I2 = insertelement <8 x i16> %I1, i16 0, i16 %arg
  ret <8 x i16> %I2
}
