xref: /llvm-project/llvm/test/CodeGen/X86/avx512-arith.ll (revision b5d35feacb7246573c6a4ab2bddc4919a4228ed5)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512F
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512BW
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512DQ
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512bw,+avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
7
; Register-register add of two 512-bit packed-double vectors.
; All tested feature sets are expected to emit one vaddpd on zmm registers.
define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) {
; CHECK-LABEL: addpd512:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
entry:
  %sum = fadd <8 x double> %x, %y
  ret <8 x double> %sum
}
17
; Adds a non-uniform <8 x double> constant to %y.
; The constant is expected to be folded into vaddpd as a RIP-relative
; constant-pool memory operand.
define <8 x double> @addpd512fold(<8 x double> %y) {
; CHECK-LABEL: addpd512fold:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    retq
entry:
  %sum = fadd <8 x double> %y, <double 4.500000e+00, double 3.400000e+00,
                               double 2.300000e+00, double 1.200000e+00,
                               double 4.500000e+00, double 3.800000e+00,
                               double 2.300000e+00, double 1.200000e+00>
  ret <8 x double> %sum
}
27
; Register-register add of two 512-bit packed-float vectors.
; Expected lowering is a single vaddps on zmm registers.
define <16 x float> @addps512(<16 x float> %y, <16 x float> %x) {
; CHECK-LABEL: addps512:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
entry:
  %sum = fadd <16 x float> %x, %y
  ret <16 x float> %sum
}
37
; Adds a non-uniform <16 x float> constant to %y.
; The constant is expected to be folded into vaddps as a RIP-relative
; constant-pool memory operand (no broadcast form).
define <16 x float> @addps512fold(<16 x float> %y) {
; CHECK-LABEL: addps512fold:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    retq
entry:
  %sum = fadd <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000,
                                float 0x4002666660000000, float 0x3FF3333340000000,
                                float 4.500000e+00, float 0x400B333340000000,
                                float 0x4002666660000000, float 0x3FF3333340000000,
                                float 4.500000e+00, float 0x400B333340000000,
                                float 0x4002666660000000, float 4.500000e+00,
                                float 4.500000e+00, float 0x400B333340000000,
                                float 0x4002666660000000, float 0x3FF3333340000000>
  ret <16 x float> %sum
}
47
; Register-register subtract (%x - %y) of 512-bit packed doubles.
; Expected lowering is a single vsubpd on zmm registers.
define <8 x double> @subpd512(<8 x double> %y, <8 x double> %x) {
; CHECK-LABEL: subpd512:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsubpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
entry:
  %diff = fsub <8 x double> %x, %y
  ret <8 x double> %diff
}
57
; Subtracts a vector loaded from %x (align 8) from %y.
; The load is expected to be folded into vsubpd as a memory operand.
define <8 x double> @subpd512fold(<8 x double> %y, ptr %x) {
; CHECK-LABEL: subpd512fold:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsubpd (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
entry:
  %rhs = load <8 x double>, ptr %x, align 8
  %diff = fsub <8 x double> %y, %rhs
  ret <8 x double> %diff
}
68
; Register-register subtract (%x - %y) of 512-bit packed floats.
; Expected lowering is a single vsubps on zmm registers.
define <16 x float> @subps512(<16 x float> %y, <16 x float> %x) {
; CHECK-LABEL: subps512:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsubps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
entry:
  %diff = fsub <16 x float> %x, %y
  ret <16 x float> %diff
}
78
; Subtracts a vector loaded from %x (align 4) from %y.
; The load is expected to be folded into vsubps as a memory operand.
define <16 x float> @subps512fold(<16 x float> %y, ptr %x) {
; CHECK-LABEL: subps512fold:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsubps (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
entry:
  %rhs = load <16 x float>, ptr %x, align 4
  %diff = fsub <16 x float> %y, %rhs
  ret <16 x float> %diff
}
89
; 64-bit element multiply on a 512-bit vector.
; Configurations without +avx512dq are expected to expand the multiply into
; vpmuludq partial products combined with shifts and adds; configurations
; with +avx512dq are expected to emit a single vpmullq.
define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
; AVX512F-LABEL: imulq512:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlq $32, %zmm1, %zmm2
; AVX512F-NEXT:    vpmuludq %zmm0, %zmm2, %zmm2
; AVX512F-NEXT:    vpsrlq $32, %zmm0, %zmm3
; AVX512F-NEXT:    vpmuludq %zmm3, %zmm1, %zmm3
; AVX512F-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512F-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512F-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: imulq512:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $32, %zmm1, %zmm2
; AVX512VL-NEXT:    vpmuludq %zmm0, %zmm2, %zmm2
; AVX512VL-NEXT:    vpsrlq $32, %zmm0, %zmm3
; AVX512VL-NEXT:    vpmuludq %zmm3, %zmm1, %zmm3
; AVX512VL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512VL-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512VL-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: imulq512:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm3
; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm1, %zmm3
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
; AVX512BW-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: imulq512:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmullq %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    retq
;
; SKX-LABEL: imulq512:
; SKX:       # %bb.0:
; SKX-NEXT:    vpmullq %zmm0, %zmm1, %zmm0
; SKX-NEXT:    retq
  %prod = mul <8 x i64> %x, %y
  ret <8 x i64> %prod
}
139
; 64-bit element multiply on a 256-bit vector.
; Without +avx512dq the multiply is expected to be expanded via vpmuludq on
; ymm registers. With +avx512dq alone the checks show the operands widened
; to zmm for vpmullq; the combined dq+bw+vl configuration is expected to use
; vpmullq directly on ymm.
define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) {
; AVX512F-LABEL: imulq256:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlq $32, %ymm1, %ymm2
; AVX512F-NEXT:    vpmuludq %ymm0, %ymm2, %ymm2
; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm3
; AVX512F-NEXT:    vpmuludq %ymm3, %ymm1, %ymm3
; AVX512F-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX512F-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: imulq256:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $32, %ymm1, %ymm2
; AVX512VL-NEXT:    vpmuludq %ymm0, %ymm2, %ymm2
; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm3
; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm1, %ymm3
; AVX512VL-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX512VL-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: imulq256:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlq $32, %ymm1, %ymm2
; AVX512BW-NEXT:    vpmuludq %ymm0, %ymm2, %ymm2
; AVX512BW-NEXT:    vpsrlq $32, %ymm0, %ymm3
; AVX512BW-NEXT:    vpmuludq %ymm3, %ymm1, %ymm3
; AVX512BW-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; AVX512BW-NEXT:    vpsllq $32, %ymm2, %ymm2
; AVX512BW-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: imulq256:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT:    vpmullq %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512DQ-NEXT:    retq
;
; SKX-LABEL: imulq256:
; SKX:       # %bb.0:
; SKX-NEXT:    vpmullq %ymm0, %ymm1, %ymm0
; SKX-NEXT:    retq
  %prod = mul <4 x i64> %x, %y
  ret <4 x i64> %prod
}
192
; Multiplies a 256-bit <4 x i64> vector by the splat constant 1337.
; Without +avx512dq the checks expect the splat to be materialized with
; vpbroadcastq and the multiply expanded via vpmuludq. With +avx512dq the
; constant is expected to be folded as an embedded-broadcast memory operand
; of vpmullq ({1to8} on zmm, or {1to4} on ymm when +avx512vl is also set).
define <4 x i64> @imulq256_bcast(<4 x i64> %x) {
; AVX512F-LABEL: imulq256_bcast:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1337,1337,1337,1337]
; AVX512F-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512F-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpsllq $32, %ymm0, %ymm0
; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: imulq256_bcast:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1337,1337,1337,1337]
; AVX512VL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsllq $32, %ymm0, %ymm0
; AVX512VL-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: imulq256_bcast:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1337,1337,1337,1337]
; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
; AVX512BW-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpsllq $32, %ymm0, %ymm0
; AVX512BW-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: imulq256_bcast:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512DQ-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512DQ-NEXT:    retq
;
; SKX-LABEL: imulq256_bcast:
; SKX:       # %bb.0:
; SKX-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; SKX-NEXT:    retq
  %prod = mul <4 x i64> %x, <i64 1337, i64 1337, i64 1337, i64 1337>
  ret <4 x i64> %prod
}
238
; 64-bit element multiply on a 128-bit vector.
; Without +avx512dq the multiply is expected to be expanded via vpmuludq on
; xmm registers. With +avx512dq alone the checks show the operands widened
; to zmm for vpmullq, followed by vzeroupper; the combined dq+bw+vl
; configuration is expected to use vpmullq directly on xmm.
define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
; AVX512F-LABEL: imulq128:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlq $32, %xmm1, %xmm2
; AVX512F-NEXT:    vpmuludq %xmm0, %xmm2, %xmm2
; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm3
; AVX512F-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
; AVX512F-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512F-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512F-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: imulq128:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $32, %xmm1, %xmm2
; AVX512VL-NEXT:    vpmuludq %xmm0, %xmm2, %xmm2
; AVX512VL-NEXT:    vpsrlq $32, %xmm0, %xmm3
; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
; AVX512VL-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512VL-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512VL-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: imulq128:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlq $32, %xmm1, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm0, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlq $32, %xmm0, %xmm3
; AVX512BW-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT:    vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: imulq128:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT:    vpmullq %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; SKX-LABEL: imulq128:
; SKX:       # %bb.0:
; SKX-NEXT:    vpmullq %xmm0, %xmm1, %xmm0
; SKX-NEXT:    retq
  %prod = mul <2 x i64> %x, %y
  ret <2 x i64> %prod
}
292
; Multiplies a 128-bit <2 x i64> vector by the splat constant 8086.
; The checks expect the splat to be materialized with vpmovsxwq (or
; vpbroadcastq under +avx512vl alone) when it cannot be folded, and to be
; folded as a {1to2} embedded-broadcast operand of vpmullq in the combined
; dq+bw+vl configuration.
define <2 x i64> @imulq128_bcast(<2 x i64> %x) {
; AVX512F-LABEL: imulq128_bcast:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxwq {{.*#+}} xmm1 = [8086,8086]
; AVX512F-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm0
; AVX512F-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: imulq128_bcast:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [8086,8086]
; AVX512VL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT:    vpsrlq $32, %xmm0, %xmm0
; AVX512VL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX512VL-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: imulq128_bcast:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxwq {{.*#+}} xmm1 = [8086,8086]
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; AVX512BW-NEXT:    vpsrlq $32, %xmm0, %xmm0
; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: imulq128_bcast:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512DQ-NEXT:    vpmovsxwq {{.*#+}} xmm1 = [8086,8086]
; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; SKX-LABEL: imulq128_bcast:
; SKX:       # %bb.0:
; SKX-NEXT:    vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; SKX-NEXT:    retq
  %prod = mul <2 x i64> %x, <i64 8086, i64 8086>
  ret <2 x i64> %prod
}
340
; Register-register multiply of two 512-bit packed-double vectors.
; Expected lowering is a single vmulpd on zmm registers.
define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) {
; CHECK-LABEL: mulpd512:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
entry:
  %prod = fmul <8 x double> %x, %y
  ret <8 x double> %prod
}
350
; Multiplies %y by a non-uniform <8 x double> constant.
; The constant is expected to be folded into vmulpd as a RIP-relative
; constant-pool memory operand.
define <8 x double> @mulpd512fold(<8 x double> %y) {
; CHECK-LABEL: mulpd512fold:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    retq
entry:
  %prod = fmul <8 x double> %y, <double 4.500000e+00, double 3.400000e+00,
                                double 2.300000e+00, double 1.200000e+00,
                                double 4.500000e+00, double 3.400000e+00,
                                double 2.300000e+00, double 1.200000e+00>
  ret <8 x double> %prod
}
360
; Register-register multiply of two 512-bit packed-float vectors.
; Expected lowering is a single vmulps on zmm registers.
define <16 x float> @mulps512(<16 x float> %y, <16 x float> %x) {
; CHECK-LABEL: mulps512:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vmulps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
entry:
  %prod = fmul <16 x float> %x, %y
  ret <16 x float> %prod
}
370
; Multiplies %y by a <16 x float> constant built from a repeating
; four-element pattern. The constant is expected to be folded into vmulps
; as a RIP-relative constant-pool memory operand.
define <16 x float> @mulps512fold(<16 x float> %y) {
; CHECK-LABEL: mulps512fold:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    retq
entry:
  %prod = fmul <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000,
                                 float 0x4002666660000000, float 0x3FF3333340000000,
                                 float 4.500000e+00, float 0x400B333340000000,
                                 float 0x4002666660000000, float 0x3FF3333340000000,
                                 float 4.500000e+00, float 0x400B333340000000,
                                 float 0x4002666660000000, float 0x3FF3333340000000,
                                 float 4.500000e+00, float 0x400B333340000000,
                                 float 0x4002666660000000, float 0x3FF3333340000000>
  ret <16 x float> %prod
}
380
; Register-register divide (%x / %y) of 512-bit packed doubles.
; Expected lowering is a single vdivpd on zmm registers.
define <8 x double> @divpd512(<8 x double> %y, <8 x double> %x) {
; CHECK-LABEL: divpd512:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
entry:
  %quot = fdiv <8 x double> %x, %y
  ret <8 x double> %quot
}
390
; Divides %y by a non-uniform <8 x double> constant.
; The divisor is expected to be folded into vdivpd as a RIP-relative
; constant-pool memory operand.
define <8 x double> @divpd512fold(<8 x double> %y) {
; CHECK-LABEL: divpd512fold:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vdivpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    retq
entry:
  %quot = fdiv <8 x double> %y, <double 4.500000e+00, double 3.400000e+00,
                                double 2.300000e+00, double 1.200000e+00,
                                double 4.500000e+00, double 3.400000e+00,
                                double 2.300000e+00, double 1.200000e+00>
  ret <8 x double> %quot
}
400
; Register-register divide (%x / %y) of 512-bit packed floats.
; Expected lowering is a single vdivps on zmm registers.
define <16 x float> @divps512(<16 x float> %y, <16 x float> %x) {
; CHECK-LABEL: divps512:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
entry:
  %quot = fdiv <16 x float> %x, %y
  ret <16 x float> %quot
}
410
; Divides %y by a non-uniform <16 x float> constant.
; The divisor is expected to be folded into vdivps as a RIP-relative
; constant-pool memory operand.
define <16 x float> @divps512fold(<16 x float> %y) {
; CHECK-LABEL: divps512fold:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vdivps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    retq
entry:
  %quot = fdiv <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000,
                                 float 0x4002666660000000, float 0x3FF3333340000000,
                                 float 4.500000e+00, float 4.500000e+00,
                                 float 0x4002666660000000, float 0x3FF3333340000000,
                                 float 4.500000e+00, float 0x400B333340000000,
                                 float 0x4002666660000000, float 0x3FF3333340000000,
                                 float 4.500000e+00, float 4.500000e+00,
                                 float 0x4002666660000000, float 0x3FF3333340000000>
  ret <16 x float> %quot
}
420
; Register-register add of two <8 x i64> vectors.
; Expected lowering is a single vpaddq on zmm registers.
define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
; CHECK-LABEL: vpaddq_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %sum = add <8 x i64> %i, %j
  ret <8 x i64> %sum
}
429
; Adds a <8 x i64> vector loaded from %j (align 4) to %i.
; The load is expected to be folded into vpaddq as a memory operand.
define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, ptr %j) nounwind {
; CHECK-LABEL: vpaddq_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %mem = load <8 x i64>, ptr %j, align 4
  %sum = add <8 x i64> %i, %mem
  ret <8 x i64> %sum
}
439
; Adds the splat constant 2 to every i64 lane of %i.
; The constant is expected to be folded as a {1to8} embedded-broadcast
; memory operand of vpaddq.
define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind {
; CHECK-LABEL: vpaddq_broadcast_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %sum = add <8 x i64> %i, <i64 2, i64 2, i64 2, i64 2,
                            i64 2, i64 2, i64 2, i64 2>
  ret <8 x i64> %sum
}
448
; Loads one i64 from %j, replicates it into all eight lanes with a chain of
; insertelement instructions, and adds the result to %i.
; The scalar load plus splat is expected to be folded into vpaddq as a
; {1to8} embedded-broadcast memory operand.
define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, ptr %j) nounwind {
; CHECK-LABEL: vpaddq_broadcast2_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %scalar = load i64, ptr %j
  %lane0 = insertelement <8 x i64> undef, i64 %scalar, i32 0
  %lane1 = insertelement <8 x i64> %lane0, i64 %scalar, i32 1
  %lane2 = insertelement <8 x i64> %lane1, i64 %scalar, i32 2
  %lane3 = insertelement <8 x i64> %lane2, i64 %scalar, i32 3
  %lane4 = insertelement <8 x i64> %lane3, i64 %scalar, i32 4
  %lane5 = insertelement <8 x i64> %lane4, i64 %scalar, i32 5
  %lane6 = insertelement <8 x i64> %lane5, i64 %scalar, i32 6
  %splat = insertelement <8 x i64> %lane6, i64 %scalar, i32 7
  %sum = add <8 x i64> %i, %splat
  ret <8 x i64> %sum
}
466
; Register-register add of two <16 x i32> vectors.
; Expected lowering is a single vpaddd on zmm registers.
define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
; CHECK-LABEL: vpaddd_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %sum = add <16 x i32> %i, %j
  ret <16 x i32> %sum
}
475
; Adds a <16 x i32> vector loaded from %j (align 4) to %i.
; The load is expected to be folded into vpaddd as a memory operand.
define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, ptr %j) nounwind {
; CHECK-LABEL: vpaddd_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
; CHECK-NEXT:    retq
  %mem = load <16 x i32>, ptr %j, align 4
  %sum = add <16 x i32> %i, %mem
  ret <16 x i32> %sum
}
485
; Adds the splat constant 3 to every i32 lane of %i.
; The constant is expected to be folded as a {1to16} embedded-broadcast
; memory operand of vpaddd.
define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind {
; CHECK-LABEL: vpaddd_broadcast_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %sum = add <16 x i32> %i, <i32 3, i32 3, i32 3, i32 3,
                             i32 3, i32 3, i32 3, i32 3,
                             i32 3, i32 3, i32 3, i32 3,
                             i32 3, i32 3, i32 3, i32 3>
  ret <16 x i32> %sum
}
494
; Merge-masked i32 add: lanes where %mask1 is non-zero take %i + %j, the
; remaining lanes keep %i.
; Expected lowering is vptestmd to build the mask followed by a vpaddd
; with a {%k1} write mask.
define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_mask_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmd %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %active = icmp ne <16 x i32> %mask1, zeroinitializer
  %sum = add <16 x i32> %i, %j
  %blend = select <16 x i1> %active, <16 x i32> %sum, <16 x i32> %i
  ret <16 x i32> %blend
}
506
; Zero-masked i32 add: lanes where %mask1 is non-zero take %i + %j, the
; remaining lanes are zero.
; Expected lowering is vptestmd followed by a vpaddd with {%k1} {z}.
define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_maskz_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmd %zmm2, %zmm2, %k1
; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %active = icmp ne <16 x i32> %mask1, zeroinitializer
  %sum = add <16 x i32> %i, %j
  %blend = select <16 x i1> %active, <16 x i32> %sum, <16 x i32> zeroinitializer
  ret <16 x i32> %blend
}
518
; Merge-masked i32 add whose second operand is loaded from %j.ptr.
; The load is expected to be folded into the masked vpaddd as a memory
; operand.
; NOTE(review): the function is marked readnone although it loads from
; %j.ptr - confirm this is intentional for the test.
define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, ptr %j.ptr, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_mask_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %active = icmp ne <16 x i32> %mask1, zeroinitializer
  %mem = load <16 x i32>, ptr %j.ptr
  %sum = add <16 x i32> %i, %mem
  %blend = select <16 x i1> %active, <16 x i32> %sum, <16 x i32> %i
  ret <16 x i32> %blend
}
531
; Merge-masked i32 add of the splat constant 4.
; The constant is expected to be folded as a {1to16} embedded-broadcast
; operand of a vpaddd carrying a {%k1} write mask.
define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_mask_broadcast_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %active = icmp ne <16 x i32> %mask1, zeroinitializer
  %sum = add <16 x i32> %i, <i32 4, i32 4, i32 4, i32 4,
                             i32 4, i32 4, i32 4, i32 4,
                             i32 4, i32 4, i32 4, i32 4,
                             i32 4, i32 4, i32 4, i32 4>
  %blend = select <16 x i1> %active, <16 x i32> %sum, <16 x i32> %i
  ret <16 x i32> %blend
}
543
; Zero-masked i32 add whose second operand is loaded from %j.ptr.
; The load is expected to be folded into a vpaddd with {%k1} {z}.
; NOTE(review): the function is marked readnone although it loads from
; %j.ptr - confirm this is intentional for the test.
define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, ptr %j.ptr, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_maskz_fold_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %active = icmp ne <16 x i32> %mask1, zeroinitializer
  %mem = load <16 x i32>, ptr %j.ptr
  %sum = add <16 x i32> %i, %mem
  %blend = select <16 x i1> %active, <16 x i32> %sum, <16 x i32> zeroinitializer
  ret <16 x i32> %blend
}
556
; Zero-masked i32 add of the splat constant 5.
; The constant is expected to be folded as a {1to16} embedded-broadcast
; operand of a vpaddd with {%k1} {z}.
define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_maskz_broadcast_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %active = icmp ne <16 x i32> %mask1, zeroinitializer
  %sum = add <16 x i32> %i, <i32 5, i32 5, i32 5, i32 5,
                             i32 5, i32 5, i32 5, i32 5,
                             i32 5, i32 5, i32 5, i32 5,
                             i32 5, i32 5, i32 5, i32 5>
  %blend = select <16 x i1> %active, <16 x i32> %sum, <16 x i32> zeroinitializer
  ret <16 x i32> %blend
}
568
; Register-register subtract (%i - %j) of <8 x i64> vectors.
; Expected lowering is a single vpsubq on zmm registers.
define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
; CHECK-LABEL: vpsubq_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %diff = sub <8 x i64> %i, %j
  ret <8 x i64> %diff
}
577
; Register-register subtract (%i - %j) of <16 x i32> vectors.
; Expected lowering is a single vpsubd on zmm registers.
define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
; CHECK-LABEL: vpsubd_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %diff = sub <16 x i32> %i, %j
  ret <16 x i32> %diff
}
586
; Register-register multiply of two <16 x i32> vectors.
; Expected lowering is a single vpmulld on zmm registers.
define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) {
; CHECK-LABEL: vpmulld_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %prod = mul <16 x i32> %i, %j
  ret <16 x i32> %prod
}
595
; Tail call to the readnone libm-style @sqrtf declaration.
; The call is expected to be lowered inline to a single vsqrtss.
declare float @sqrtf(float) readnone
define float @sqrtA(float %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: sqrtA:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
entry:
  %root = tail call float @sqrtf(float %a) nounwind readnone
  ret float %root
}
606
; Tail call to the readnone libm-style @sqrt declaration.
; The call is expected to be lowered inline to a single vsqrtsd.
declare double @sqrt(double) readnone
define double @sqrtB(double %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: sqrtB:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
entry:
  %root = tail call double @sqrt(double %a) nounwind readnone
  ret double %root
}
617
; Scalar float square root through the llvm.sqrt.f32 intrinsic.
; Expected lowering is a single vsqrtss.
declare float @llvm.sqrt.f32(float)
define float @sqrtC(float %a) nounwind {
; CHECK-LABEL: sqrtC:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %root = call float @llvm.sqrt.f32(float %a)
  ret float %root
}
627
; <16 x float> square root through the llvm.sqrt.v16f32 intrinsic.
; Expected lowering is a single vsqrtps on a zmm register.
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
define <16 x float> @sqrtD(<16 x float> %a) nounwind {
; CHECK-LABEL: sqrtD:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
; CHECK-NEXT:    retq
  %root = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
  ret <16 x float> %root
}
637
; <8 x double> square root through the llvm.sqrt.v8f64 intrinsic.
; Expected lowering is a single vsqrtpd on a zmm register.
declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
define <8 x double> @sqrtE(<8 x double> %a) nounwind {
; CHECK-LABEL: sqrtE:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
; CHECK-NEXT:    retq
  %root = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
  ret <8 x double> %root
}
647
; Adds the same float constant (0x3FB99999A0000000) to all sixteen lanes.
; The splat is expected to be folded as a {1to16} embedded-broadcast memory
; operand of vaddps.
define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind {
; CHECK-LABEL: fadd_broadcast:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %sum = fadd <16 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000,
                                float 0x3FB99999A0000000, float 0x3FB99999A0000000,
                                float 0x3FB99999A0000000, float 0x3FB99999A0000000,
                                float 0x3FB99999A0000000, float 0x3FB99999A0000000,
                                float 0x3FB99999A0000000, float 0x3FB99999A0000000,
                                float 0x3FB99999A0000000, float 0x3FB99999A0000000,
                                float 0x3FB99999A0000000, float 0x3FB99999A0000000,
                                float 0x3FB99999A0000000, float 0x3FB99999A0000000>
  ret <16 x float> %sum
}
656
; Adds the splat constant 2 to every i64 lane of %a.
; The splat is expected to be folded as a {1to8} embedded-broadcast memory
; operand of vpaddq.
define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind {
; CHECK-LABEL: addq_broadcast:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %sum = add <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2,
                            i64 2, i64 2, i64 2, i64 2>
  ret <8 x i64> %sum
}
665
; ORs the splat constant 2 into every i64 lane of %a.
; The splat is expected to be folded as a {1to8} embedded-broadcast operand;
; the checks expect the integer form vporq without +avx512dq and the
; floating-point-domain form vorpd when +avx512dq is available.
define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
; AVX512F-LABEL: orq_broadcast:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: orq_broadcast:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: orq_broadcast:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: orq_broadcast:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; SKX-LABEL: orq_broadcast:
; SKX:       # %bb.0:
; SKX-NEXT:    vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT:    retq
  %ored = or <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2,
                            i64 2, i64 2, i64 2, i64 2>
  ret <8 x i64> %ored
}
694
; ANDs %y with a <16 x i32> vector loaded from %x (align 4).
; The load is expected to be folded as a memory operand; the checks expect
; vpandd without +avx512dq and vandps when +avx512dq is available.
define <16 x i32> @andd512fold(<16 x i32> %y, ptr %x) {
; AVX512F-LABEL: andd512fold:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpandd (%rdi), %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: andd512fold:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpandd (%rdi), %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: andd512fold:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpandd (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: andd512fold:
; AVX512DQ:       # %bb.0: # %entry
; AVX512DQ-NEXT:    vandps (%rdi), %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; SKX-LABEL: andd512fold:
; SKX:       # %bb.0: # %entry
; SKX-NEXT:    vandps (%rdi), %zmm0, %zmm0
; SKX-NEXT:    retq
entry:
  %mem = load <16 x i32>, ptr %x, align 4
  %anded = and <16 x i32> %y, %mem
  ret <16 x i32> %anded
}
725
; Loads one i64 from %ap, splats it with insertelement + shufflevector, and
; ANDs the splat with %p1.
; The scalar load is expected to be folded as a {1to8} embedded-broadcast
; operand; the checks expect vpandq without +avx512dq and vandpd when
; +avx512dq is available.
define <8 x i64> @andqbrst(<8 x i64> %p1, ptr %ap) {
; AVX512F-LABEL: andqbrst:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    vpandq (%rdi){1to8}, %zmm0, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: andqbrst:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vpandq (%rdi){1to8}, %zmm0, %zmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: andqbrst:
; AVX512BW:       # %bb.0: # %entry
; AVX512BW-NEXT:    vpandq (%rdi){1to8}, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: andqbrst:
; AVX512DQ:       # %bb.0: # %entry
; AVX512DQ-NEXT:    vandpd (%rdi){1to8}, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; SKX-LABEL: andqbrst:
; SKX:       # %bb.0: # %entry
; SKX-NEXT:    vandpd (%rdi){1to8}, %zmm0, %zmm0
; SKX-NEXT:    retq
entry:
  %scalar = load i64, ptr %ap, align 8
  %lane0 = insertelement <8 x i64> undef, i64 %scalar, i32 0
  %splat = shufflevector <8 x i64> %lane0, <8 x i64> undef, <8 x i32> zeroinitializer
  %anded = and <8 x i64> %p1, %splat
  ret <8 x i64> %anded
}
758
; Merge-masked float add: lanes where %mask1 is non-zero take %i + %j, the
; remaining lanes keep %dst.
; Expected lowering is vptestmd followed by a vaddps writing zmm0 under {%k1}.
define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vaddps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
; CHECK-NEXT:    vaddps %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %active = icmp ne <16 x i32> %mask1, zeroinitializer
  %sum = fadd <16 x float> %i, %j
  %blend = select <16 x i1> %active, <16 x float> %sum, <16 x float> %dst
  ret <16 x float> %blend
}
772
; Merge-masked float multiply: lanes where %mask1 is non-zero take %i * %j,
; the remaining lanes keep %dst.
; Expected lowering is vptestmd followed by a vmulps writing zmm0 under {%k1}.
define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmulps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
; CHECK-NEXT:    vmulps %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %active = icmp ne <16 x i32> %mask1, zeroinitializer
  %prod = fmul <16 x float> %i, %j
  %blend = select <16 x i1> %active, <16 x float> %prod, <16 x float> %dst
  ret <16 x float> %blend
}
786
; Merge-masked float minimum written as fcmp olt + select; lanes where
; %mask1 is zero keep %dst.
; The compare/select pair is expected to be matched to vminps under {%k1}.
define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vminps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
; CHECK-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT:    retq
  %active = icmp ne <16 x i32> %mask1, zeroinitializer
  %is.lt = fcmp olt <16 x float> %i, %j
  %lo = select <16 x i1> %is.lt, <16 x float> %i, <16 x float> %j
  %blend = select <16 x i1> %active, <16 x float> %lo, <16 x float> %dst
  ret <16 x float> %blend
}
801
; Merge-masked double minimum written as fcmp olt + select, with an
; <8 x i32> mask source; lanes where %mask1 is zero keep %dst.
; Without +avx512vl the checks show the 256-bit mask widened to zmm before
; vptestmd; with +avx512vl vptestmd operates on ymm directly. All
; configurations then expect vminpd under {%k1}.
define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i32> %mask1) nounwind readnone {
; AVX512F-LABEL: test_mask_vminpd:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm3 killed $ymm3 def $zmm3
; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: test_mask_vminpd:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vptestmd %ymm3, %ymm3, %k1
; AVX512VL-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: test_mask_vminpd:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm3 killed $ymm3 def $zmm3
; AVX512BW-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512BW-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: test_mask_vminpd:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    # kill: def $ymm3 killed $ymm3 def $zmm3
; AVX512DQ-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512DQ-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512DQ-NEXT:    retq
;
; SKX-LABEL: test_mask_vminpd:
; SKX:       # %bb.0:
; SKX-NEXT:    vptestmd %ymm3, %ymm3, %k1
; SKX-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
; SKX-NEXT:    retq
  %active = icmp ne <8 x i32> %mask1, zeroinitializer
  %is.lt = fcmp olt <8 x double> %i, %j
  %lo = select <8 x i1> %is.lt, <8 x double> %i, <8 x double> %j
  %blend = select <8 x i1> %active, <8 x double> %lo, <8 x double> %dst
  ret <8 x double> %blend
}
843
; Merge-masked 512-bit single-precision maximum.
; The maximum is written as "fcmp ogt" followed by a select of %i or %j; the
; outer select keeps %dst in lanes where %mask1 is zero.
; The assertions expect vptestmd to build %k1 and the compare/select pair to
; become a single vmaxps writing %zmm0 under {%k1}.
844define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i,
845; CHECK-LABEL: test_mask_vmaxps:
846; CHECK:       # %bb.0:
847; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
848; CHECK-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
849; CHECK-NEXT:    retq
850                                     <16 x float> %j, <16 x i32> %mask1)
851                                     nounwind readnone {
852  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
853  %cmp_res = fcmp ogt <16 x float> %i, %j
854  %max = select <16 x i1> %cmp_res, <16 x float> %i, <16 x float> %j
855  %r = select <16 x i1> %mask, <16 x float> %max, <16 x float> %dst
856  ret <16 x float> %r
857}
858
; Merge-masked 512-bit double-precision maximum with a 256-bit mask source.
; The data vectors are <8 x double> but %mask1 is <8 x i32>, so the mask
; vector is only a ymm register wide. The expected code therefore differs per
; RUN configuration:
;   * AVX512F, AVX512BW and AVX512DQ runs: %ymm3 is treated as %zmm3 (see the
;     "kill" annotation) and tested with the 512-bit vptestmd.
;   * AVX512VL and SKX runs (the two that enable +avx512vl): %ymm3 is tested
;     directly with the 256-bit vptestmd.
; In every configuration the fcmp ogt / select pair is expected to become one
; vmaxpd writing %zmm0 under {%k1}.
859define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
860; AVX512F-LABEL: test_mask_vmaxpd:
861; AVX512F:       # %bb.0:
862; AVX512F-NEXT:    # kill: def $ymm3 killed $ymm3 def $zmm3
863; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
864; AVX512F-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
865; AVX512F-NEXT:    retq
866;
867; AVX512VL-LABEL: test_mask_vmaxpd:
868; AVX512VL:       # %bb.0:
869; AVX512VL-NEXT:    vptestmd %ymm3, %ymm3, %k1
870; AVX512VL-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
871; AVX512VL-NEXT:    retq
872;
873; AVX512BW-LABEL: test_mask_vmaxpd:
874; AVX512BW:       # %bb.0:
875; AVX512BW-NEXT:    # kill: def $ymm3 killed $ymm3 def $zmm3
876; AVX512BW-NEXT:    vptestmd %zmm3, %zmm3, %k1
877; AVX512BW-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
878; AVX512BW-NEXT:    retq
879;
880; AVX512DQ-LABEL: test_mask_vmaxpd:
881; AVX512DQ:       # %bb.0:
882; AVX512DQ-NEXT:    # kill: def $ymm3 killed $ymm3 def $zmm3
883; AVX512DQ-NEXT:    vptestmd %zmm3, %zmm3, %k1
884; AVX512DQ-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
885; AVX512DQ-NEXT:    retq
886;
887; SKX-LABEL: test_mask_vmaxpd:
888; SKX:       # %bb.0:
889; SKX-NEXT:    vptestmd %ymm3, %ymm3, %k1
890; SKX-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
891; SKX-NEXT:    retq
892                                     <8 x double> %j, <8 x i32> %mask1)
893                                     nounwind readnone {
894  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
895  %cmp_res = fcmp ogt <8 x double> %i, %j
896  %max = select <8 x i1> %cmp_res, <8 x double> %i, <8 x double> %j
897  %r = select <8 x i1> %mask, <8 x double> %max, <8 x double> %dst
898  ret <8 x double> %r
899}
900
; Merge-masked 512-bit single-precision subtract.
; Lanes where %mask1 is non-zero take %i - %j; all other lanes keep %dst.
; The assertions expect vptestmd to build %k1 and a single vsubps writing
; %zmm0 under {%k1}.
901define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i,
902; CHECK-LABEL: test_mask_vsubps:
903; CHECK:       # %bb.0:
904; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
905; CHECK-NEXT:    vsubps %zmm2, %zmm1, %zmm0 {%k1}
906; CHECK-NEXT:    retq
907                                     <16 x float> %j, <16 x i32> %mask1)
908                                     nounwind readnone {
909  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
910  %x = fsub <16 x float> %i, %j
911  %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
912  ret <16 x float> %r
913}
914
; Merge-masked 512-bit single-precision divide.
; Lanes where %mask1 is non-zero take %i / %j; all other lanes keep %dst.
; The assertions expect vptestmd to build %k1 and a single vdivps writing
; %zmm0 under {%k1}.
915define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i,
916; CHECK-LABEL: test_mask_vdivps:
917; CHECK:       # %bb.0:
918; CHECK-NEXT:    vptestmd %zmm3, %zmm3, %k1
919; CHECK-NEXT:    vdivps %zmm2, %zmm1, %zmm0 {%k1}
920; CHECK-NEXT:    retq
921                                     <16 x float> %j, <16 x i32> %mask1)
922                                     nounwind readnone {
923  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
924  %x = fdiv <16 x float> %i, %j
925  %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
926  ret <16 x float> %r
927}
928
; Merge-masked 512-bit double-precision add.
; The mask source is <8 x i64>, so the mask is built with the quadword test
; vptestmq rather than vptestmd. Lanes where %mask1 is non-zero take %i + %j;
; all other lanes keep %dst, which is expected to be expressed purely as the
; {%k1} write mask on vaddpd.
929define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i,
930; CHECK-LABEL: test_mask_vaddpd:
931; CHECK:       # %bb.0:
932; CHECK-NEXT:    vptestmq %zmm3, %zmm3, %k1
933; CHECK-NEXT:    vaddpd %zmm2, %zmm1, %zmm0 {%k1}
934; CHECK-NEXT:    retq
935                                     <8 x double> %j, <8 x i64> %mask1)
936                                     nounwind readnone {
937  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
938  %x = fadd <8 x double> %i, %j
939  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %dst
940  ret <8 x double> %r
941}
942
; Zero-masked 512-bit double-precision add.
; There is no passthrough operand: lanes where %mask1 is zero select
; zeroinitializer. The assertions expect this to be expressed with the
; zeroing form of the write mask, {%k1} {z}, on a single vaddpd.
943define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j,
944; CHECK-LABEL: test_maskz_vaddpd:
945; CHECK:       # %bb.0:
946; CHECK-NEXT:    vptestmq %zmm2, %zmm2, %k1
947; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z}
948; CHECK-NEXT:    retq
949                                      <8 x i64> %mask1) nounwind readnone {
950  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
951  %x = fadd <8 x double> %i, %j
952  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer
953  ret <8 x double> %r
954}
955
; Merge-masked 512-bit double-precision add with a folded memory operand.
; The second addend is a full <8 x double> load from %j (declared align 8).
; The assertions expect no separate load instruction: the load appears as the
; (%rdi) memory operand of the masked vaddpd, and lanes where %mask1 is zero
; keep %dst.
956define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i,
957; CHECK-LABEL: test_mask_fold_vaddpd:
958; CHECK:       # %bb.0:
959; CHECK-NEXT:    vptestmq %zmm2, %zmm2, %k1
960; CHECK-NEXT:    vaddpd (%rdi), %zmm1, %zmm0 {%k1}
961; CHECK-NEXT:    retq
962                                     ptr %j,  <8 x i64> %mask1)
963                                     nounwind {
964  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
965  %tmp = load <8 x double>, ptr %j, align 8
966  %x = fadd <8 x double> %i, %tmp
967  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %dst
968  ret <8 x double> %r
969}
970
; Zero-masked 512-bit double-precision add with a folded memory operand.
; Combines the two previous variations: the <8 x double> load from %j is
; expected to appear as the (%rdi) operand of vaddpd, and the select against
; zeroinitializer is expected to become the zeroing write mask {%k1} {z}.
971define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, ptr %j,
972; CHECK-LABEL: test_maskz_fold_vaddpd:
973; CHECK:       # %bb.0:
974; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1
975; CHECK-NEXT:    vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z}
976; CHECK-NEXT:    retq
977                                      <8 x i64> %mask1) nounwind {
978  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
979  %tmp = load <8 x double>, ptr %j, align 8
980  %x = fadd <8 x double> %i, %tmp
981  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer
982  ret <8 x double> %r
983}
984
; Unmasked 512-bit double-precision add with an embedded broadcast operand.
; A single double is loaded from %j and splatted to all eight lanes with the
; insertelement + shufflevector(zeroinitializer mask) idiom, then added to %i.
; The assertions expect the load and the splat to collapse into the
; (%rdi){1to8} operand of a single vaddpd.
985define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, ptr %j) nounwind {
986; CHECK-LABEL: test_broadcast_vaddpd:
987; CHECK:       # %bb.0:
988; CHECK-NEXT:    vaddpd (%rdi){1to8}, %zmm0, %zmm0
989; CHECK-NEXT:    retq
990  %tmp = load double, ptr %j
991  %b = insertelement <8 x double> undef, double %tmp, i32 0
992  %c = shufflevector <8 x double> %b, <8 x double> undef,
993                     <8 x i32> zeroinitializer
994  %x = fadd <8 x double> %c, %i
995  ret <8 x double> %x
996}
997
; Merge-masked 512-bit double-precision add with an embedded broadcast operand.
; A double loaded from %j is splatted (insertelement + zero-mask shufflevector)
; and added to %i. Lanes where %mask1 is zero fall back to %i itself, so the
; passthrough value is also one of the addends. The assertions accordingly
; expect %zmm1 to be copied into %zmm0 first (vmovapd) and the masked
; vaddpd (%rdi){1to8} to then update only the selected lanes of %zmm0.
; NOTE(review): %dst is declared but never used; the select uses %i as its
; false operand, unlike the sibling test_mask_* functions. This may be
; deliberate (it exercises the passthrough-equals-source case) - confirm
; before changing, since the assertions above depend on it.
998define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i,
999; CHECK-LABEL: test_mask_broadcast_vaddpd:
1000; CHECK:       # %bb.0:
1001; CHECK-NEXT:    vmovapd %zmm1, %zmm0
1002; CHECK-NEXT:    vptestmq %zmm2, %zmm2, %k1
1003; CHECK-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm0 {%k1}
1004; CHECK-NEXT:    retq
1005                                      ptr %j, <8 x i64> %mask1) nounwind {
1006  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
1007  %tmp = load double, ptr %j
1008  %b = insertelement <8 x double> undef, double %tmp, i32 0
1009  %c = shufflevector <8 x double> %b, <8 x double> undef,
1010                     <8 x i32> zeroinitializer
1011  %x = fadd <8 x double> %c, %i
1012  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %i
1013  ret <8 x double> %r
1014}
1015
; Zero-masked 512-bit double-precision add with an embedded broadcast operand.
; A double loaded from %j is splatted (insertelement + zero-mask shufflevector)
; and added to %i; lanes where %mask1 is zero select zeroinitializer.
; The assertions expect a single vaddpd that carries both the (%rdi){1to8}
; broadcast operand and the zeroing write mask {%k1} {z}.
1016define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, ptr %j,
1017; CHECK-LABEL: test_maskz_broadcast_vaddpd:
1018; CHECK:       # %bb.0:
1019; CHECK-NEXT:    vptestmq %zmm1, %zmm1, %k1
1020; CHECK-NEXT:    vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
1021; CHECK-NEXT:    retq
1022                                       <8 x i64> %mask1) nounwind {
1023  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
1024  %tmp = load double, ptr %j
1025  %b = insertelement <8 x double> undef, double %tmp, i32 0
1026  %c = shufflevector <8 x double> %b, <8 x double> undef,
1027                     <8 x i32> zeroinitializer
1028  %x = fadd <8 x double> %c, %i
1029  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer
1030  ret <8 x double> %r
1031}
1032
; 512-bit single-precision negation lowered to a bitwise XOR.
; The IR subtracts %a from a splat of -0.0. Every configuration is expected to
; emit one XOR of %zmm0 with a constant-pool scalar broadcast via {1to16}
; (presumably the sign-bit pattern of -0.0 - the constant itself is hidden
; behind the LCPI regex). Only the mnemonic differs:
;   * AVX512F, AVX512VL and AVX512BW runs use the integer form vpxord.
;   * AVX512DQ and SKX runs (the two that enable +avx512dq) use vxorps.
1033define <16 x float>  @test_fxor(<16 x float> %a) {
1034; AVX512F-LABEL: test_fxor:
1035; AVX512F:       # %bb.0:
1036; AVX512F-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
1037; AVX512F-NEXT:    retq
1038;
1039; AVX512VL-LABEL: test_fxor:
1040; AVX512VL:       # %bb.0:
1041; AVX512VL-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
1042; AVX512VL-NEXT:    retq
1043;
1044; AVX512BW-LABEL: test_fxor:
1045; AVX512BW:       # %bb.0:
1046; AVX512BW-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
1047; AVX512BW-NEXT:    retq
1048;
1049; AVX512DQ-LABEL: test_fxor:
1050; AVX512DQ:       # %bb.0:
1051; AVX512DQ-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
1052; AVX512DQ-NEXT:    retq
1053;
1054; SKX-LABEL: test_fxor:
1055; SKX:       # %bb.0:
1056; SKX-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
1057; SKX-NEXT:    retq
1058
1059  %res = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
1060  ret <16 x float>%res
1061}
1062
; 256-bit single-precision negation lowered to a bitwise XOR.
; Same IR shape as test_fxor (a splat of -0.0 minus %a) but on <8 x float>,
; so the result lives in a ymm register. The expected code differs per RUN
; configuration:
;   * AVX512F, AVX512BW and AVX512DQ runs: the -0.0 splat is first
;     materialised in %ymm1 with vbroadcastss, then XORed in with a
;     register-register vxorps.
;   * AVX512VL run: a single vpxord with a {1to8} broadcast memory operand.
;   * SKX run: a single vxorps with a {1to8} broadcast memory operand.
1063define <8 x float>  @test_fxor_8f32(<8 x float> %a) {
1064; AVX512F-LABEL: test_fxor_8f32:
1065; AVX512F:       # %bb.0:
1066; AVX512F-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1067; AVX512F-NEXT:    vxorps %ymm1, %ymm0, %ymm0
1068; AVX512F-NEXT:    retq
1069;
1070; AVX512VL-LABEL: test_fxor_8f32:
1071; AVX512VL:       # %bb.0:
1072; AVX512VL-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
1073; AVX512VL-NEXT:    retq
1074;
1075; AVX512BW-LABEL: test_fxor_8f32:
1076; AVX512BW:       # %bb.0:
1077; AVX512BW-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1078; AVX512BW-NEXT:    vxorps %ymm1, %ymm0, %ymm0
1079; AVX512BW-NEXT:    retq
1080;
1081; AVX512DQ-LABEL: test_fxor_8f32:
1082; AVX512DQ:       # %bb.0:
1083; AVX512DQ-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1084; AVX512DQ-NEXT:    vxorps %ymm1, %ymm0, %ymm0
1085; AVX512DQ-NEXT:    retq
1086;
1087; SKX-LABEL: test_fxor_8f32:
1088; SKX:       # %bb.0:
1089; SKX-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
1090; SKX-NEXT:    retq
1091  %res = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
1092  ret <8 x float>%res
1093}
1094
; 512-bit double-precision absolute value lowered to a bitwise AND.
; The body is a single call to the llvm.fabs.v8f64 intrinsic (declared right
; after the function). Every configuration is expected to emit one AND of
; %zmm0 with a constant-pool scalar broadcast via {1to8} (presumably a mask
; that clears the sign bit - the constant itself is hidden behind the LCPI
; regex). Only the mnemonic differs:
;   * AVX512F, AVX512VL and AVX512BW runs use the integer form vpandq.
;   * AVX512DQ and SKX runs (the two that enable +avx512dq) use vandpd.
1095define <8 x double> @fabs_v8f64(<8 x double> %p)
1096; AVX512F-LABEL: fabs_v8f64:
1097; AVX512F:       # %bb.0:
1098; AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
1099; AVX512F-NEXT:    retq
1100;
1101; AVX512VL-LABEL: fabs_v8f64:
1102; AVX512VL:       # %bb.0:
1103; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
1104; AVX512VL-NEXT:    retq
1105;
1106; AVX512BW-LABEL: fabs_v8f64:
1107; AVX512BW:       # %bb.0:
1108; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
1109; AVX512BW-NEXT:    retq
1110;
1111; AVX512DQ-LABEL: fabs_v8f64:
1112; AVX512DQ:       # %bb.0:
1113; AVX512DQ-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
1114; AVX512DQ-NEXT:    retq
1115;
1116; SKX-LABEL: fabs_v8f64:
1117; SKX:       # %bb.0:
1118; SKX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
1119; SKX-NEXT:    retq
1120{
1121  %t = call <8 x double> @llvm.fabs.v8f64(<8 x double> %p)
1122  ret <8 x double> %t
1123}
1124declare <8 x double> @llvm.fabs.v8f64(<8 x double> %p)
1125
; 512-bit single-precision absolute value lowered to a bitwise AND.
; The body is a single call to the llvm.fabs.v16f32 intrinsic (declared right
; after the function). Every configuration is expected to emit one AND of
; %zmm0 with a constant-pool scalar broadcast via {1to16} (presumably a mask
; that clears the sign bit - the constant itself is hidden behind the LCPI
; regex). Only the mnemonic differs:
;   * AVX512F, AVX512VL and AVX512BW runs use the integer form vpandd.
;   * AVX512DQ and SKX runs (the two that enable +avx512dq) use vandps.
1126define <16 x float> @fabs_v16f32(<16 x float> %p)
1127; AVX512F-LABEL: fabs_v16f32:
1128; AVX512F:       # %bb.0:
1129; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
1130; AVX512F-NEXT:    retq
1131;
1132; AVX512VL-LABEL: fabs_v16f32:
1133; AVX512VL:       # %bb.0:
1134; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
1135; AVX512VL-NEXT:    retq
1136;
1137; AVX512BW-LABEL: fabs_v16f32:
1138; AVX512BW:       # %bb.0:
1139; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
1140; AVX512BW-NEXT:    retq
1141;
1142; AVX512DQ-LABEL: fabs_v16f32:
1143; AVX512DQ:       # %bb.0:
1144; AVX512DQ-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
1145; AVX512DQ-NEXT:    retq
1146;
1147; SKX-LABEL: fabs_v16f32:
1148; SKX:       # %bb.0:
1149; SKX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
1150; SKX-NEXT:    retq
1151{
1152  %t = call <16 x float> @llvm.fabs.v16f32(<16 x float> %p)
1153  ret <16 x float> %t
1154}
1155declare <16 x float> @llvm.fabs.v16f32(<16 x float> %p)
1156
; Masked integer increment: lanes where %mask1 is non-zero become %i + 1, the
; others keep %i.
; The assertions expect the constant 1 never to be loaded. Instead:
;   1. vptestmd builds %k1 from %zmm1 (this must come first, because the next
;      instruction overwrites %zmm1);
;   2. vpternlogd $255 rewrites %zmm1 (imm8 0xFF in the ternary-logic truth
;      table produces all-ones, i.e. -1 in every lane);
;   3. vpsubd subtracts that -1 from %zmm0 under {%k1}, which adds 1.
1157define <16 x i32> @masked_inc_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
1158; CHECK-LABEL: masked_inc_test:
1159; CHECK:       # %bb.0:
1160; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
1161; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
1162; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0 {%k1}
1163; CHECK-NEXT:    retq
1164  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
1165  %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1166  %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
1167  ret <16 x i32> %r
1168}
1169
; Masked integer decrement: lanes where %mask1 is non-zero become %i - 1, the
; others keep %i.
; The assertions expect the constant 1 never to be loaded. Instead:
;   1. vptestmd builds %k1 from %zmm1 (this must come first, because the next
;      instruction overwrites %zmm1);
;   2. vpternlogd $255 rewrites %zmm1 (imm8 0xFF in the ternary-logic truth
;      table produces all-ones, i.e. -1 in every lane);
;   3. vpaddd adds that -1 to %zmm0 under {%k1}, which subtracts 1.
1170define <16 x i32> @masked_dec_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
1171; CHECK-LABEL: masked_dec_test:
1172; CHECK:       # %bb.0:
1173; CHECK-NEXT:    vptestmd %zmm1, %zmm1, %k1
1174; CHECK-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
1175; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1}
1176; CHECK-NEXT:    retq
1177  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
1178  %x = sub <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1179  %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
1180  ret <16 x i32> %r
1181}
1182