xref: /llvm-project/llvm/test/Transforms/InstCombine/X86/x86-avx512-inseltpoison.ll (revision fda7649b3c3797ddbb35a46746ae7876ab147612)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -passes=instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
3target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
4
5declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
6
7define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
8; Unmasked add.ss (mask -1, rounding operand 4): the assertions expect the call to become a scalar fadd of element 0 of %a and %b inserted back into %a, with the constant inserts into elements 1-3 of %b removed.
9; CHECK-LABEL: @test_add_ss(
10; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
11; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
12; CHECK-NEXT:    [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]]
13; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0
14; CHECK-NEXT:    ret <4 x float> [[TMP4]]
15;
16  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
17  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
18  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
19  %4 = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 4)
20  ret <4 x float> %4
21}
22
23define <4 x float> @test_add_ss_round(<4 x float> %a, <4 x float> %b) {
24; Unmasked add.ss with rounding operand 8: the assertions expect the intrinsic call to be kept, but fed %b directly (constant inserts into elements 1-3 removed) and with elements 1-3 of the third operand turned into poison.
25; CHECK-LABEL: @test_add_ss_round(
26; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> <float undef, float poison, float poison, float poison>, i8 -1, i32 8)
27; CHECK-NEXT:    ret <4 x float> [[TMP1]]
28;
29  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
30  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
31  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
32  %4 = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 8)
33  ret <4 x float> %4
34}
35
36define <4 x float> @test_add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
37; Masked add.ss (variable %mask, rounding operand 4): the assertions expect a scalar fadd on element 0, a select on %mask truncated to i1 between that sum and element 0 of %c, and the result inserted into %a; the constant inserts into elements 1-3 of %c are removed.
38; CHECK-LABEL: @test_add_ss_mask(
39; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
40; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
41; CHECK-NEXT:    [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]]
42; CHECK-NEXT:    [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1
43; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
44; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP4]], float [[TMP3]], float [[TMP5]]
45; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0
46; CHECK-NEXT:    ret <4 x float> [[TMP7]]
47;
48  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
49  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
50  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
51  %4 = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4)
52  ret <4 x float> %4
53}
54
55define <4 x float> @test_add_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
56; Masked add.ss (variable %mask) with rounding operand 8: the assertions expect the intrinsic call to be kept, with %c passed directly as the third operand (constant inserts into its elements 1-3 removed).
57; CHECK-LABEL: @test_add_ss_mask_round(
58; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
59; CHECK-NEXT:    ret <4 x float> [[TMP1]]
60;
61  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
62  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
63  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
64  %4 = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 8)
65  ret <4 x float> %4
66}
67
68define float @test_add_ss_1(float %a, float %b) {
69; Extracts element 1 of an unmasked add.ss result (rounding operand 8): the assertions expect the whole function to fold to the constant 1.0, the value inserted into element 1 of the first operand.
70; CHECK-LABEL: @test_add_ss_1(
71; CHECK-NEXT:    ret float 1.000000e+00
72;
73  %1 = insertelement <4 x float> poison, float %a, i32 0
74  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
75  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
76  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
77  %5 = insertelement <4 x float> poison, float %b, i32 0
78  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
79  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
80  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
81  %9 = tail call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %4, <4 x float> %8, <4 x float> undef, i8 -1, i32 8)
82  %10 = extractelement <4 x float> %9, i32 1
83  ret float %10
84}
85
86declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
87
88define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
89; Unmasked add.sd (mask -1, rounding operand 4): the assertions expect the call to become a scalar fadd of element 0 of %a and %b inserted back into %a, with the constant insert into element 1 of %b removed.
90; CHECK-LABEL: @test_add_sd(
91; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
92; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
93; CHECK-NEXT:    [[TMP3:%.*]] = fadd double [[TMP1]], [[TMP2]]
94; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0
95; CHECK-NEXT:    ret <2 x double> [[TMP4]]
96;
97  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
98  %2 = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4)
99  ret <2 x double> %2
100}
101
102define <2 x double> @test_add_sd_round(<2 x double> %a, <2 x double> %b) {
103; Unmasked add.sd with rounding operand 8: the assertions expect the intrinsic call to be kept, but fed %b directly (constant insert into element 1 removed) and with element 1 of the third operand turned into poison.
104; CHECK-LABEL: @test_add_sd_round(
105; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> <double undef, double poison>, i8 -1, i32 8)
106; CHECK-NEXT:    ret <2 x double> [[TMP1]]
107;
108  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
109  %2 = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 8)
110  ret <2 x double> %2
111}
112
113define <2 x double> @test_add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
114; Masked add.sd (variable %mask, rounding operand 4): the assertions expect a scalar fadd on element 0, a select on %mask truncated to i1 between that sum and element 0 of %c, and the result inserted into %a; the constant insert into element 1 of %c is removed.
115; CHECK-LABEL: @test_add_sd_mask(
116; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
117; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
118; CHECK-NEXT:    [[TMP3:%.*]] = fadd double [[TMP1]], [[TMP2]]
119; CHECK-NEXT:    [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1
120; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
121; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP4]], double [[TMP3]], double [[TMP5]]
122; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0
123; CHECK-NEXT:    ret <2 x double> [[TMP7]]
124;
125  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
126  %2 = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4)
127  ret <2 x double> %2
128}
129
130define <2 x double> @test_add_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
131; Masked add.sd (variable %mask) with rounding operand 8: the assertions expect the intrinsic call to be kept, with %c passed directly as the third operand (constant insert into its element 1 removed).
132; CHECK-LABEL: @test_add_sd_mask_round(
133; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
134; CHECK-NEXT:    ret <2 x double> [[TMP1]]
135;
136  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
137  %2 = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 8)
138  ret <2 x double> %2
139}
140
141define double @test_add_sd_1(double %a, double %b) {
142; Extracts element 1 of an unmasked add.sd result (rounding operand 8): the assertions expect the whole function to fold to the constant 1.0, the value inserted into element 1 of the first operand.
143; CHECK-LABEL: @test_add_sd_1(
144; CHECK-NEXT:    ret double 1.000000e+00
145;
146  %1 = insertelement <2 x double> poison, double %a, i32 0
147  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
148  %3 = insertelement <2 x double> poison, double %b, i32 0
149  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
150  %5 = tail call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> %2, <2 x double> %4, <2 x double> undef, i8 -1, i32 8)
151  %6 = extractelement <2 x double> %5, i32 1
152  ret double %6
153}
154
155declare <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
156
157define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
158; Unmasked sub.ss (mask -1, rounding operand 4): the assertions expect the call to become a scalar fsub of element 0 of %a and %b inserted back into %a, with the constant inserts into elements 1-3 of %b removed.
159; CHECK-LABEL: @test_sub_ss(
160; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
161; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
162; CHECK-NEXT:    [[TMP3:%.*]] = fsub float [[TMP1]], [[TMP2]]
163; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0
164; CHECK-NEXT:    ret <4 x float> [[TMP4]]
165;
166  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
167  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
168  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
169  %4 = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 4)
170  ret <4 x float> %4
171}
172
173define <4 x float> @test_sub_ss_round(<4 x float> %a, <4 x float> %b) {
174; Unmasked sub.ss with rounding operand 8: the assertions expect the intrinsic call to be kept, but fed %b directly (constant inserts into elements 1-3 removed) and with elements 1-3 of the third operand turned into poison.
175; CHECK-LABEL: @test_sub_ss_round(
176; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> <float undef, float poison, float poison, float poison>, i8 -1, i32 8)
177; CHECK-NEXT:    ret <4 x float> [[TMP1]]
178;
179  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
180  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
181  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
182  %4 = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 8)
183  ret <4 x float> %4
184}
185
186define <4 x float> @test_sub_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
187; Masked sub.ss (variable %mask, rounding operand 4): the assertions expect a scalar fsub on element 0, a select on %mask truncated to i1 between that difference and element 0 of %c, and the result inserted into %a; the constant inserts into elements 1-3 of %c are removed.
188; CHECK-LABEL: @test_sub_ss_mask(
189; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
190; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
191; CHECK-NEXT:    [[TMP3:%.*]] = fsub float [[TMP1]], [[TMP2]]
192; CHECK-NEXT:    [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1
193; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
194; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP4]], float [[TMP3]], float [[TMP5]]
195; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0
196; CHECK-NEXT:    ret <4 x float> [[TMP7]]
197;
198  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
199  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
200  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
201  %4 = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4)
202  ret <4 x float> %4
203}
204
205define <4 x float> @test_sub_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
206; Masked sub.ss (variable %mask) with rounding operand 8: the assertions expect the intrinsic call to be kept, with %c passed directly as the third operand (constant inserts into its elements 1-3 removed).
207; CHECK-LABEL: @test_sub_ss_mask_round(
208; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
209; CHECK-NEXT:    ret <4 x float> [[TMP1]]
210;
211  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
212  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
213  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
214  %4 = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 8)
215  ret <4 x float> %4
216}
217
218define float @test_sub_ss_1(float %a, float %b) {
219; Extracts element 1 of an unmasked sub.ss result (rounding operand 8): the assertions expect the whole function to fold to the constant 1.0, the value inserted into element 1 of the first operand.
220; CHECK-LABEL: @test_sub_ss_1(
221; CHECK-NEXT:    ret float 1.000000e+00
222;
223  %1 = insertelement <4 x float> poison, float %a, i32 0
224  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
225  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
226  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
227  %5 = insertelement <4 x float> poison, float %b, i32 0
228  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
229  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
230  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
231  %9 = tail call <4 x float> @llvm.x86.avx512.mask.sub.ss.round(<4 x float> %4, <4 x float> %8, <4 x float> undef, i8 -1, i32 8)
232  %10 = extractelement <4 x float> %9, i32 1
233  ret float %10
234}
235
236declare <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
237
238define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
239; Unmasked sub.sd (mask -1, rounding operand 4): the assertions expect the call to become a scalar fsub of element 0 of %a and %b inserted back into %a, with the constant insert into element 1 of %b removed.
240; CHECK-LABEL: @test_sub_sd(
241; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
242; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
243; CHECK-NEXT:    [[TMP3:%.*]] = fsub double [[TMP1]], [[TMP2]]
244; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0
245; CHECK-NEXT:    ret <2 x double> [[TMP4]]
246;
247  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
248  %2 = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4)
249  ret <2 x double> %2
250}
251
252define <2 x double> @test_sub_sd_round(<2 x double> %a, <2 x double> %b) {
253; Unmasked sub.sd with rounding operand 8: the assertions expect the intrinsic call to be kept, but fed %b directly (constant insert into element 1 removed) and with element 1 of the third operand turned into poison.
254; CHECK-LABEL: @test_sub_sd_round(
255; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> <double undef, double poison>, i8 -1, i32 8)
256; CHECK-NEXT:    ret <2 x double> [[TMP1]]
257;
258  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
259  %2 = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 8)
260  ret <2 x double> %2
261}
262
263define <2 x double> @test_sub_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
264; Masked sub.sd (variable %mask, rounding operand 4): the assertions expect a scalar fsub on element 0, a select on %mask truncated to i1 between that difference and element 0 of %c, and the result inserted into %a; the constant insert into element 1 of %c is removed.
265; CHECK-LABEL: @test_sub_sd_mask(
266; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
267; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
268; CHECK-NEXT:    [[TMP3:%.*]] = fsub double [[TMP1]], [[TMP2]]
269; CHECK-NEXT:    [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1
270; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
271; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP4]], double [[TMP3]], double [[TMP5]]
272; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0
273; CHECK-NEXT:    ret <2 x double> [[TMP7]]
274;
275  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
276  %2 = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4)
277  ret <2 x double> %2
278}
279
280define <2 x double> @test_sub_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
281; Masked sub.sd (variable %mask) with rounding operand 8: the assertions expect the intrinsic call to be kept, with %c passed directly as the third operand (constant insert into its element 1 removed).
282; CHECK-LABEL: @test_sub_sd_mask_round(
283; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
284; CHECK-NEXT:    ret <2 x double> [[TMP1]]
285;
286  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
287  %2 = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 8)
288  ret <2 x double> %2
289}
290
291define double @test_sub_sd_1(double %a, double %b) {
292; Extracts element 1 of an unmasked sub.sd result (rounding operand 8): the assertions expect the whole function to fold to the constant 1.0, the value inserted into element 1 of the first operand.
293; CHECK-LABEL: @test_sub_sd_1(
294; CHECK-NEXT:    ret double 1.000000e+00
295;
296  %1 = insertelement <2 x double> poison, double %a, i32 0
297  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
298  %3 = insertelement <2 x double> poison, double %b, i32 0
299  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
300  %5 = tail call <2 x double> @llvm.x86.avx512.mask.sub.sd.round(<2 x double> %2, <2 x double> %4, <2 x double> undef, i8 -1, i32 8)
301  %6 = extractelement <2 x double> %5, i32 1
302  ret double %6
303}
304
305declare <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
306
307define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
308; Unmasked mul.ss (mask -1, rounding operand 4): the assertions expect the call to become a scalar fmul of element 0 of %a and %b inserted back into %a, with the constant inserts into elements 1-3 of %b removed.
309; CHECK-LABEL: @test_mul_ss(
310; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
311; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
312; CHECK-NEXT:    [[TMP3:%.*]] = fmul float [[TMP1]], [[TMP2]]
313; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0
314; CHECK-NEXT:    ret <4 x float> [[TMP4]]
315;
316  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
317  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
318  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
319  %4 = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 4)
320  ret <4 x float> %4
321}
322
323define <4 x float> @test_mul_ss_round(<4 x float> %a, <4 x float> %b) {
324; Unmasked mul.ss with rounding operand 8: the assertions expect the intrinsic call to be kept, but fed %b directly (constant inserts into elements 1-3 removed) and with elements 1-3 of the third operand turned into poison.
325; CHECK-LABEL: @test_mul_ss_round(
326; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> <float undef, float poison, float poison, float poison>, i8 -1, i32 8)
327; CHECK-NEXT:    ret <4 x float> [[TMP1]]
328;
329  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
330  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
331  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
332  %4 = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 8)
333  ret <4 x float> %4
334}
335
336define <4 x float> @test_mul_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
337; Masked mul.ss (variable %mask, rounding operand 4): the assertions expect a scalar fmul on element 0, a select on %mask truncated to i1 between that product and element 0 of %c, and the result inserted into %a; the constant inserts into elements 1-3 of %c are removed.
338; CHECK-LABEL: @test_mul_ss_mask(
339; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
340; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
341; CHECK-NEXT:    [[TMP3:%.*]] = fmul float [[TMP1]], [[TMP2]]
342; CHECK-NEXT:    [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1
343; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
344; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP4]], float [[TMP3]], float [[TMP5]]
345; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0
346; CHECK-NEXT:    ret <4 x float> [[TMP7]]
347;
348  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
349  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
350  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
351  %4 = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4)
352  ret <4 x float> %4
353}
354
355define <4 x float> @test_mul_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
356; Masked mul.ss (variable %mask) with rounding operand 8: the assertions expect the intrinsic call to be kept, with %c passed directly as the third operand (constant inserts into its elements 1-3 removed).
357; CHECK-LABEL: @test_mul_ss_mask_round(
358; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
359; CHECK-NEXT:    ret <4 x float> [[TMP1]]
360;
361  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
362  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
363  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
364  %4 = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 8)
365  ret <4 x float> %4
366}
367
368define float @test_mul_ss_1(float %a, float %b) {
369; Extracts element 1 of an unmasked mul.ss result (rounding operand 8): the assertions expect the whole function to fold to the constant 1.0, the value inserted into element 1 of the first operand.
370; CHECK-LABEL: @test_mul_ss_1(
371; CHECK-NEXT:    ret float 1.000000e+00
372;
373  %1 = insertelement <4 x float> poison, float %a, i32 0
374  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
375  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
376  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
377  %5 = insertelement <4 x float> poison, float %b, i32 0
378  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
379  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
380  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
381  %9 = tail call <4 x float> @llvm.x86.avx512.mask.mul.ss.round(<4 x float> %4, <4 x float> %8, <4 x float> undef, i8 -1, i32 8)
382  %10 = extractelement <4 x float> %9, i32 1
383  ret float %10
384}
385
386declare <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
387
388define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
389; Unmasked mul.sd (mask -1, rounding operand 4): the assertions expect the call to become a scalar fmul of element 0 of %a and %b inserted back into %a, with the constant insert into element 1 of %b removed.
390; CHECK-LABEL: @test_mul_sd(
391; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
392; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
393; CHECK-NEXT:    [[TMP3:%.*]] = fmul double [[TMP1]], [[TMP2]]
394; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0
395; CHECK-NEXT:    ret <2 x double> [[TMP4]]
396;
397  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
398  %2 = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4)
399  ret <2 x double> %2
400}
401
402define <2 x double> @test_mul_sd_round(<2 x double> %a, <2 x double> %b) {
403; Unmasked mul.sd with rounding operand 8: the assertions expect the intrinsic call to be kept, but fed %b directly (constant insert into element 1 removed) and with element 1 of the third operand turned into poison.
404; CHECK-LABEL: @test_mul_sd_round(
405; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> <double undef, double poison>, i8 -1, i32 8)
406; CHECK-NEXT:    ret <2 x double> [[TMP1]]
407;
408  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
409  %2 = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 8)
410  ret <2 x double> %2
411}
412
413define <2 x double> @test_mul_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
414; Masked mul.sd (variable %mask, rounding operand 4): the assertions expect a scalar fmul on element 0, a select on %mask truncated to i1 between that product and element 0 of %c, and the result inserted into %a; the constant insert into element 1 of %c is removed.
415; CHECK-LABEL: @test_mul_sd_mask(
416; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
417; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
418; CHECK-NEXT:    [[TMP3:%.*]] = fmul double [[TMP1]], [[TMP2]]
419; CHECK-NEXT:    [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1
420; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
421; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP4]], double [[TMP3]], double [[TMP5]]
422; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0
423; CHECK-NEXT:    ret <2 x double> [[TMP7]]
424;
425  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
426  %2 = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4)
427  ret <2 x double> %2
428}
429
430define <2 x double> @test_mul_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
431; Masked mul.sd (variable %mask) with rounding operand 8: the assertions expect the intrinsic call to be kept, with %c passed directly as the third operand (constant insert into its element 1 removed).
432; CHECK-LABEL: @test_mul_sd_mask_round(
433; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
434; CHECK-NEXT:    ret <2 x double> [[TMP1]]
435;
436  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
437  %2 = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 8)
438  ret <2 x double> %2
439}
440
441define double @test_mul_sd_1(double %a, double %b) {
442; Extracts element 1 of an unmasked mul.sd result (rounding operand 8): the assertions expect the whole function to fold to the constant 1.0, the value inserted into element 1 of the first operand.
443; CHECK-LABEL: @test_mul_sd_1(
444; CHECK-NEXT:    ret double 1.000000e+00
445;
446  %1 = insertelement <2 x double> poison, double %a, i32 0
447  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
448  %3 = insertelement <2 x double> poison, double %b, i32 0
449  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
450  %5 = tail call <2 x double> @llvm.x86.avx512.mask.mul.sd.round(<2 x double> %2, <2 x double> %4, <2 x double> undef, i8 -1, i32 8)
451  %6 = extractelement <2 x double> %5, i32 1
452  ret double %6
453}
454
455declare <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
456
457define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
458; Unmasked div.ss (mask -1, rounding operand 4): the assertions expect the call to become a scalar fdiv of element 0 of %a by element 0 of %b inserted back into %a, with the constant inserts into elements 1-3 of %b removed.
459; CHECK-LABEL: @test_div_ss(
460; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
461; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
462; CHECK-NEXT:    [[TMP3:%.*]] = fdiv float [[TMP1]], [[TMP2]]
463; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[A]], float [[TMP3]], i64 0
464; CHECK-NEXT:    ret <4 x float> [[TMP4]]
465;
466  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
467  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
468  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
469  %4 = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 4)
470  ret <4 x float> %4
471}
472
473define <4 x float> @test_div_ss_round(<4 x float> %a, <4 x float> %b) {
474; Unmasked div.ss with rounding operand 8: the assertions expect the intrinsic call to be kept, but fed %b directly (constant inserts into elements 1-3 removed) and with elements 1-3 of the third operand turned into poison.
475; CHECK-LABEL: @test_div_ss_round(
476; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> <float undef, float poison, float poison, float poison>, i8 -1, i32 8)
477; CHECK-NEXT:    ret <4 x float> [[TMP1]]
478;
479  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
480  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
481  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
482  %4 = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 8)
483  ret <4 x float> %4
484}
485
486define <4 x float> @test_div_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
487; Masked div.ss (variable %mask, rounding operand 4): the assertions expect a scalar fdiv on element 0, a select on %mask truncated to i1 between that quotient and element 0 of %c, and the result inserted into %a; the constant inserts into elements 1-3 of %c are removed.
488; CHECK-LABEL: @test_div_ss_mask(
489; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
490; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
491; CHECK-NEXT:    [[TMP3:%.*]] = fdiv float [[TMP1]], [[TMP2]]
492; CHECK-NEXT:    [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1
493; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
494; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP4]], float [[TMP3]], float [[TMP5]]
495; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0
496; CHECK-NEXT:    ret <4 x float> [[TMP7]]
497;
498  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
499  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
500  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
501  %4 = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4)
502  ret <4 x float> %4
503}
504
505define <4 x float> @test_div_ss_mask_round(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
506; Masked div.ss (variable %mask) with rounding operand 8: the assertions expect the intrinsic call to be kept, with %c passed directly as the third operand (constant inserts into its elements 1-3 removed).
507; CHECK-LABEL: @test_div_ss_mask_round(
508; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
509; CHECK-NEXT:    ret <4 x float> [[TMP1]]
510;
511  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
512  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
513  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
514  %4 = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 8)
515  ret <4 x float> %4
516}
517
518define float @test_div_ss_1(float %a, float %b) {
519; Extracts element 1 of an unmasked div.ss result (rounding operand 8): the assertions expect the whole function to fold to the constant 1.0, the value inserted into element 1 of the first operand.
520; CHECK-LABEL: @test_div_ss_1(
521; CHECK-NEXT:    ret float 1.000000e+00
522;
523  %1 = insertelement <4 x float> poison, float %a, i32 0
524  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
525  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
526  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
527  %5 = insertelement <4 x float> poison, float %b, i32 0
528  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
529  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
530  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
531  %9 = tail call <4 x float> @llvm.x86.avx512.mask.div.ss.round(<4 x float> %4, <4 x float> %8, <4 x float> undef, i8 -1, i32 8)
532  %10 = extractelement <4 x float> %9, i32 1
533  ret float %10
534}
535
536declare <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
537
538define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
539; Unmasked div.sd (mask -1, rounding operand 4): the assertions expect the call to become a scalar fdiv of element 0 of %a by element 0 of %b inserted back into %a, with the constant insert into element 1 of %b removed.
540; CHECK-LABEL: @test_div_sd(
541; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
542; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
543; CHECK-NEXT:    [[TMP3:%.*]] = fdiv double [[TMP1]], [[TMP2]]
544; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[A]], double [[TMP3]], i64 0
545; CHECK-NEXT:    ret <2 x double> [[TMP4]]
546;
547  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
548  %2 = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4)
549  ret <2 x double> %2
550}
551
552define <2 x double> @test_div_sd_round(<2 x double> %a, <2 x double> %b) {
553; Unmasked div.sd with rounding operand 8: the assertions expect the intrinsic call to be kept, but fed %b directly (constant insert into element 1 removed) and with element 1 of the third operand turned into poison.
554; CHECK-LABEL: @test_div_sd_round(
555; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> <double undef, double poison>, i8 -1, i32 8)
556; CHECK-NEXT:    ret <2 x double> [[TMP1]]
557;
558  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
559  %2 = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 8)
560  ret <2 x double> %2
561}
562
563define <2 x double> @test_div_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
564; Masked div.sd (variable %mask, rounding operand 4): the assertions expect a scalar fdiv on element 0, a select on %mask truncated to i1 between that quotient and element 0 of %c, and the result inserted into %a; the constant insert into element 1 of %c is removed.
565; CHECK-LABEL: @test_div_sd_mask(
566; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
567; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
568; CHECK-NEXT:    [[TMP3:%.*]] = fdiv double [[TMP1]], [[TMP2]]
569; CHECK-NEXT:    [[TMP4:%.*]] = trunc i8 [[MASK:%.*]] to i1
570; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
571; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP4]], double [[TMP3]], double [[TMP5]]
572; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0
573; CHECK-NEXT:    ret <2 x double> [[TMP7]]
574;
575  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
576  %2 = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4)
577  ret <2 x double> %2
578}
579
580define <2 x double> @test_div_sd_mask_round(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
581; Masked div.sd (variable %mask) with rounding operand 8: the assertions expect the intrinsic call to be kept, with %c passed directly as the third operand (constant insert into its element 1 removed).
582; CHECK-LABEL: @test_div_sd_mask_round(
583; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 8)
584; CHECK-NEXT:    ret <2 x double> [[TMP1]]
585;
586  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
587  %2 = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 8)
588  ret <2 x double> %2
589}
590
591define double @test_div_sd_1(double %a, double %b) {
592; Extracts element 1 of an unmasked div.sd result (rounding operand 8): the assertions expect the whole function to fold to the constant 1.0, the value inserted into element 1 of the first operand.
593; CHECK-LABEL: @test_div_sd_1(
594; CHECK-NEXT:    ret double 1.000000e+00
595;
596  %1 = insertelement <2 x double> poison, double %a, i32 0
597  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
598  %3 = insertelement <2 x double> poison, double %b, i32 0
599  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
600  %5 = tail call <2 x double> @llvm.x86.avx512.mask.div.sd.round(<2 x double> %2, <2 x double> %4, <2 x double> undef, i8 -1, i32 8)
601  %6 = extractelement <2 x double> %5, i32 1
602  ret double %6
603}
604
605declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
606
; Scalar float max with mask i8 -1 and an undef passthrough. The call is
; expected to remain, but the inserts into lanes 1-3 of %b are expected to be
; removed (the call takes %b directly) and the passthrough is expected to
; become <undef, poison, poison, poison>.
607define <4 x float> @test_max_ss(<4 x float> %a, <4 x float> %b) {
608;
609; CHECK-LABEL: @test_max_ss(
610; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> <float undef, float poison, float poison, float poison>, i8 -1, i32 4)
611; CHECK-NEXT:    ret <4 x float> [[TMP1]]
612;
613  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
614  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
615  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
616  %4 = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 4)
617  ret <4 x float> %4
618}
619
; Scalar float max with a variable mask. The call is expected to remain; the
; inserts into lanes 1-3 of the passthrough %c are expected to be removed so
; that the call takes %c directly.
620define <4 x float> @test_max_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
621;
622; CHECK-LABEL: @test_max_ss_mask(
623; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
624; CHECK-NEXT:    ret <4 x float> [[TMP1]]
625;
626  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
627  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
628  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
629  %4 = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4)
630  ret <4 x float> %4
631}
632
; Only lane 1 of the scalar float max result is read. The expected output
; contains no call: the function is expected to return the constant 1.0 that
; was inserted into lane 1 of the first vector operand.
633define float @test_max_ss_1(float %a, float %b) {
634;
635; CHECK-LABEL: @test_max_ss_1(
636; CHECK-NEXT:    ret float 1.000000e+00
637;
638  %1 = insertelement <4 x float> poison, float %a, i32 0
639  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
640  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
641  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
642  %5 = insertelement <4 x float> poison, float %b, i32 0
643  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
644  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
645  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
646  %9 = tail call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> %4, <4 x float> %8, <4 x float> undef, i8 -1, i32 8)
647  %10 = extractelement <4 x float> %9, i32 1
648  ret float %10
649}
650
651declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
652
; Scalar double max with mask i8 -1 and an undef passthrough. The call is
; expected to remain, but the insert into lane 1 of %b is expected to be
; removed (the call takes %b directly) and the passthrough is expected to
; become <undef, poison>.
653define <2 x double> @test_max_sd(<2 x double> %a, <2 x double> %b) {
654;
655; CHECK-LABEL: @test_max_sd(
656; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> <double undef, double poison>, i8 -1, i32 4)
657; CHECK-NEXT:    ret <2 x double> [[TMP1]]
658;
659  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
660  %2 = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4)
661  ret <2 x double> %2
662}
663
; Scalar double max with a variable mask. The call is expected to remain; the
; insert into lane 1 of the passthrough %c is expected to be removed so that
; the call takes %c directly.
664define <2 x double> @test_max_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
665;
666; CHECK-LABEL: @test_max_sd_mask(
667; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
668; CHECK-NEXT:    ret <2 x double> [[TMP1]]
669;
670  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
671  %2 = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4)
672  ret <2 x double> %2
673}
674
; Only lane 1 of the scalar double max result is read. The expected output
; contains no call: the function is expected to return the constant 1.0 that
; was inserted into lane 1 of the first vector operand.
675define double @test_max_sd_1(double %a, double %b) {
676;
677; CHECK-LABEL: @test_max_sd_1(
678; CHECK-NEXT:    ret double 1.000000e+00
679;
680  %1 = insertelement <2 x double> poison, double %a, i32 0
681  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
682  %3 = insertelement <2 x double> poison, double %b, i32 0
683  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
684  %5 = tail call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> %2, <2 x double> %4, <2 x double> undef, i8 -1, i32 8)
685  %6 = extractelement <2 x double> %5, i32 1
686  ret double %6
687}
688
689declare <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32)
690
; Scalar float min with mask i8 -1 and an undef passthrough. The call is
; expected to remain, but the inserts into lanes 1-3 of %b are expected to be
; removed (the call takes %b directly) and the passthrough is expected to
; become <undef, poison, poison, poison>.
691define <4 x float> @test_min_ss(<4 x float> %a, <4 x float> %b) {
692;
693; CHECK-LABEL: @test_min_ss(
694; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> <float undef, float poison, float poison, float poison>, i8 -1, i32 4)
695; CHECK-NEXT:    ret <4 x float> [[TMP1]]
696;
697  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
698  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
699  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
700  %4 = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> %a, <4 x float> %3, <4 x float> undef, i8 -1, i32 4)
701  ret <4 x float> %4
702}
703
; Scalar float min with a variable mask. The call is expected to remain; the
; inserts into lanes 1-3 of the passthrough %c are expected to be removed so
; that the call takes %c directly.
704define <4 x float> @test_min_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
705;
706; CHECK-LABEL: @test_min_ss_mask(
707; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
708; CHECK-NEXT:    ret <4 x float> [[TMP1]]
709;
710  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
711  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
712  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
713  %4 = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> %a, <4 x float> %b, <4 x float> %3, i8 %mask, i32 4)
714  ret <4 x float> %4
715}
716
; Only lane 1 of the scalar float min result is read. The expected output
; contains no call: the function is expected to return the constant 1.0 that
; was inserted into lane 1 of the first vector operand.
717define float @test_min_ss_1(float %a, float %b) {
718;
719; CHECK-LABEL: @test_min_ss_1(
720; CHECK-NEXT:    ret float 1.000000e+00
721;
722  %1 = insertelement <4 x float> poison, float %a, i32 0
723  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
724  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
725  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
726  %5 = insertelement <4 x float> poison, float %b, i32 0
727  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
728  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
729  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
730  %9 = tail call <4 x float> @llvm.x86.avx512.mask.min.ss.round(<4 x float> %4, <4 x float> %8, <4 x float> undef, i8 -1, i32 8)
731  %10 = extractelement <4 x float> %9, i32 1
732  ret float %10
733}
734
735declare <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32)
736
; Scalar double min with mask i8 -1 and an undef passthrough. The call is
; expected to remain, but the insert into lane 1 of %b is expected to be
; removed (the call takes %b directly) and the passthrough is expected to
; become <undef, poison>.
737define <2 x double> @test_min_sd(<2 x double> %a, <2 x double> %b) {
738;
739; CHECK-LABEL: @test_min_sd(
740; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> <double undef, double poison>, i8 -1, i32 4)
741; CHECK-NEXT:    ret <2 x double> [[TMP1]]
742;
743  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
744  %2 = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> %a, <2 x double> %1, <2 x double> undef, i8 -1, i32 4)
745  ret <2 x double> %2
746}
747
; Scalar double min with a variable mask. The call is expected to remain; the
; insert into lane 1 of the passthrough %c is expected to be removed so that
; the call takes %c directly.
748define <2 x double> @test_min_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
749;
750; CHECK-LABEL: @test_min_sd_mask(
751; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 4)
752; CHECK-NEXT:    ret <2 x double> [[TMP1]]
753;
754  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
755  %2 = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> %a, <2 x double> %b, <2 x double> %1, i8 %mask, i32 4)
756  ret <2 x double> %2
757}
758
; Only lane 1 of the scalar double min result is read. The expected output
; contains no call: the function is expected to return the constant 1.0 that
; was inserted into lane 1 of the first vector operand.
759define double @test_min_sd_1(double %a, double %b) {
760;
761; CHECK-LABEL: @test_min_sd_1(
762; CHECK-NEXT:    ret double 1.000000e+00
763;
764  %1 = insertelement <2 x double> poison, double %a, i32 0
765  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
766  %3 = insertelement <2 x double> poison, double %b, i32 0
767  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
768  %5 = tail call <2 x double> @llvm.x86.avx512.mask.min.sd.round(<2 x double> %2, <2 x double> %4, <2 x double> undef, i8 -1, i32 8)
769  %6 = extractelement <2 x double> %5, i32 1
770  ret double %6
771}
772
773declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
774
; Scalar float compare returning an i8. Constants are inserted into lanes 1-3
; of both vector operands; all of those inserts are expected to be removed so
; that the call takes %a and %b directly, with its other operands unchanged.
775define i8 @test_cmp_ss(<4 x float> %a, <4 x float> %b, i8 %mask) {
776;
777; CHECK-LABEL: @test_cmp_ss(
778; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i32 3, i8 [[MASK:%.*]], i32 4)
779; CHECK-NEXT:    ret i8 [[TMP1]]
780;
781  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
782  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
783  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
784  %4 = insertelement <4 x float> %b, float 4.000000e+00, i32 1
785  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
786  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
787  %7 = tail call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %3, <4 x float> %6, i32 3, i8 %mask, i32 4)
788  ret i8 %7
789}
790
791declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)
792
; Scalar double compare returning an i8. A constant is inserted into lane 1 of
; each vector operand; both inserts are expected to be removed so that the
; call takes %a and %b directly, with its other operands unchanged.
793define i8 @test_cmp_sd(<2 x double> %a, <2 x double> %b, i8 %mask) {
794;
795; CHECK-LABEL: @test_cmp_sd(
796; CHECK-NEXT:    [[TMP1:%.*]] = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 3, i8 [[MASK:%.*]], i32 4)
797; CHECK-NEXT:    ret i8 [[TMP1]]
798;
799  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
800  %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
801  %3 = tail call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %1, <2 x double> %2, i32 3, i8 %mask, i32 4)
802  ret i8 %3
803}
804
; Exercises eight scalar conversion intrinsics (vcvtss2si32/64, cvttss2si/64,
; vcvtsd2si32/64, cvttsd2si/64). Each call is fed a vector whose lane 0 holds
; the scalar argument and whose remaining lanes are set to 0.0. The 0.0
; inserts are expected to be removed, leaving a single insert into poison at
; lane 0 in front of every call. The eight results are added together and
; returned (presumably so that every call has a use); in the expected output
; the operands of the final add appear in swapped order.
805define i64 @test(float %f, double %d) {
806;
807; CHECK-LABEL: @test(
808; CHECK-NEXT:    [[V03:%.*]] = insertelement <4 x float> poison, float [[F:%.*]], i64 0
809; CHECK-NEXT:    [[T0:%.*]] = tail call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[V03]], i32 4)
810; CHECK-NEXT:    [[V13:%.*]] = insertelement <4 x float> poison, float [[F]], i64 0
811; CHECK-NEXT:    [[T1:%.*]] = tail call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> [[V13]], i32 4)
812; CHECK-NEXT:    [[V23:%.*]] = insertelement <4 x float> poison, float [[F]], i64 0
813; CHECK-NEXT:    [[T2:%.*]] = tail call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[V23]], i32 4)
814; CHECK-NEXT:    [[V33:%.*]] = insertelement <4 x float> poison, float [[F]], i64 0
815; CHECK-NEXT:    [[T3:%.*]] = tail call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> [[V33]], i32 4)
816; CHECK-NEXT:    [[V41:%.*]] = insertelement <2 x double> poison, double [[D:%.*]], i64 0
817; CHECK-NEXT:    [[T4:%.*]] = tail call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[V41]], i32 4)
818; CHECK-NEXT:    [[V51:%.*]] = insertelement <2 x double> poison, double [[D]], i64 0
819; CHECK-NEXT:    [[T5:%.*]] = tail call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> [[V51]], i32 4)
820; CHECK-NEXT:    [[V61:%.*]] = insertelement <2 x double> poison, double [[D]], i64 0
821; CHECK-NEXT:    [[T6:%.*]] = tail call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> [[V61]], i32 4)
822; CHECK-NEXT:    [[V71:%.*]] = insertelement <2 x double> poison, double [[D]], i64 0
823; CHECK-NEXT:    [[T7:%.*]] = tail call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> [[V71]], i32 4)
824; CHECK-NEXT:    [[T8:%.*]] = add i32 [[T0]], [[T2]]
825; CHECK-NEXT:    [[T9:%.*]] = add i32 [[T4]], [[T6]]
826; CHECK-NEXT:    [[T10:%.*]] = add i32 [[T8]], [[T9]]
827; CHECK-NEXT:    [[T11:%.*]] = sext i32 [[T10]] to i64
828; CHECK-NEXT:    [[T12:%.*]] = add i64 [[T1]], [[T3]]
829; CHECK-NEXT:    [[T13:%.*]] = add i64 [[T5]], [[T7]]
830; CHECK-NEXT:    [[T14:%.*]] = add i64 [[T12]], [[T13]]
831; CHECK-NEXT:    [[T15:%.*]] = add i64 [[T14]], [[T11]]
832; CHECK-NEXT:    ret i64 [[T15]]
833;
834  %v00 = insertelement <4 x float> poison, float %f, i32 0
835  %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1
836  %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2
837  %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3
838  %t0 = tail call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %v03, i32 4)
839  %v10 = insertelement <4 x float> poison, float %f, i32 0
840  %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1
841  %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2
842  %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3
843  %t1 = tail call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %v13, i32 4)
844  %v20 = insertelement <4 x float> poison, float %f, i32 0
845  %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1
846  %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2
847  %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3
848  %t2 = tail call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %v23, i32 4)
849  %v30 = insertelement <4 x float> poison, float %f, i32 0
850  %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1
851  %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2
852  %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3
853  %t3 = tail call i64 @llvm.x86.avx512.cvttss2si64(<4 x float> %v33, i32 4)
854  %v40 = insertelement <2 x double> poison, double %d, i32 0
855  %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1
856  %t4 = tail call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %v41, i32 4)
857  %v50 = insertelement <2 x double> poison, double %d, i32 0
858  %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1
859  %t5 = tail call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %v51, i32 4)
860  %v60 = insertelement <2 x double> poison, double %d, i32 0
861  %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1
862  %t6 = tail call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %v61, i32 4)
863  %v70 = insertelement <2 x double> poison, double %d, i32 0
864  %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1
865  %t7 = tail call i64 @llvm.x86.avx512.cvttsd2si64(<2 x double> %v71, i32 4)
866  %t8 = add i32 %t0, %t2
867  %t9 = add i32 %t4, %t6
868  %t10 = add i32 %t8, %t9
869  %t11 = sext i32 %t10 to i64
870  %t12 = add i64 %t1, %t3
871  %t13 = add i64 %t5, %t7
872  %t14 = add i64 %t12, %t13
873  %t15 = add i64 %t11, %t14
874  ret i64 %t15
875}
876
877declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32)
878declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32)
879declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32)
880declare i64 @llvm.x86.avx512.cvttss2si64(<4 x float>, i32)
881declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32)
882declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32)
883declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32)
884declare i64 @llvm.x86.avx512.cvttsd2si64(<2 x double>, i32)
885
; Exercises the eight "usi"-named scalar conversion intrinsics
; (vcvtss2usi32/64, cvttss2usi/64, vcvtsd2usi32/64, cvttsd2usi/64). Each call
; is fed a vector whose lane 0 holds the scalar argument and whose remaining
; lanes are set to 0.0. The 0.0 inserts are expected to be removed, leaving a
; single insert into poison at lane 0 in front of every call. The eight
; results are added together and returned (presumably so that every call has
; a use); in the expected output the operands of the final add appear in
; swapped order.
886define i64 @test2(float %f, double %d) {
887;
888; CHECK-LABEL: @test2(
889; CHECK-NEXT:    [[V03:%.*]] = insertelement <4 x float> poison, float [[F:%.*]], i64 0
890; CHECK-NEXT:    [[T0:%.*]] = tail call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[V03]], i32 4)
891; CHECK-NEXT:    [[V13:%.*]] = insertelement <4 x float> poison, float [[F]], i64 0
892; CHECK-NEXT:    [[T1:%.*]] = tail call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> [[V13]], i32 4)
893; CHECK-NEXT:    [[V23:%.*]] = insertelement <4 x float> poison, float [[F]], i64 0
894; CHECK-NEXT:    [[T2:%.*]] = tail call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> [[V23]], i32 4)
895; CHECK-NEXT:    [[V33:%.*]] = insertelement <4 x float> poison, float [[F]], i64 0
896; CHECK-NEXT:    [[T3:%.*]] = tail call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> [[V33]], i32 4)
897; CHECK-NEXT:    [[V41:%.*]] = insertelement <2 x double> poison, double [[D:%.*]], i64 0
898; CHECK-NEXT:    [[T4:%.*]] = tail call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[V41]], i32 4)
899; CHECK-NEXT:    [[V51:%.*]] = insertelement <2 x double> poison, double [[D]], i64 0
900; CHECK-NEXT:    [[T5:%.*]] = tail call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> [[V51]], i32 4)
901; CHECK-NEXT:    [[V61:%.*]] = insertelement <2 x double> poison, double [[D]], i64 0
902; CHECK-NEXT:    [[T6:%.*]] = tail call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> [[V61]], i32 4)
903; CHECK-NEXT:    [[V71:%.*]] = insertelement <2 x double> poison, double [[D]], i64 0
904; CHECK-NEXT:    [[T7:%.*]] = tail call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> [[V71]], i32 4)
905; CHECK-NEXT:    [[T8:%.*]] = add i32 [[T0]], [[T2]]
906; CHECK-NEXT:    [[T9:%.*]] = add i32 [[T4]], [[T6]]
907; CHECK-NEXT:    [[T10:%.*]] = add i32 [[T8]], [[T9]]
908; CHECK-NEXT:    [[T11:%.*]] = sext i32 [[T10]] to i64
909; CHECK-NEXT:    [[T12:%.*]] = add i64 [[T1]], [[T3]]
910; CHECK-NEXT:    [[T13:%.*]] = add i64 [[T5]], [[T7]]
911; CHECK-NEXT:    [[T14:%.*]] = add i64 [[T12]], [[T13]]
912; CHECK-NEXT:    [[T15:%.*]] = add i64 [[T14]], [[T11]]
913; CHECK-NEXT:    ret i64 [[T15]]
914;
915  %v00 = insertelement <4 x float> poison, float %f, i32 0
916  %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1
917  %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2
918  %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3
919  %t0 = tail call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %v03, i32 4)
920  %v10 = insertelement <4 x float> poison, float %f, i32 0
921  %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1
922  %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2
923  %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3
924  %t1 = tail call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %v13, i32 4)
925  %v20 = insertelement <4 x float> poison, float %f, i32 0
926  %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1
927  %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2
928  %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3
929  %t2 = tail call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %v23, i32 4)
930  %v30 = insertelement <4 x float> poison, float %f, i32 0
931  %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1
932  %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2
933  %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3
934  %t3 = tail call i64 @llvm.x86.avx512.cvttss2usi64(<4 x float> %v33, i32 4)
935  %v40 = insertelement <2 x double> poison, double %d, i32 0
936  %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1
937  %t4 = tail call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %v41, i32 4)
938  %v50 = insertelement <2 x double> poison, double %d, i32 0
939  %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1
940  %t5 = tail call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %v51, i32 4)
941  %v60 = insertelement <2 x double> poison, double %d, i32 0
942  %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1
943  %t6 = tail call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %v61, i32 4)
944  %v70 = insertelement <2 x double> poison, double %d, i32 0
945  %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1
946  %t7 = tail call i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double> %v71, i32 4)
947  %t8 = add i32 %t0, %t2
948  %t9 = add i32 %t4, %t6
949  %t10 = add i32 %t8, %t9
950  %t11 = sext i32 %t10 to i64
951  %t12 = add i64 %t1, %t3
952  %t13 = add i64 %t5, %t7
953  %t14 = add i64 %t12, %t13
954  %t15 = add i64 %t11, %t14
955  ret i64 %t15
956}
957
958declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32)
959declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32)
960declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32)
961declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32)
962declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32)
963declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32)
964declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32)
965declare i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double>, i32)
966
967declare float @llvm.fma.f32(float, float, float) #1
968
; Scalar float FMA written as generic IR: lane-0 extracts feed llvm.fma.f32,
; bit 0 of the mask selects between the FMA result and lane 0 of %a, and the
; chosen value is inserted into lane 0 of %a. The insert chains into lanes
; 1-3 of %b and %c are expected to be removed, and the bitcast-to-<8 x i1>
; plus extractelement of the mask is expected to become a trunc to i1.
969define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
970;
971; CHECK-LABEL: @test_mask_vfmadd_ss(
972; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
973; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
974; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
975; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]])
976; CHECK-NEXT:    [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1
977; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float [[TMP1]]
978; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0
979; CHECK-NEXT:    ret <4 x float> [[TMP7]]
980;
981  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
982  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
983  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
984  %4 = insertelement <4 x float> %c, float 4.000000e+00, i32 1
985  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
986  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
987  %7 = extractelement <4 x float> %a, i64 0
988  %8 = extractelement <4 x float> %3, i64 0
989  %9 = extractelement <4 x float> %6, i64 0
990  %10 = call float @llvm.fma.f32(float %7, float %8, float %9)
991  %11 = bitcast i8 %mask to <8 x i1>
992  %12 = extractelement <8 x i1> %11, i64 0
993  %13 = select i1 %12, float %10, float %7
994  %14 = insertelement <4 x float> %a, float %13, i64 0
995  ret <4 x float> %14
996}
997
; Masked scalar float FMA in generic IR where only lane 0 of the final vector
; is returned. The inserts into lanes 1-3 of %a and the final insert/extract
; pair are expected to be removed, so the function returns the scalar select
; between the FMA result and lane 0 of %a directly.
998define float @test_mask_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
999;
1000; CHECK-LABEL: @test_mask_vfmadd_ss_0(
1001; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
1002; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
1003; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
1004; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]])
1005; CHECK-NEXT:    [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1
1006; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float [[TMP1]]
1007; CHECK-NEXT:    ret float [[TMP6]]
1008;
1009  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
1010  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
1011  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
1012  %4 = extractelement <4 x float> %3, i64 0
1013  %5 = extractelement <4 x float> %b, i64 0
1014  %6 = extractelement <4 x float> %c, i64 0
1015  %7 = call float @llvm.fma.f32(float %4, float %5, float %6)
1016  %8 = bitcast i8 %mask to <8 x i1>
1017  %9 = extractelement <8 x i1> %8, i64 0
1018  %10 = select i1 %9, float %7, float %4
1019  %11 = insertelement <4 x float> %3, float %10, i64 0
1020  %12 = extractelement <4 x float> %11, i32 0
1021  ret float %12
1022}
1023
; Masked scalar float FMA in generic IR where only lane 1 of the final vector
; is returned. Lane 1 is never written by the FMA path, so the whole body is
; expected to fold to the constant 1.0 that was inserted into lane 1 of %a.
1024define float @test_mask_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1025;
1026; CHECK-LABEL: @test_mask_vfmadd_ss_1(
1027; CHECK-NEXT:    ret float 1.000000e+00
1028;
1029  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
1030  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
1031  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
1032  %4 = extractelement <4 x float> %3, i64 0
1033  %5 = extractelement <4 x float> %b, i64 0
1034  %6 = extractelement <4 x float> %c, i64 0
1035  %7 = call float @llvm.fma.f32(float %4, float %5, float %6)
1036  %8 = bitcast i8 %mask to <8 x i1>
1037  %9 = extractelement <8 x i1> %8, i64 0
1038  %10 = select i1 %9, float %7, float %4
1039  %11 = insertelement <4 x float> %3, float %10, i64 0
1040  %12 = extractelement <4 x float> %11, i32 1
1041  ret float %12
1042}
1043
1044declare double @llvm.fma.f64(double, double, double) #1
1045
; Scalar double FMA written as generic IR: lane-0 extracts feed llvm.fma.f64,
; bit 0 of the mask selects between the FMA result and lane 0 of %a, and the
; chosen value is inserted into lane 0 of %a. The inserts into lane 1 of %b
; and %c are expected to be removed, and the bitcast-to-<8 x i1> plus
; extractelement of the mask is expected to become a trunc to i1.
1046define <2 x double> @test_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1047;
1048; CHECK-LABEL: @test_mask_vfmadd_sd(
1049; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
1050; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
1051; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
1052; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]])
1053; CHECK-NEXT:    [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1
1054; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double [[TMP1]]
1055; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0
1056; CHECK-NEXT:    ret <2 x double> [[TMP7]]
1057;
1058  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
1059  %2 = insertelement <2 x double> %c, double 2.000000e+00, i32 1
1060  %3 = extractelement <2 x double> %a, i64 0
1061  %4 = extractelement <2 x double> %1, i64 0
1062  %5 = extractelement <2 x double> %2, i64 0
1063  %6 = call double @llvm.fma.f64(double %3, double %4, double %5)
1064  %7 = bitcast i8 %mask to <8 x i1>
1065  %8 = extractelement <8 x i1> %7, i64 0
1066  %9 = select i1 %8, double %6, double %3
1067  %10 = insertelement <2 x double> %a, double %9, i64 0
1068  ret <2 x double> %10
1069}
1070
; Masked scalar double FMA in generic IR where only lane 0 of the final vector
; is returned. The insert into lane 1 of %a and the final insert/extract pair
; are expected to be removed, so the function returns the scalar select
; between the FMA result and lane 0 of %a directly.
1071define double @test_mask_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1072;
1073; CHECK-LABEL: @test_mask_vfmadd_sd_0(
1074; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
1075; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
1076; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
1077; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]])
1078; CHECK-NEXT:    [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1
1079; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double [[TMP1]]
1080; CHECK-NEXT:    ret double [[TMP6]]
1081;
1082  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
1083  %2 = extractelement <2 x double> %1, i64 0
1084  %3 = extractelement <2 x double> %b, i64 0
1085  %4 = extractelement <2 x double> %c, i64 0
1086  %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
1087  %6 = bitcast i8 %mask to <8 x i1>
1088  %7 = extractelement <8 x i1> %6, i64 0
1089  %8 = select i1 %7, double %5, double %2
1090  %9 = insertelement <2 x double> %1, double %8, i64 0
1091  %10 = extractelement <2 x double> %9, i32 0
1092  ret double %10
1093}
1094
; Masked scalar double FMA in generic IR where only lane 1 of the final vector
; is returned. Lane 1 is never written by the FMA path, so the whole body is
; expected to fold to the constant 1.0 that was inserted into lane 1 of %a.
1095define double @test_mask_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1096;
1097; CHECK-LABEL: @test_mask_vfmadd_sd_1(
1098; CHECK-NEXT:    ret double 1.000000e+00
1099;
1100  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
1101  %2 = extractelement <2 x double> %1, i64 0
1102  %3 = extractelement <2 x double> %b, i64 0
1103  %4 = extractelement <2 x double> %c, i64 0
1104  %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
1105  %6 = bitcast i8 %mask to <8 x i1>
1106  %7 = extractelement <8 x i1> %6, i64 0
1107  %8 = select i1 %7, double %5, double %2
1108  %9 = insertelement <2 x double> %1, double %8, i64 0
1109  %10 = extractelement <2 x double> %9, i32 1
1110  ret double %10
1111}
1112
; Scalar float FMA in generic IR where bit 0 of the mask selects between the
; FMA result and the constant 0.0, and the chosen value is inserted into lane
; 0 of %a. The insert chains into lanes 1-3 of %b and %c are expected to be
; removed, and the bitcast-to-<8 x i1> plus extractelement of the mask is
; expected to become a trunc to i1.
1113define <4 x float> @test_maskz_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1114;
1115; CHECK-LABEL: @test_maskz_vfmadd_ss(
1116; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
1117; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
1118; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
1119; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]])
1120; CHECK-NEXT:    [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1
1121; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float 0.000000e+00
1122; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[A]], float [[TMP6]], i64 0
1123; CHECK-NEXT:    ret <4 x float> [[TMP7]]
1124;
1125  %1 = insertelement <4 x float> %b, float 1.000000e+00, i32 1
1126  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
1127  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
1128  %4 = insertelement <4 x float> %c, float 4.000000e+00, i32 1
1129  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
1130  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
1131  %7 = extractelement <4 x float> %a, i64 0
1132  %8 = extractelement <4 x float> %3, i64 0
1133  %9 = extractelement <4 x float> %6, i64 0
1134  %10 = call float @llvm.fma.f32(float %7, float %8, float %9)
1135  %11 = bitcast i8 %mask to <8 x i1>
1136  %12 = extractelement <8 x i1> %11, i64 0
1137  %13 = select i1 %12, float %10, float 0.000000e+00
1138  %14 = insertelement <4 x float> %a, float %13, i64 0
1139  ret <4 x float> %14
1140}
1141
; Zero-selecting scalar float FMA in generic IR where only lane 0 of the final
; vector is returned. The inserts into lanes 1-3 of %a and the final
; insert/extract pair are expected to be removed, so the function returns the
; scalar select between the FMA result and 0.0 directly.
1142define float @test_maskz_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1143;
1144; CHECK-LABEL: @test_maskz_vfmadd_ss_0(
1145; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
1146; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
1147; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
1148; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]])
1149; CHECK-NEXT:    [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1
1150; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float 0.000000e+00
1151; CHECK-NEXT:    ret float [[TMP6]]
1152;
1153  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
1154  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
1155  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
1156  %4 = extractelement <4 x float> %3, i64 0
1157  %5 = extractelement <4 x float> %b, i64 0
1158  %6 = extractelement <4 x float> %c, i64 0
1159  %7 = call float @llvm.fma.f32(float %4, float %5, float %6)
1160  %8 = bitcast i8 %mask to <8 x i1>
1161  %9 = extractelement <8 x i1> %8, i64 0
1162  %10 = select i1 %9, float %7, float 0.000000e+00
1163  %11 = insertelement <4 x float> %3, float %10, i64 0
1164  %12 = extractelement <4 x float> %11, i32 0
1165  ret float %12
1166}
1167
; Zero-selecting scalar float FMA in generic IR where only lane 1 of the final
; vector is returned. Lane 1 is never written by the FMA path, so the whole
; body is expected to fold to the constant 1.0 that was inserted into lane 1
; of %a.
1168define float @test_maskz_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1169;
1170; CHECK-LABEL: @test_maskz_vfmadd_ss_1(
1171; CHECK-NEXT:    ret float 1.000000e+00
1172;
1173  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
1174  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
1175  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
1176  %4 = extractelement <4 x float> %3, i64 0
1177  %5 = extractelement <4 x float> %b, i64 0
1178  %6 = extractelement <4 x float> %c, i64 0
1179  %7 = call float @llvm.fma.f32(float %4, float %5, float %6)
1180  %8 = bitcast i8 %mask to <8 x i1>
1181  %9 = extractelement <8 x i1> %8, i64 0
1182  %10 = select i1 %9, float %7, float 0.000000e+00
1183  %11 = insertelement <4 x float> %3, float %10, i64 0
1184  %12 = extractelement <4 x float> %11, i32 1
1185  ret float %12
1186}
1187
; Zero-masked scalar double FMA returning the full <2 x double> vector. Lane 1
; of %b and %c is overwritten with constants, but only lane 0 of each is read,
; so the CHECK lines expect those insertelements to be removed and the extracts
; to read %b and %c directly. The selected scalar (fma result or 0.0) is
; inserted into lane 0 of %a.
1188define <2 x double> @test_maskz_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1189;
1190; CHECK-LABEL: @test_maskz_vfmadd_sd(
1191; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
1192; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
1193; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
1194; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]])
1195; CHECK-NEXT:    [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1
1196; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double 0.000000e+00
1197; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[A]], double [[TMP6]], i64 0
1198; CHECK-NEXT:    ret <2 x double> [[TMP7]]
1199;
1200  %1 = insertelement <2 x double> %b, double 1.000000e+00, i32 1
1201  %2 = insertelement <2 x double> %c, double 2.000000e+00, i32 1
1202  %3 = extractelement <2 x double> %a, i64 0
1203  %4 = extractelement <2 x double> %1, i64 0
1204  %5 = extractelement <2 x double> %2, i64 0
1205  %6 = call double @llvm.fma.f64(double %3, double %4, double %5)
1206  %7 = bitcast i8 %mask to <8 x i1>
1207  %8 = extractelement <8 x i1> %7, i64 0
1208  %9 = select i1 %8, double %6, double 0.000000e+00
1209  %10 = insertelement <2 x double> %a, double %9, i64 0
1210  ret <2 x double> %10
1211}
1212
; Zero-masked scalar double FMA where only lane 0 of the result is returned.
; Lane 1 of %a is overwritten with a constant that lane 0 never observes, so
; the CHECK lines expect the insertelement to vanish and the scalar select
; (fma result or 0.0, keyed on bit 0 of %mask) to be returned directly.
1213define double @test_maskz_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1214;
1215; CHECK-LABEL: @test_maskz_vfmadd_sd_0(
1216; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
1217; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
1218; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
1219; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]])
1220; CHECK-NEXT:    [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1
1221; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double 0.000000e+00
1222; CHECK-NEXT:    ret double [[TMP6]]
1223;
1224  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
1225  %2 = extractelement <2 x double> %1, i64 0
1226  %3 = extractelement <2 x double> %b, i64 0
1227  %4 = extractelement <2 x double> %c, i64 0
1228  %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
1229  %6 = bitcast i8 %mask to <8 x i1>
1230  %7 = extractelement <8 x i1> %6, i64 0
1231  %8 = select i1 %7, double %5, double 0.000000e+00
1232  %9 = insertelement <2 x double> %1, double %8, i64 0
1233  %10 = extractelement <2 x double> %9, i32 0
1234  ret double %10
1235}
1236
; Returns lane 1 of the result vector, which is the constant 1.0 inserted by
; %1 and is untouched by the lane-0 fma/select. The CHECK lines expect the
; whole body to fold to `ret double 1.000000e+00`.
1237define double @test_maskz_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1238;
1239; CHECK-LABEL: @test_maskz_vfmadd_sd_1(
1240; CHECK-NEXT:    ret double 1.000000e+00
1241;
1242  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
1243  %2 = extractelement <2 x double> %1, i64 0
1244  %3 = extractelement <2 x double> %b, i64 0
1245  %4 = extractelement <2 x double> %c, i64 0
1246  %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
1247  %6 = bitcast i8 %mask to <8 x i1>
1248  %7 = extractelement <8 x i1> %6, i64 0
1249  %8 = select i1 %7, double %5, double 0.000000e+00
1250  %9 = insertelement <2 x double> %1, double %8, i64 0
1251  %10 = extractelement <2 x double> %9, i32 1
1252  ret double %10
1253}
1254
; "mask3" form of the scalar FMA: when bit 0 of %mask is clear the result falls
; back to lane 0 of %c (the addend), and the selected scalar is inserted into
; lane 0 of %c. Lanes 1-3 of %a and %b are overwritten with constants that are
; never read, so the CHECK lines expect both insertelement chains to be removed
; and the select to reuse the single extract of %c as its false operand.
1255define <4 x float> @test_mask3_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1256;
1257; CHECK-LABEL: @test_mask3_vfmadd_ss(
1258; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
1259; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
1260; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
1261; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]])
1262; CHECK-NEXT:    [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1
1263; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float [[TMP3]]
1264; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[C]], float [[TMP6]], i64 0
1265; CHECK-NEXT:    ret <4 x float> [[TMP7]]
1266;
1267  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
1268  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
1269  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
1270  %4 = insertelement <4 x float> %b, float 4.000000e+00, i32 1
1271  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
1272  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
1273  %7 = extractelement <4 x float> %3, i64 0
1274  %8 = extractelement <4 x float> %6, i64 0
1275  %9 = extractelement <4 x float> %c, i64 0
1276  %10 = call float @llvm.fma.f32(float %7, float %8, float %9)
1277  %11 = bitcast i8 %mask to <8 x i1>
1278  %12 = extractelement <8 x i1> %11, i64 0
1279  %13 = select i1 %12, float %10, float %9
1280  %14 = insertelement <4 x float> %c, float %13, i64 0
1281  ret <4 x float> %14
1282}
1283
; mask3 scalar FMA returning only lane 0. Here the constants are inserted into
; lanes 1-3 of %c (the addend / pass-through operand); lane 0 is unaffected, so
; the CHECK lines expect the extract to read %c directly and the scalar select
; (fma result or lane 0 of %c) to be returned without any vector rebuild.
1284define float @test_mask3_vfmadd_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1285;
1286; CHECK-LABEL: @test_mask3_vfmadd_ss_0(
1287; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
1288; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
1289; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
1290; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]])
1291; CHECK-NEXT:    [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1
1292; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float [[TMP3]]
1293; CHECK-NEXT:    ret float [[TMP6]]
1294;
1295  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
1296  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
1297  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
1298  %4 = extractelement <4 x float> %a, i64 0
1299  %5 = extractelement <4 x float> %b, i64 0
1300  %6 = extractelement <4 x float> %3, i64 0
1301  %7 = call float @llvm.fma.f32(float %4, float %5, float %6)
1302  %8 = bitcast i8 %mask to <8 x i1>
1303  %9 = extractelement <8 x i1> %8, i64 0
1304  %10 = select i1 %9, float %7, float %6
1305  %11 = insertelement <4 x float> %3, float %10, i64 0
1306  %12 = extractelement <4 x float> %11, i32 0
1307  ret float %12
1308}
1309
; mask3 scalar FMA returning lane 1 of the result vector. Lane 1 holds the
; constant 1.0 inserted into %c by %1 and is not modified by the lane-0
; fma/select, so the CHECK lines expect `ret float 1.000000e+00`.
1310define float @test_mask3_vfmadd_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1311;
1312; CHECK-LABEL: @test_mask3_vfmadd_ss_1(
1313; CHECK-NEXT:    ret float 1.000000e+00
1314;
1315  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
1316  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
1317  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
1318  %4 = extractelement <4 x float> %a, i64 0
1319  %5 = extractelement <4 x float> %b, i64 0
1320  %6 = extractelement <4 x float> %3, i64 0
1321  %7 = call float @llvm.fma.f32(float %4, float %5, float %6)
1322  %8 = bitcast i8 %mask to <8 x i1>
1323  %9 = extractelement <8 x i1> %8, i64 0
1324  %10 = select i1 %9, float %7, float %6
1325  %11 = insertelement <4 x float> %3, float %10, i64 0
1326  %12 = extractelement <4 x float> %11, i32 1
1327  ret float %12
1328}
1329
; Double-precision mask3 scalar FMA returning the full vector. Lane 1 of %a and
; %b is overwritten with constants that are never read; the CHECK lines expect
; those insertelements to be removed, the pass-through operand of the select to
; be the same lane-0 extract of %c used as the addend, and the selected scalar
; to be inserted into lane 0 of %c.
1330define <2 x double> @test_mask3_vfmadd_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1331;
1332; CHECK-LABEL: @test_mask3_vfmadd_sd(
1333; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
1334; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
1335; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
1336; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]])
1337; CHECK-NEXT:    [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1
1338; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double [[TMP3]]
1339; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[C]], double [[TMP6]], i64 0
1340; CHECK-NEXT:    ret <2 x double> [[TMP7]]
1341;
1342  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
1343  %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
1344  %3 = extractelement <2 x double> %1, i64 0
1345  %4 = extractelement <2 x double> %2, i64 0
1346  %5 = extractelement <2 x double> %c, i64 0
1347  %6 = call double @llvm.fma.f64(double %3, double %4, double %5)
1348  %7 = bitcast i8 %mask to <8 x i1>
1349  %8 = extractelement <8 x i1> %7, i64 0
1350  %9 = select i1 %8, double %6, double %5
1351  %10 = insertelement <2 x double> %c, double %9, i64 0
1352  ret <2 x double> %10
1353}
1354
; Double-precision mask3 scalar FMA returning only lane 0. The constant written
; to lane 1 of %c does not affect lane 0, so the CHECK lines expect the extract
; to read %c directly and the scalar select to be returned as-is.
1355define double @test_mask3_vfmadd_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1356;
1357; CHECK-LABEL: @test_mask3_vfmadd_sd_0(
1358; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
1359; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
1360; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
1361; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]])
1362; CHECK-NEXT:    [[TMP5:%.*]] = trunc i8 [[MASK:%.*]] to i1
1363; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], double [[TMP4]], double [[TMP3]]
1364; CHECK-NEXT:    ret double [[TMP6]]
1365;
1366  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
1367  %2 = extractelement <2 x double> %a, i64 0
1368  %3 = extractelement <2 x double> %b, i64 0
1369  %4 = extractelement <2 x double> %1, i64 0
1370  %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
1371  %6 = bitcast i8 %mask to <8 x i1>
1372  %7 = extractelement <8 x i1> %6, i64 0
1373  %8 = select i1 %7, double %5, double %4
1374  %9 = insertelement <2 x double> %1, double %8, i64 0
1375  %10 = extractelement <2 x double> %9, i32 0
1376  ret double %10
1377}
1378
; Double-precision mask3 scalar FMA returning lane 1, which is the constant 1.0
; inserted into %c by %1. The lane-0 computation is unused, so the CHECK lines
; expect `ret double 1.000000e+00`.
1379define double @test_mask3_vfmadd_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1380;
1381; CHECK-LABEL: @test_mask3_vfmadd_sd_1(
1382; CHECK-NEXT:    ret double 1.000000e+00
1383;
1384  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
1385  %2 = extractelement <2 x double> %a, i64 0
1386  %3 = extractelement <2 x double> %b, i64 0
1387  %4 = extractelement <2 x double> %1, i64 0
1388  %5 = call double @llvm.fma.f64(double %2, double %3, double %4)
1389  %6 = bitcast i8 %mask to <8 x i1>
1390  %7 = extractelement <8 x i1> %6, i64 0
1391  %8 = select i1 %7, double %5, double %4
1392  %9 = insertelement <2 x double> %1, double %8, i64 0
1393  %10 = extractelement <2 x double> %9, i32 1
1394  ret double %10
1395}
1396
; mask3 scalar fused multiply-subtract: the addend is the negation of %c,
; written here as a vector `fsub -0.0, %c`, while the pass-through value is the
; un-negated lane 0 of %c. The CHECK lines expect the vector negation to be
; narrowed to a scalar `fneg` of the extracted lane, the dead insertelement
; chains on %a and %b to be removed, and a separate extract of %c to remain as
; the select's false operand.
1397define <4 x float> @test_mask3_vfmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1398;
1399; CHECK-LABEL: @test_mask3_vfmsub_ss(
1400; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
1401; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
1402; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
1403; CHECK-NEXT:    [[TMP4:%.*]] = fneg float [[TMP3]]
1404; CHECK-NEXT:    [[TMP5:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP4]])
1405; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[C]], i64 0
1406; CHECK-NEXT:    [[TMP7:%.*]] = trunc i8 [[MASK:%.*]] to i1
1407; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
1408; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[C]], float [[TMP8]], i64 0
1409; CHECK-NEXT:    ret <4 x float> [[TMP9]]
1410;
1411  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
1412  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
1413  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
1414  %4 = insertelement <4 x float> %b, float 4.000000e+00, i32 1
1415  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
1416  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
1417  %7 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
1418  %8 = extractelement <4 x float> %3, i64 0
1419  %9 = extractelement <4 x float> %6, i64 0
1420  %10 = extractelement <4 x float> %7, i64 0
1421  %11 = call float @llvm.fma.f32(float %8, float %9, float %10)
1422  %12 = extractelement <4 x float> %c, i64 0
1423  %13 = bitcast i8 %mask to <8 x i1>
1424  %14 = extractelement <8 x i1> %13, i64 0
1425  %15 = select i1 %14, float %11, float %12
1426  %16 = insertelement <4 x float> %c, float %15, i64 0
1427  ret <4 x float> %16
1428}
1429
; mask3 scalar fused multiply-subtract returning only lane 0. Constants are
; inserted into lanes 1-3 of %c before it is negated (`fsub -0.0, %3`) and
; before the pass-through lane is extracted; neither use reads those lanes, so
; the CHECK lines expect both extracts to read %c directly, a scalar `fneg`
; for the addend, and the scalar select returned without a vector rebuild.
1430define float @test_mask3_vfmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1431;
1432; CHECK-LABEL: @test_mask3_vfmsub_ss_0(
1433; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
1434; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
1435; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
1436; CHECK-NEXT:    [[TMP4:%.*]] = fneg float [[TMP3]]
1437; CHECK-NEXT:    [[TMP5:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP4]])
1438; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[C]], i64 0
1439; CHECK-NEXT:    [[TMP7:%.*]] = trunc i8 [[MASK:%.*]] to i1
1440; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP5]], float [[TMP6]]
1441; CHECK-NEXT:    ret float [[TMP8]]
1442;
1443  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
1444  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
1445  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
1446  %4 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %3
1447  %5 = extractelement <4 x float> %a, i64 0
1448  %6 = extractelement <4 x float> %b, i64 0
1449  %7 = extractelement <4 x float> %4, i64 0
1450  %8 = call float @llvm.fma.f32(float %5, float %6, float %7)
1451  %9 = extractelement <4 x float> %3, i64 0
1452  %10 = bitcast i8 %mask to <8 x i1>
1453  %11 = extractelement <8 x i1> %10, i64 0
1454  %12 = select i1 %11, float %8, float %9
1455  %13 = insertelement <4 x float> %3, float %12, i64 0
1456  %14 = extractelement <4 x float> %13, i32 0
1457  ret float %14
1458}
1459
; mask3 scalar fused multiply-subtract (negation spelled as `fsub -0.0, x`)
; returning lane 1 of the result vector. Lane 1 is the constant 1.0 inserted
; into %c by %1, so the CHECK lines expect `ret float 1.000000e+00`.
1460define float @test_mask3_vfmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1461;
1462; CHECK-LABEL: @test_mask3_vfmsub_ss_1(
1463; CHECK-NEXT:    ret float 1.000000e+00
1464;
1465  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
1466  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
1467  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
1468  %4 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %3
1469  %5 = extractelement <4 x float> %a, i64 0
1470  %6 = extractelement <4 x float> %b, i64 0
1471  %7 = extractelement <4 x float> %4, i64 0
1472  %8 = call float @llvm.fma.f32(float %5, float %6, float %7)
1473  %9 = extractelement <4 x float> %3, i64 0
1474  %10 = bitcast i8 %mask to <8 x i1>
1475  %11 = extractelement <8 x i1> %10, i64 0
1476  %12 = select i1 %11, float %8, float %9
1477  %13 = insertelement <4 x float> %3, float %12, i64 0
1478  %14 = extractelement <4 x float> %13, i32 1
1479  ret float %14
1480}
1481
; Variant of the lane-1 vfmsub.ss test in which the negation of the addend is
; written with the unary `fneg` instruction instead of `fsub -0.0, x`. The
; returned lane 1 is still the constant 1.0 inserted by %1, so the CHECK lines
; expect `ret float 1.000000e+00`.
1482define float @test_mask3_vfmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1483;
1484; CHECK-LABEL: @test_mask3_vfmsub_ss_1_unary_fneg(
1485; CHECK-NEXT:    ret float 1.000000e+00
1486;
1487  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
1488  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
1489  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
1490  %4 = fneg <4 x float> %3
1491  %5 = extractelement <4 x float> %a, i64 0
1492  %6 = extractelement <4 x float> %b, i64 0
1493  %7 = extractelement <4 x float> %4, i64 0
1494  %8 = call float @llvm.fma.f32(float %5, float %6, float %7)
1495  %9 = extractelement <4 x float> %3, i64 0
1496  %10 = bitcast i8 %mask to <8 x i1>
1497  %11 = extractelement <8 x i1> %10, i64 0
1498  %12 = select i1 %11, float %8, float %9
1499  %13 = insertelement <4 x float> %3, float %12, i64 0
1500  %14 = extractelement <4 x float> %13, i32 1
1501  ret float %14
1502}
1503
; Double-precision mask3 scalar fused multiply-subtract returning the full
; vector. The addend is `fsub -0.0, %c`; the pass-through is the un-negated
; lane 0 of %c. The CHECK lines expect the unused lane-1 inserts on %a and %b
; to be removed, the vector negation to become a scalar `fneg`, and the
; selected scalar to be inserted into lane 0 of %c.
1504define <2 x double> @test_mask3_vfmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1505;
1506; CHECK-LABEL: @test_mask3_vfmsub_sd(
1507; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
1508; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
1509; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
1510; CHECK-NEXT:    [[TMP4:%.*]] = fneg double [[TMP3]]
1511; CHECK-NEXT:    [[TMP5:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP4]])
1512; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[C]], i64 0
1513; CHECK-NEXT:    [[TMP7:%.*]] = trunc i8 [[MASK:%.*]] to i1
1514; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP5]], double [[TMP6]]
1515; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> [[C]], double [[TMP8]], i64 0
1516; CHECK-NEXT:    ret <2 x double> [[TMP9]]
1517;
1518  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
1519  %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
1520  %3 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
1521  %4 = extractelement <2 x double> %1, i64 0
1522  %5 = extractelement <2 x double> %2, i64 0
1523  %6 = extractelement <2 x double> %3, i64 0
1524  %7 = call double @llvm.fma.f64(double %4, double %5, double %6)
1525  %8 = extractelement <2 x double> %c, i64 0
1526  %9 = bitcast i8 %mask to <8 x i1>
1527  %10 = extractelement <8 x i1> %9, i64 0
1528  %11 = select i1 %10, double %7, double %8
1529  %12 = insertelement <2 x double> %c, double %11, i64 0
1530  ret <2 x double> %12
1531}
1532
; Double-precision mask3 scalar fused multiply-subtract returning only lane 0.
; The constant placed in lane 1 of %c is irrelevant to lane 0, so the CHECK
; lines expect extracts straight from %c, a scalar `fneg` for the addend, and
; the scalar select returned directly.
1533define double @test_mask3_vfmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1534;
1535; CHECK-LABEL: @test_mask3_vfmsub_sd_0(
1536; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
1537; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
1538; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
1539; CHECK-NEXT:    [[TMP4:%.*]] = fneg double [[TMP3]]
1540; CHECK-NEXT:    [[TMP5:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP4]])
1541; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[C]], i64 0
1542; CHECK-NEXT:    [[TMP7:%.*]] = trunc i8 [[MASK:%.*]] to i1
1543; CHECK-NEXT:    [[TMP8:%.*]] = select i1 [[TMP7]], double [[TMP5]], double [[TMP6]]
1544; CHECK-NEXT:    ret double [[TMP8]]
1545;
1546  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
1547  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %1
1548  %3 = extractelement <2 x double> %a, i64 0
1549  %4 = extractelement <2 x double> %b, i64 0
1550  %5 = extractelement <2 x double> %2, i64 0
1551  %6 = call double @llvm.fma.f64(double %3, double %4, double %5)
1552  %7 = extractelement <2 x double> %1, i64 0
1553  %8 = bitcast i8 %mask to <8 x i1>
1554  %9 = extractelement <8 x i1> %8, i64 0
1555  %10 = select i1 %9, double %6, double %7
1556  %11 = insertelement <2 x double> %1, double %10, i64 0
1557  %12 = extractelement <2 x double> %11, i32 0
1558  ret double %12
1559}
1560
; Double-precision mask3 scalar fused multiply-subtract (negation spelled as
; `fsub -0.0, x`) returning lane 1, which is the constant 1.0 inserted into %c
; by %1. The CHECK lines expect `ret double 1.000000e+00`.
1561define double @test_mask3_vfmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1562;
1563; CHECK-LABEL: @test_mask3_vfmsub_sd_1(
1564; CHECK-NEXT:    ret double 1.000000e+00
1565;
1566  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
1567  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %1
1568  %3 = extractelement <2 x double> %a, i64 0
1569  %4 = extractelement <2 x double> %b, i64 0
1570  %5 = extractelement <2 x double> %2, i64 0
1571  %6 = call double @llvm.fma.f64(double %3, double %4, double %5)
1572  %7 = extractelement <2 x double> %1, i64 0
1573  %8 = bitcast i8 %mask to <8 x i1>
1574  %9 = extractelement <8 x i1> %8, i64 0
1575  %10 = select i1 %9, double %6, double %7
1576  %11 = insertelement <2 x double> %1, double %10, i64 0
1577  %12 = extractelement <2 x double> %11, i32 1
1578  ret double %12
1579}
1580
; Variant of the lane-1 vfmsub.sd test in which the addend is negated with the
; unary `fneg` instruction instead of `fsub -0.0, x`. The returned lane 1 is
; still the constant 1.0 inserted by %1, so the CHECK lines expect
; `ret double 1.000000e+00`.
1581define double @test_mask3_vfmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1582;
1583; CHECK-LABEL: @test_mask3_vfmsub_sd_1_unary_fneg(
1584; CHECK-NEXT:    ret double 1.000000e+00
1585;
1586  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
1587  %2 = fneg <2 x double> %1
1588  %3 = extractelement <2 x double> %a, i64 0
1589  %4 = extractelement <2 x double> %b, i64 0
1590  %5 = extractelement <2 x double> %2, i64 0
1591  %6 = call double @llvm.fma.f64(double %3, double %4, double %5)
1592  %7 = extractelement <2 x double> %1, i64 0
1593  %8 = bitcast i8 %mask to <8 x i1>
1594  %9 = extractelement <8 x i1> %8, i64 0
1595  %10 = select i1 %9, double %6, double %7
1596  %11 = insertelement <2 x double> %1, double %10, i64 0
1597  %12 = extractelement <2 x double> %11, i32 1
1598  ret double %12
1599}
1600
; mask3 scalar negated fused multiply-subtract: both the first multiplicand
; (derived from %a) and the addend (%c) are negated with vector
; `fsub -0.0, x`, and the pass-through is the un-negated lane 0 of %c. The
; CHECK lines expect the dead insertelement chains on %a and %b to be removed,
; both negations to become scalar `fneg` on the extracted lanes, and the
; selected scalar to be inserted into lane 0 of %c.
1601define <4 x float> @test_mask3_vfnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1602;
1603; CHECK-LABEL: @test_mask3_vfnmsub_ss(
1604; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
1605; CHECK-NEXT:    [[TMP2:%.*]] = fneg float [[TMP1]]
1606; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
1607; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
1608; CHECK-NEXT:    [[TMP5:%.*]] = fneg float [[TMP4]]
1609; CHECK-NEXT:    [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP2]], float [[TMP3]], float [[TMP5]])
1610; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[C]], i64 0
1611; CHECK-NEXT:    [[TMP8:%.*]] = trunc i8 [[MASK:%.*]] to i1
1612; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float [[TMP7]]
1613; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[C]], float [[TMP9]], i64 0
1614; CHECK-NEXT:    ret <4 x float> [[TMP10]]
1615;
1616  %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
1617  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
1618  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
1619  %4 = insertelement <4 x float> %b, float 4.000000e+00, i32 1
1620  %5 = insertelement <4 x float> %4, float 5.000000e+00, i32 2
1621  %6 = insertelement <4 x float> %5, float 6.000000e+00, i32 3
1622  %7 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %3
1623  %8 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
1624  %9 = extractelement <4 x float> %7, i64 0
1625  %10 = extractelement <4 x float> %6, i64 0
1626  %11 = extractelement <4 x float> %8, i64 0
1627  %12 = call float @llvm.fma.f32(float %9, float %10, float %11)
1628  %13 = extractelement <4 x float> %c, i64 0
1629  %14 = bitcast i8 %mask to <8 x i1>
1630  %15 = extractelement <8 x i1> %14, i64 0
1631  %16 = select i1 %15, float %12, float %13
1632  %17 = insertelement <4 x float> %c, float %16, i64 0
1633  ret <4 x float> %17
1634}
1635
; mask3 scalar negated fused multiply-subtract returning only lane 0. %a and
; the constant-padded %c are negated with `fsub -0.0, x`; lanes 1-3 of %c are
; never observed through lane 0. The CHECK lines expect extracts straight from
; %a, %b and %c, scalar `fneg` on the %a and %c lanes, and the scalar select
; returned directly.
1636define float @test_mask3_vfnmsub_ss_0(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1637;
1638; CHECK-LABEL: @test_mask3_vfnmsub_ss_0(
1639; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
1640; CHECK-NEXT:    [[TMP2:%.*]] = fneg float [[TMP1]]
1641; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
1642; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[C:%.*]], i64 0
1643; CHECK-NEXT:    [[TMP5:%.*]] = fneg float [[TMP4]]
1644; CHECK-NEXT:    [[TMP6:%.*]] = call float @llvm.fma.f32(float [[TMP2]], float [[TMP3]], float [[TMP5]])
1645; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[C]], i64 0
1646; CHECK-NEXT:    [[TMP8:%.*]] = trunc i8 [[MASK:%.*]] to i1
1647; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], float [[TMP6]], float [[TMP7]]
1648; CHECK-NEXT:    ret float [[TMP9]]
1649;
1650  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
1651  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
1652  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
1653  %4 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
1654  %5 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %3
1655  %6 = extractelement <4 x float> %4, i64 0
1656  %7 = extractelement <4 x float> %b, i64 0
1657  %8 = extractelement <4 x float> %5, i64 0
1658  %9 = call float @llvm.fma.f32(float %6, float %7, float %8)
1659  %10 = extractelement <4 x float> %3, i64 0
1660  %11 = bitcast i8 %mask to <8 x i1>
1661  %12 = extractelement <8 x i1> %11, i64 0
1662  %13 = select i1 %12, float %9, float %10
1663  %14 = insertelement <4 x float> %3, float %13, i64 0
1664  %15 = extractelement <4 x float> %14, i32 0
1665  ret float %15
1666}
1667
; mask3 scalar negated fused multiply-subtract (negations spelled as
; `fsub -0.0, x`) returning lane 1 of the result vector. Lane 1 is the constant
; 1.0 inserted into %c by %1, so the CHECK lines expect
; `ret float 1.000000e+00`.
1668define float @test_mask3_vfnmsub_ss_1(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1669;
1670; CHECK-LABEL: @test_mask3_vfnmsub_ss_1(
1671; CHECK-NEXT:    ret float 1.000000e+00
1672;
1673  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
1674  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
1675  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
1676  %4 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
1677  %5 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %3
1678  %6 = extractelement <4 x float> %4, i64 0
1679  %7 = extractelement <4 x float> %b, i64 0
1680  %8 = extractelement <4 x float> %5, i64 0
1681  %9 = call float @llvm.fma.f32(float %6, float %7, float %8)
1682  %10 = extractelement <4 x float> %3, i64 0
1683  %11 = bitcast i8 %mask to <8 x i1>
1684  %12 = extractelement <8 x i1> %11, i64 0
1685  %13 = select i1 %12, float %9, float %10
1686  %14 = insertelement <4 x float> %3, float %13, i64 0
1687  %15 = extractelement <4 x float> %14, i32 1
1688  ret float %15
1689}
1690
; Variant of the lane-1 vfnmsub.ss test in which both negations use the unary
; `fneg` instruction instead of `fsub -0.0, x`. The returned lane 1 is still
; the constant 1.0 inserted by %1, so the CHECK lines expect
; `ret float 1.000000e+00`.
1691define float @test_mask3_vfnmsub_ss_1_unary_fneg(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
1692;
1693; CHECK-LABEL: @test_mask3_vfnmsub_ss_1_unary_fneg(
1694; CHECK-NEXT:    ret float 1.000000e+00
1695;
1696  %1 = insertelement <4 x float> %c, float 1.000000e+00, i32 1
1697  %2 = insertelement <4 x float> %1, float 2.000000e+00, i32 2
1698  %3 = insertelement <4 x float> %2, float 3.000000e+00, i32 3
1699  %4 = fneg <4 x float> %a
1700  %5 = fneg <4 x float> %3
1701  %6 = extractelement <4 x float> %4, i64 0
1702  %7 = extractelement <4 x float> %b, i64 0
1703  %8 = extractelement <4 x float> %5, i64 0
1704  %9 = call float @llvm.fma.f32(float %6, float %7, float %8)
1705  %10 = extractelement <4 x float> %3, i64 0
1706  %11 = bitcast i8 %mask to <8 x i1>
1707  %12 = extractelement <8 x i1> %11, i64 0
1708  %13 = select i1 %12, float %9, float %10
1709  %14 = insertelement <4 x float> %3, float %13, i64 0
1710  %15 = extractelement <4 x float> %14, i32 1
1711  ret float %15
1712}
1713
; Double-precision mask3 scalar negated fused multiply-subtract returning the
; full vector. The constant-padded %a and %c are negated with `fsub -0.0, x`;
; the pass-through is the un-negated lane 0 of %c. The CHECK lines expect the
; unused lane-1 inserts on %a and %b to be removed, both negations to become
; scalar `fneg`, and the selected scalar to be inserted into lane 0 of %c.
1714define <2 x double> @test_mask3_vfnmsub_sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1715;
1716; CHECK-LABEL: @test_mask3_vfnmsub_sd(
1717; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
1718; CHECK-NEXT:    [[TMP2:%.*]] = fneg double [[TMP1]]
1719; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
1720; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
1721; CHECK-NEXT:    [[TMP5:%.*]] = fneg double [[TMP4]]
1722; CHECK-NEXT:    [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP2]], double [[TMP3]], double [[TMP5]])
1723; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[C]], i64 0
1724; CHECK-NEXT:    [[TMP8:%.*]] = trunc i8 [[MASK:%.*]] to i1
1725; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], double [[TMP6]], double [[TMP7]]
1726; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[C]], double [[TMP9]], i64 0
1727; CHECK-NEXT:    ret <2 x double> [[TMP10]]
1728;
1729  %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 1
1730  %2 = insertelement <2 x double> %b, double 2.000000e+00, i32 1
1731  %3 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %1
1732  %4 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
1733  %5 = extractelement <2 x double> %3, i64 0
1734  %6 = extractelement <2 x double> %2, i64 0
1735  %7 = extractelement <2 x double> %4, i64 0
1736  %8 = call double @llvm.fma.f64(double %5, double %6, double %7)
1737  %9 = extractelement <2 x double> %c, i64 0
1738  %10 = bitcast i8 %mask to <8 x i1>
1739  %11 = extractelement <8 x i1> %10, i64 0
1740  %12 = select i1 %11, double %8, double %9
1741  %13 = insertelement <2 x double> %c, double %12, i64 0
1742  ret <2 x double> %13
1743}
1744
; Double-precision mask3 scalar negated fused multiply-subtract returning only
; lane 0. The constant in lane 1 of %c is never observed through lane 0, so
; the CHECK lines expect extracts straight from %a, %b and %c, scalar `fneg`
; on the %a and %c lanes, and the scalar select returned directly.
1745define double @test_mask3_vfnmsub_sd_0(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1746;
1747; CHECK-LABEL: @test_mask3_vfnmsub_sd_0(
1748; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[A:%.*]], i64 0
1749; CHECK-NEXT:    [[TMP2:%.*]] = fneg double [[TMP1]]
1750; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
1751; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[C:%.*]], i64 0
1752; CHECK-NEXT:    [[TMP5:%.*]] = fneg double [[TMP4]]
1753; CHECK-NEXT:    [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP2]], double [[TMP3]], double [[TMP5]])
1754; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[C]], i64 0
1755; CHECK-NEXT:    [[TMP8:%.*]] = trunc i8 [[MASK:%.*]] to i1
1756; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], double [[TMP6]], double [[TMP7]]
1757; CHECK-NEXT:    ret double [[TMP9]]
1758;
1759  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
1760  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
1761  %3 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %1
1762  %4 = extractelement <2 x double> %2, i64 0
1763  %5 = extractelement <2 x double> %b, i64 0
1764  %6 = extractelement <2 x double> %3, i64 0
1765  %7 = call double @llvm.fma.f64(double %4, double %5, double %6)
1766  %8 = extractelement <2 x double> %1, i64 0
1767  %9 = bitcast i8 %mask to <8 x i1>
1768  %10 = extractelement <8 x i1> %9, i64 0
1769  %11 = select i1 %10, double %7, double %8
1770  %12 = insertelement <2 x double> %1, double %11, i64 0
1771  %13 = extractelement <2 x double> %12, i32 0
1772  ret double %13
1773}
1774
; Double-precision mask3 scalar negated fused multiply-subtract (negations
; spelled as `fsub -0.0, x`) returning lane 1, which is the constant 1.0
; inserted into %c by %1. The CHECK lines expect `ret double 1.000000e+00`.
1775define double @test_mask3_vfnmsub_sd_1(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1776;
1777; CHECK-LABEL: @test_mask3_vfnmsub_sd_1(
1778; CHECK-NEXT:    ret double 1.000000e+00
1779;
1780  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
1781  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
1782  %3 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %1
1783  %4 = extractelement <2 x double> %2, i64 0
1784  %5 = extractelement <2 x double> %b, i64 0
1785  %6 = extractelement <2 x double> %3, i64 0
1786  %7 = call double @llvm.fma.f64(double %4, double %5, double %6)
1787  %8 = extractelement <2 x double> %1, i64 0
1788  %9 = bitcast i8 %mask to <8 x i1>
1789  %10 = extractelement <8 x i1> %9, i64 0
1790  %11 = select i1 %10, double %7, double %8
1791  %12 = insertelement <2 x double> %1, double %11, i64 0
1792  %13 = extractelement <2 x double> %12, i32 1
1793  ret double %13
1794}
1795
1796define double @test_mask3_vfnmsub_sd_1_unary_fneg(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
1797; Same shape as the lane-1 vfnmsub_sd test but negation is written with unary fneg instead of fsub from -0.0; lane 1 is still the inserted constant 1.0, so the expected output is just the constant return.
1798; CHECK-LABEL: @test_mask3_vfnmsub_sd_1_unary_fneg(
1799; CHECK-NEXT:    ret double 1.000000e+00
1800;
1801  %1 = insertelement <2 x double> %c, double 1.000000e+00, i32 1
1802  %2 = fneg <2 x double> %a
1803  %3 = fneg <2 x double> %1
1804  %4 = extractelement <2 x double> %2, i64 0
1805  %5 = extractelement <2 x double> %b, i64 0
1806  %6 = extractelement <2 x double> %3, i64 0
1807  %7 = call double @llvm.fma.f64(double %4, double %5, double %6)
1808  %8 = extractelement <2 x double> %1, i64 0
1809  %9 = bitcast i8 %mask to <8 x i1>
1810  %10 = extractelement <8 x i1> %9, i64 0
1811  %11 = select i1 %10, double %7, double %8
1812  %12 = insertelement <2 x double> %1, double %11, i64 0
1813  %13 = extractelement <2 x double> %12, i32 1
1814  ret double %13
1815}
1816
1817declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32)
1818
1819define <16 x float> @test_add_ps(<16 x float> %a, <16 x float> %b) {
1820; The add.ps.512 call passes i32 4 as its last operand; the expected output replaces it with a plain <16 x float> fadd.
1821; CHECK-LABEL: @test_add_ps(
1822; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]]
1823; CHECK-NEXT:    ret <16 x float> [[TMP1]]
1824;
1825  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
1826  ret <16 x float> %1
1827}
1828
1829define <16 x float> @test_add_ps_round(<16 x float> %a, <16 x float> %b) {
1830; The add.ps.512 call passes i32 8 as its last operand; the expected output keeps the intrinsic call unchanged (no fold to fadd).
1831; CHECK-LABEL: @test_add_ps_round(
1832; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
1833; CHECK-NEXT:    ret <16 x float> [[TMP1]]
1834;
1835  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
1836  ret <16 x float> %1
1837}
1838
1839define <16 x float> @test_add_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
1840; add.ps.512 with last operand i32 4, blended with %c through a select on %mask bitcast to <16 x i1>; the expected output folds the call to fadd and keeps the bitcast and select.
1841; CHECK-LABEL: @test_add_ps_mask(
1842; CHECK-NEXT:    [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]]
1843; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
1844; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
1845; CHECK-NEXT:    ret <16 x float> [[TMP3]]
1846;
1847  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
1848  %2 = bitcast i16 %mask to <16 x i1>
1849  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
1850  ret <16 x float> %3
1851}
1852
1853define <16 x float> @test_add_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
1854; add.ps.512 with last operand i32 8, blended with %c through a select on %mask; the expected output leaves the intrinsic call, bitcast and select all unchanged.
1855; CHECK-LABEL: @test_add_ps_mask_round(
1856; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
1857; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
1858; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
1859; CHECK-NEXT:    ret <16 x float> [[TMP3]]
1860;
1861  %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
1862  %2 = bitcast i16 %mask to <16 x i1>
1863  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
1864  ret <16 x float> %3
1865}
1866
1867declare <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double>, <8 x double>, i32)
1868
1869define <8 x double> @test_add_pd(<8 x double> %a, <8 x double> %b) {
1870; The add.pd.512 call passes i32 4 as its last operand; the expected output replaces it with a plain <8 x double> fadd.
1871; CHECK-LABEL: @test_add_pd(
1872; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]]
1873; CHECK-NEXT:    ret <8 x double> [[TMP1]]
1874;
1875  %1 = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
1876  ret <8 x double> %1
1877}
1878
1879define <8 x double> @test_add_pd_round(<8 x double> %a, <8 x double> %b) {
1880; The add.pd.512 call passes i32 8 as its last operand; the expected output keeps the intrinsic call unchanged (no fold to fadd).
1881; CHECK-LABEL: @test_add_pd_round(
1882; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
1883; CHECK-NEXT:    ret <8 x double> [[TMP1]]
1884;
1885  %1 = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
1886  ret <8 x double> %1
1887}
1888
1889define <8 x double> @test_add_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
1890; add.pd.512 with last operand i32 4, blended with %c through a select on %mask bitcast to <8 x i1>; the expected output folds the call to fadd and keeps the bitcast and select.
1891; CHECK-LABEL: @test_add_pd_mask(
1892; CHECK-NEXT:    [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]]
1893; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
1894; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
1895; CHECK-NEXT:    ret <8 x double> [[TMP3]]
1896;
1897  %1 = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
1898  %2 = bitcast i8 %mask to <8 x i1>
1899  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
1900  ret <8 x double> %3
1901}
1902
1903define <8 x double> @test_add_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
1904; add.pd.512 with last operand i32 8, blended with %c through a select on %mask; the expected output leaves the intrinsic call, bitcast and select all unchanged.
1905; CHECK-LABEL: @test_add_pd_mask_round(
1906; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
1907; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
1908; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
1909; CHECK-NEXT:    ret <8 x double> [[TMP3]]
1910;
1911  %1 = call <8 x double> @llvm.x86.avx512.add.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
1912  %2 = bitcast i8 %mask to <8 x i1>
1913  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
1914  ret <8 x double> %3
1915}
1916
1917declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32)
1918
1919define <16 x float> @test_sub_ps(<16 x float> %a, <16 x float> %b) {
1920; The sub.ps.512 call passes i32 4 as its last operand; the expected output replaces it with a plain <16 x float> fsub.
1921; CHECK-LABEL: @test_sub_ps(
1922; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
1923; CHECK-NEXT:    ret <16 x float> [[TMP1]]
1924;
1925  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
1926  ret <16 x float> %1
1927}
1928
1929define <16 x float> @test_sub_ps_round(<16 x float> %a, <16 x float> %b) {
1930; The sub.ps.512 call passes i32 8 as its last operand; the expected output keeps the intrinsic call unchanged (no fold to fsub).
1931; CHECK-LABEL: @test_sub_ps_round(
1932; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
1933; CHECK-NEXT:    ret <16 x float> [[TMP1]]
1934;
1935  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
1936  ret <16 x float> %1
1937}
1938
1939define <16 x float> @test_sub_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
1940; sub.ps.512 with last operand i32 4, blended with %c through a select on %mask bitcast to <16 x i1>; the expected output folds the call to fsub and keeps the bitcast and select.
1941; CHECK-LABEL: @test_sub_ps_mask(
1942; CHECK-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
1943; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
1944; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
1945; CHECK-NEXT:    ret <16 x float> [[TMP3]]
1946;
1947  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
1948  %2 = bitcast i16 %mask to <16 x i1>
1949  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
1950  ret <16 x float> %3
1951}
1952
1953define <16 x float> @test_sub_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
1954; sub.ps.512 with last operand i32 8, blended with %c through a select on %mask; the expected output leaves the intrinsic call, bitcast and select all unchanged.
1955; CHECK-LABEL: @test_sub_ps_mask_round(
1956; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
1957; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
1958; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
1959; CHECK-NEXT:    ret <16 x float> [[TMP3]]
1960;
1961  %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
1962  %2 = bitcast i16 %mask to <16 x i1>
1963  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
1964  ret <16 x float> %3
1965}
1966
1967declare <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double>, <8 x double>, i32)
1968
1969define <8 x double> @test_sub_pd(<8 x double> %a, <8 x double> %b) {
1970; The sub.pd.512 call passes i32 4 as its last operand; the expected output replaces it with a plain <8 x double> fsub.
1971; CHECK-LABEL: @test_sub_pd(
1972; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
1973; CHECK-NEXT:    ret <8 x double> [[TMP1]]
1974;
1975  %1 = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
1976  ret <8 x double> %1
1977}
1978
1979define <8 x double> @test_sub_pd_round(<8 x double> %a, <8 x double> %b) {
1980; The sub.pd.512 call passes i32 8 as its last operand; the expected output keeps the intrinsic call unchanged (no fold to fsub).
1981; CHECK-LABEL: @test_sub_pd_round(
1982; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
1983; CHECK-NEXT:    ret <8 x double> [[TMP1]]
1984;
1985  %1 = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
1986  ret <8 x double> %1
1987}
1988
1989define <8 x double> @test_sub_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
1990; sub.pd.512 with last operand i32 4, blended with %c through a select on %mask bitcast to <8 x i1>; the expected output folds the call to fsub and keeps the bitcast and select.
1991; CHECK-LABEL: @test_sub_pd_mask(
1992; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
1993; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
1994; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
1995; CHECK-NEXT:    ret <8 x double> [[TMP3]]
1996;
1997  %1 = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
1998  %2 = bitcast i8 %mask to <8 x i1>
1999  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
2000  ret <8 x double> %3
2001}
2002
2003define <8 x double> @test_sub_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
2004; sub.pd.512 with last operand i32 8, blended with %c through a select on %mask; the expected output leaves the intrinsic call, bitcast and select all unchanged.
2005; CHECK-LABEL: @test_sub_pd_mask_round(
2006; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
2007; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
2008; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
2009; CHECK-NEXT:    ret <8 x double> [[TMP3]]
2010;
2011  %1 = call <8 x double> @llvm.x86.avx512.sub.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
2012  %2 = bitcast i8 %mask to <8 x i1>
2013  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
2014  ret <8 x double> %3
2015}
2016
2017declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32)
2018
2019define <16 x float> @test_mul_ps(<16 x float> %a, <16 x float> %b) {
2020; The mul.ps.512 call passes i32 4 as its last operand; the expected output replaces it with a plain <16 x float> fmul.
2021; CHECK-LABEL: @test_mul_ps(
2022; CHECK-NEXT:    [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]]
2023; CHECK-NEXT:    ret <16 x float> [[TMP1]]
2024;
2025  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
2026  ret <16 x float> %1
2027}
2028
2029define <16 x float> @test_mul_ps_round(<16 x float> %a, <16 x float> %b) {
2030; The mul.ps.512 call passes i32 8 as its last operand; the expected output keeps the intrinsic call unchanged (no fold to fmul).
2031; CHECK-LABEL: @test_mul_ps_round(
2032; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
2033; CHECK-NEXT:    ret <16 x float> [[TMP1]]
2034;
2035  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
2036  ret <16 x float> %1
2037}
2038
2039define <16 x float> @test_mul_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
2040; mul.ps.512 with last operand i32 4, blended with %c through a select on %mask bitcast to <16 x i1>; the expected output folds the call to fmul and keeps the bitcast and select.
2041; CHECK-LABEL: @test_mul_ps_mask(
2042; CHECK-NEXT:    [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]]
2043; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
2044; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
2045; CHECK-NEXT:    ret <16 x float> [[TMP3]]
2046;
2047  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
2048  %2 = bitcast i16 %mask to <16 x i1>
2049  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
2050  ret <16 x float> %3
2051}
2052
2053define <16 x float> @test_mul_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
2054; mul.ps.512 with last operand i32 8, blended with %c through a select on %mask; the expected output leaves the intrinsic call, bitcast and select all unchanged.
2055; CHECK-LABEL: @test_mul_ps_mask_round(
2056; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
2057; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
2058; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
2059; CHECK-NEXT:    ret <16 x float> [[TMP3]]
2060;
2061  %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
2062  %2 = bitcast i16 %mask to <16 x i1>
2063  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
2064  ret <16 x float> %3
2065}
2066
2067declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32)
2068
2069define <8 x double> @test_mul_pd(<8 x double> %a, <8 x double> %b) {
2070; The mul.pd.512 call passes i32 4 as its last operand; the expected output replaces it with a plain <8 x double> fmul.
2071; CHECK-LABEL: @test_mul_pd(
2072; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]]
2073; CHECK-NEXT:    ret <8 x double> [[TMP1]]
2074;
2075  %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
2076  ret <8 x double> %1
2077}
2078
2079define <8 x double> @test_mul_pd_round(<8 x double> %a, <8 x double> %b) {
2080; The mul.pd.512 call passes i32 8 as its last operand; the expected output keeps the intrinsic call unchanged (no fold to fmul).
2081; CHECK-LABEL: @test_mul_pd_round(
2082; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
2083; CHECK-NEXT:    ret <8 x double> [[TMP1]]
2084;
2085  %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
2086  ret <8 x double> %1
2087}
2088
2089define <8 x double> @test_mul_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
2090; mul.pd.512 with last operand i32 4, blended with %c through a select on %mask bitcast to <8 x i1>; the expected output folds the call to fmul and keeps the bitcast and select.
2091; CHECK-LABEL: @test_mul_pd_mask(
2092; CHECK-NEXT:    [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]]
2093; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
2094; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
2095; CHECK-NEXT:    ret <8 x double> [[TMP3]]
2096;
2097  %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
2098  %2 = bitcast i8 %mask to <8 x i1>
2099  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
2100  ret <8 x double> %3
2101}
2102
2103define <8 x double> @test_mul_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
2104; mul.pd.512 with last operand i32 8, blended with %c through a select on %mask; the expected output leaves the intrinsic call, bitcast and select all unchanged.
2105; CHECK-LABEL: @test_mul_pd_mask_round(
2106; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
2107; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
2108; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
2109; CHECK-NEXT:    ret <8 x double> [[TMP3]]
2110;
2111  %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
2112  %2 = bitcast i8 %mask to <8 x i1>
2113  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
2114  ret <8 x double> %3
2115}
2116
2117declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32)
2118
2119define <16 x float> @test_div_ps(<16 x float> %a, <16 x float> %b) {
2120; The div.ps.512 call passes i32 4 as its last operand; the expected output replaces it with a plain <16 x float> fdiv.
2121; CHECK-LABEL: @test_div_ps(
2122; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]]
2123; CHECK-NEXT:    ret <16 x float> [[TMP1]]
2124;
2125  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
2126  ret <16 x float> %1
2127}
2128
2129define <16 x float> @test_div_ps_round(<16 x float> %a, <16 x float> %b) {
2130; The div.ps.512 call passes i32 8 as its last operand; the expected output keeps the intrinsic call unchanged (no fold to fdiv).
2131; CHECK-LABEL: @test_div_ps_round(
2132; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
2133; CHECK-NEXT:    ret <16 x float> [[TMP1]]
2134;
2135  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
2136  ret <16 x float> %1
2137}
2138
2139define <16 x float> @test_div_ps_mask(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
2140; div.ps.512 with last operand i32 4, blended with %c through a select on %mask bitcast to <16 x i1>; the expected output folds the call to fdiv and keeps the bitcast and select.
2141; CHECK-LABEL: @test_div_ps_mask(
2142; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]]
2143; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
2144; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
2145; CHECK-NEXT:    ret <16 x float> [[TMP3]]
2146;
2147  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a, <16 x float> %b, i32 4)
2148  %2 = bitcast i16 %mask to <16 x i1>
2149  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
2150  ret <16 x float> %3
2151}
2152
2153define <16 x float> @test_div_ps_mask_round(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
2154; div.ps.512 with last operand i32 8, blended with %c through a select on %mask; the expected output leaves the intrinsic call, bitcast and select all unchanged.
2155; CHECK-LABEL: @test_div_ps_mask_round(
2156; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 8)
2157; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
2158; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[C:%.*]]
2159; CHECK-NEXT:    ret <16 x float> [[TMP3]]
2160;
2161  %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a, <16 x float> %b, i32 8)
2162  %2 = bitcast i16 %mask to <16 x i1>
2163  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %c
2164  ret <16 x float> %3
2165}
2166
2167declare <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double>, <8 x double>, i32)
2168
2169define <8 x double> @test_div_pd(<8 x double> %a, <8 x double> %b) {
2170; The div.pd.512 call passes i32 4 as its last operand; the expected output replaces it with a plain <8 x double> fdiv.
2171; CHECK-LABEL: @test_div_pd(
2172; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
2173; CHECK-NEXT:    ret <8 x double> [[TMP1]]
2174;
2175  %1 = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
2176  ret <8 x double> %1
2177}
2178
2179define <8 x double> @test_div_pd_round(<8 x double> %a, <8 x double> %b) {
2180; The div.pd.512 call passes i32 8 as its last operand; the expected output keeps the intrinsic call unchanged (no fold to fdiv).
2181; CHECK-LABEL: @test_div_pd_round(
2182; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
2183; CHECK-NEXT:    ret <8 x double> [[TMP1]]
2184;
2185  %1 = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
2186  ret <8 x double> %1
2187}
2188
2189define <8 x double> @test_div_pd_mask(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
2190; div.pd.512 with last operand i32 4, blended with %c through a select on %mask bitcast to <8 x i1>; the expected output folds the call to fdiv and keeps the bitcast and select.
2191; CHECK-LABEL: @test_div_pd_mask(
2192; CHECK-NEXT:    [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
2193; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
2194; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
2195; CHECK-NEXT:    ret <8 x double> [[TMP3]]
2196;
2197  %1 = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> %a, <8 x double> %b, i32 4)
2198  %2 = bitcast i8 %mask to <8 x i1>
2199  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
2200  ret <8 x double> %3
2201}
2202
2203define <8 x double> @test_div_pd_mask_round(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
2204; div.pd.512 with last operand i32 8, blended with %c through a select on %mask; the expected output leaves the intrinsic call, bitcast and select all unchanged.
2205; CHECK-LABEL: @test_div_pd_mask_round(
2206; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 8)
2207; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
2208; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[C:%.*]]
2209; CHECK-NEXT:    ret <8 x double> [[TMP3]]
2210;
2211  %1 = call <8 x double> @llvm.x86.avx512.div.pd.512(<8 x double> %a, <8 x double> %b, i32 8)
2212  %2 = bitcast i8 %mask to <8 x i1>
2213  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %c
2214  ret <8 x double> %3
2215}
2216
2217declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
2218
2219define i32 @test_comi_ss_0(float %a, float %b) {
2220; Both vcomi.ss operands are built by inserting %a / %b into lane 0 and constants into lanes 1-3; the expected output keeps only the lane-0 inserts (upper lanes left poison) feeding the unchanged vcomi.ss call.
2221; CHECK-LABEL: @test_comi_ss_0(
2222; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i64 0
2223; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
2224; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> [[TMP1]], <4 x float> [[TMP2]], i32 0, i32 4)
2225; CHECK-NEXT:    ret i32 [[TMP3]]
2226;
2227  %1 = insertelement <4 x float> poison, float %a, i32 0
2228  %2 = insertelement <4 x float> %1, float 1.000000e+00, i32 1
2229  %3 = insertelement <4 x float> %2, float 2.000000e+00, i32 2
2230  %4 = insertelement <4 x float> %3, float 3.000000e+00, i32 3
2231  %5 = insertelement <4 x float> poison, float %b, i32 0
2232  %6 = insertelement <4 x float> %5, float 4.000000e+00, i32 1
2233  %7 = insertelement <4 x float> %6, float 5.000000e+00, i32 2
2234  %8 = insertelement <4 x float> %7, float 6.000000e+00, i32 3
2235  %9 = tail call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %4, <4 x float> %8, i32 0, i32 4)
2236  ret i32 %9
2237}
2238
2239declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)
2240
2241define i32 @test_comi_sd_0(double %a, double %b) {
2242; Both vcomi.sd operands are built by inserting %a / %b into lane 0 and a constant into lane 1; the expected output keeps only the lane-0 inserts (lane 1 left poison) feeding the unchanged vcomi.sd call.
2243; CHECK-LABEL: @test_comi_sd_0(
2244; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
2245; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[B:%.*]], i64 0
2246; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[TMP1]], <2 x double> [[TMP2]], i32 0, i32 4)
2247; CHECK-NEXT:    ret i32 [[TMP3]]
2248;
2249  %1 = insertelement <2 x double> poison, double %a, i32 0
2250  %2 = insertelement <2 x double> %1, double 1.000000e+00, i32 1
2251  %3 = insertelement <2 x double> poison, double %b, i32 0
2252  %4 = insertelement <2 x double> %3, double 2.000000e+00, i32 1
2253  %5 = tail call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %2, <2 x double> %4, i32 0, i32 4)
2254  ret i32 %5
2255}
2256