xref: /llvm-project/llvm/test/CodeGen/X86/combine-pmadd.ll (revision d85da4af4983fb8997865014ca5f87ad2db5e272)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
5
6declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
7declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
8
; pmaddwd with an all-zero second operand must fold to a zero vector (no pmaddwd emitted).
9define <4 x i32> @combine_pmaddwd_zero(<8 x i16> %a0, <8 x i16> %a1) {
10; SSE-LABEL: combine_pmaddwd_zero:
11; SSE:       # %bb.0:
12; SSE-NEXT:    xorps %xmm0, %xmm0
13; SSE-NEXT:    retq
14;
15; AVX-LABEL: combine_pmaddwd_zero:
16; AVX:       # %bb.0:
17; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
18; AVX-NEXT:    retq
19  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> zeroinitializer)
20  ret <4 x i32> %1
21}
22
; Same fold with the zero vector as the first operand (commuted form).
23define <4 x i32> @combine_pmaddwd_zero_commute(<8 x i16> %a0, <8 x i16> %a1) {
24; SSE-LABEL: combine_pmaddwd_zero_commute:
25; SSE:       # %bb.0:
26; SSE-NEXT:    xorps %xmm0, %xmm0
27; SSE-NEXT:    retq
28;
29; AVX-LABEL: combine_pmaddwd_zero_commute:
30; AVX:       # %bb.0:
31; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
32; AVX-NEXT:    retq
33  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> zeroinitializer, <8 x i16> %a0)
34  ret <4 x i32> %1
35}
36
; Concatenation of two 128-bit pmaddwd results: on AVX2 the two calls should be
; merged into a single 256-bit vpmaddwd on the concatenated operands.
37define <8 x i32> @combine_pmaddwd_concat(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
38; SSE-LABEL: combine_pmaddwd_concat:
39; SSE:       # %bb.0:
40; SSE-NEXT:    pmaddwd %xmm1, %xmm0
41; SSE-NEXT:    pmaddwd %xmm3, %xmm2
42; SSE-NEXT:    movdqa %xmm2, %xmm1
43; SSE-NEXT:    retq
44;
45; AVX1-LABEL: combine_pmaddwd_concat:
46; AVX1:       # %bb.0:
47; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
48; AVX1-NEXT:    vpmaddwd %xmm3, %xmm2, %xmm1
49; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
50; AVX1-NEXT:    retq
51;
52; AVX2-LABEL: combine_pmaddwd_concat:
53; AVX2:       # %bb.0:
54; AVX2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
55; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
56; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
57; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
58; AVX2-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
59; AVX2-NEXT:    retq
60  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
61  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a2, <8 x i16> %a3)
62  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
63  ret <8 x i32> %3
64}
65
; As above, but the pmaddwd results are frozen before the concat; the freeze must
; not block the merge into a single (wider on AVX2) pmaddwd with a shared constant.
66define <8 x i32> @combine_pmaddwd_concat_freeze(<8 x i16> %a0, <8 x i16> %a1) {
67; SSE-LABEL: combine_pmaddwd_concat_freeze:
68; SSE:       # %bb.0:
69; SSE-NEXT:    pmovsxbw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
70; SSE-NEXT:    pmaddwd %xmm2, %xmm0
71; SSE-NEXT:    pmaddwd %xmm2, %xmm1
72; SSE-NEXT:    retq
73;
74; AVX1-LABEL: combine_pmaddwd_concat_freeze:
75; AVX1:       # %bb.0:
76; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
77; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
78; AVX1-NEXT:    vpmaddwd %xmm2, %xmm1, %xmm1
79; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
80; AVX1-NEXT:    retq
81;
82; AVX2-LABEL: combine_pmaddwd_concat_freeze:
83; AVX2:       # %bb.0:
84; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
85; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
86; AVX2-NEXT:    vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
87; AVX2-NEXT:    retq
88  %lo = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
89  %hi = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
90  %flo = freeze <4 x i32> %lo
91  %fhi = freeze <4 x i32> %hi
92  %res = shufflevector <4 x i32> %flo, <4 x i32> %fhi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
93  ret <8 x i32> %res
94}
95
; Only result element 0 is demanded (splat shuffle of lane 0), so the input
; shuffles feeding the upper word pairs should be elided entirely.
96define <4 x i32> @combine_pmaddwd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
97; SSE-LABEL: combine_pmaddwd_demandedelts:
98; SSE:       # %bb.0:
99; SSE-NEXT:    pmaddwd %xmm1, %xmm0
100; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
101; SSE-NEXT:    retq
102;
103; AVX1-LABEL: combine_pmaddwd_demandedelts:
104; AVX1:       # %bb.0:
105; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
106; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
107; AVX1-NEXT:    retq
108;
109; AVX2-LABEL: combine_pmaddwd_demandedelts:
110; AVX2:       # %bb.0:
111; AVX2-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
112; AVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
113; AVX2-NEXT:    retq
114  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
115  %2 = shufflevector <8 x i16> %a1, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 7, i32 7>
116  %3 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %1, <8 x i16> %2)
117  %4 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> zeroinitializer
118  ret <4 x i32> %4
119}
120
121; [2]: (-5*13)+(6*-15) = -155 = 4294967141
; pmaddwd of two constant vectors must constant-fold to a single vector load.
122define <4 x i32> @combine_pmaddwd_constant() {
123; SSE-LABEL: combine_pmaddwd_constant:
124; SSE:       # %bb.0:
125; SSE-NEXT:    movaps {{.*#+}} xmm0 = [19,17,4294967141,271]
126; SSE-NEXT:    retq
127;
128; AVX-LABEL: combine_pmaddwd_constant:
129; AVX:       # %bb.0:
130; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [19,17,4294967141,271]
131; AVX-NEXT:    retq
132  %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> <i16 -1, i16 2, i16 3, i16 -4, i16 -5, i16 6, i16 7, i16 -8>, <8 x i16> <i16 -5, i16 7, i16 -9, i16 -11, i16 13, i16 -15, i16 17, i16 -19>)
133  ret <4 x i32> %1
134}
135
136; ensure we don't assume pmaddwd performs add nsw
137; [0]: (-32768*-32768)+(-32768*-32768) = 0x80000000 = 2147483648
; Constant fold where each pair-sum wraps past INT32_MAX; the fold must keep the
; wrapped value 0x80000000, i.e. it must not treat pmaddwd's add as nsw.
138define <4 x i32> @combine_pmaddwd_constant_nsw() {
139; SSE-LABEL: combine_pmaddwd_constant_nsw:
140; SSE:       # %bb.0:
141; SSE-NEXT:    movaps {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
142; SSE-NEXT:    retq
143;
144; AVX-LABEL: combine_pmaddwd_constant_nsw:
145; AVX:       # %bb.0:
146; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
147; AVX-NEXT:    retq
148  %1 = insertelement <8 x i16> undef, i16 32768, i32 0
149  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
150  %3 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %2, <8 x i16> %2)
151  ret <4 x i32> %3
152}
153
; pmaddubsw with an all-zero second operand must fold to a zero vector.
154define <8 x i16> @combine_pmaddubsw_zero(<16 x i8> %a0, <16 x i8> %a1) {
155; SSE-LABEL: combine_pmaddubsw_zero:
156; SSE:       # %bb.0:
157; SSE-NEXT:    xorps %xmm0, %xmm0
158; SSE-NEXT:    retq
159;
160; AVX-LABEL: combine_pmaddubsw_zero:
161; AVX:       # %bb.0:
162; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
163; AVX-NEXT:    retq
164  %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> zeroinitializer)
165  ret <8 x i16> %1
166}
167
; Same fold with the zero vector as the first (unsigned) operand.
168define <8 x i16> @combine_pmaddubsw_zero_commute(<16 x i8> %a0, <16 x i8> %a1) {
169; SSE-LABEL: combine_pmaddubsw_zero_commute:
170; SSE:       # %bb.0:
171; SSE-NEXT:    xorps %xmm0, %xmm0
172; SSE-NEXT:    retq
173;
174; AVX-LABEL: combine_pmaddubsw_zero_commute:
175; AVX:       # %bb.0:
176; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
177; AVX-NEXT:    retq
178  %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> zeroinitializer, <16 x i8> %a0)
179  ret <8 x i16> %1
180}
181
; Concatenation of two 128-bit pmaddubsw results: on AVX2 the two calls should be
; merged into a single 256-bit vpmaddubsw on the concatenated operands.
182define <16 x i16> @combine_pmaddubsw_concat(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16 x i8> %a3) {
183; SSE-LABEL: combine_pmaddubsw_concat:
184; SSE:       # %bb.0:
185; SSE-NEXT:    pmaddubsw %xmm1, %xmm0
186; SSE-NEXT:    pmaddubsw %xmm3, %xmm2
187; SSE-NEXT:    movdqa %xmm2, %xmm1
188; SSE-NEXT:    retq
189;
190; AVX1-LABEL: combine_pmaddubsw_concat:
191; AVX1:       # %bb.0:
192; AVX1-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0
193; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm2, %xmm1
194; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
195; AVX1-NEXT:    retq
196;
197; AVX2-LABEL: combine_pmaddubsw_concat:
198; AVX2:       # %bb.0:
199; AVX2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
200; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
201; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
202; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
203; AVX2-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
204; AVX2-NEXT:    retq
205  %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
206  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a2, <16 x i8> %a3)
207  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
208  ret <16 x i16> %3
209}
210
; As above, but with freeze on each half; the freeze must not block the merge
; into a single (wider on AVX2) pmaddubsw with a shared constant operand.
211define <16 x i16> @combine_pmaddubsw_concat_freeze(<16 x i8> %a0, <16 x i8> %a1) {
212; SSE-LABEL: combine_pmaddubsw_concat_freeze:
213; SSE:       # %bb.0:
214; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
215; SSE-NEXT:    pmaddubsw %xmm2, %xmm0
216; SSE-NEXT:    pmaddubsw %xmm2, %xmm1
217; SSE-NEXT:    retq
218;
219; AVX1-LABEL: combine_pmaddubsw_concat_freeze:
220; AVX1:       # %bb.0:
221; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
222; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm0, %xmm0
223; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm1, %xmm1
224; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
225; AVX1-NEXT:    retq
226;
227; AVX2-LABEL: combine_pmaddubsw_concat_freeze:
228; AVX2:       # %bb.0:
229; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
230; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
231; AVX2-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
232; AVX2-NEXT:    retq
233  %lo = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
234  %hi = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
235  %flo = freeze <8 x i16> %lo
236  %fhi = freeze <8 x i16> %hi
237  %res = shufflevector <8 x i16> %flo, <8 x i16> %fhi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
238  ret <16 x i16> %res
239}
240
; Only result lanes 0-1 are demanded (the final shuffle repeats them), so the
; input shuffles feeding the upper byte pairs should be elided entirely.
241define <8 x i16> @combine_pmaddubsw_demandedelts(<16 x i8> %a0, <16 x i8> %a1) {
242; SSE-LABEL: combine_pmaddubsw_demandedelts:
243; SSE:       # %bb.0:
244; SSE-NEXT:    pmaddubsw %xmm1, %xmm0
245; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
246; SSE-NEXT:    retq
247;
248; AVX1-LABEL: combine_pmaddubsw_demandedelts:
249; AVX1:       # %bb.0:
250; AVX1-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0
251; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
252; AVX1-NEXT:    retq
253;
254; AVX2-LABEL: combine_pmaddubsw_demandedelts:
255; AVX2:       # %bb.0:
256; AVX2-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0
257; AVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
258; AVX2-NEXT:    retq
259  %1 = shufflevector <16 x i8> %a0, <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
260  %2 = shufflevector <16 x i8> %a1, <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
261  %3 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %1, <16 x i8> %2)
262  %4 = shufflevector <8 x i16> %3, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
263  ret <8 x i16> %4
265
266; [3]: ((uint8_t)-6*7)+(7*-8) = (250*7)+(7*-8) = 1694
; Constant fold: first operand bytes are unsigned, second signed; the extracted
; lane must fold to the immediate 1694 with no pmaddubsw emitted.
267define i32 @combine_pmaddubsw_constant() {
268; CHECK-LABEL: combine_pmaddubsw_constant:
269; CHECK:       # %bb.0:
270; CHECK-NEXT:    movl $1694, %eax # imm = 0x69E
271; CHECK-NEXT:    retq
272  %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 -6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>)
273  %2 = extractelement <8 x i16> %1, i32 3
274  %3 = sext i16 %2 to i32
275  ret i32 %3
276}
277
278; [0]: add_sat_i16(((uint8_t)-1*-128),((uint8_t)-1*-128)) = add_sat_i16((255*-128),(255*-128)) = sat_i16(-65280) = -32768
; Constant fold must model pmaddubsw's signed saturation of the pair sum:
; the lane folds to -32768, not the unsaturated -65280.
279define i32 @combine_pmaddubsw_constant_sat() {
280; CHECK-LABEL: combine_pmaddubsw_constant_sat:
281; CHECK:       # %bb.0:
282; CHECK-NEXT:    movl $-32768, %eax # imm = 0x8000
283; CHECK-NEXT:    retq
284  %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> <i8 -1, i8 -1, i8 2, i8 3, i8 4, i8 5, i8 -6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, <16 x i8> <i8 -128, i8 -128, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>)
285  %2 = extractelement <8 x i16> %1, i32 0
286  %3 = sext i16 %2 to i32
287  ret i32 %3
288}
289
290; Constant folding PMADDWD was causing an infinite loop in the PCMPGT commuting between 2 constant values.
; Regression test: everything must fold to a constant true; previously
; DAGCombine looped forever commuting a PCMPGT between two constants.
291define i1 @pmaddwd_pcmpgt_infinite_loop() {
292; CHECK-LABEL: pmaddwd_pcmpgt_infinite_loop:
293; CHECK:       # %bb.0:
294; CHECK-NEXT:    movb $1, %al
295; CHECK-NEXT:    retq
296  %1 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <8 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>)
297  %2 = icmp eq <4 x i32> %1, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
298  %3 = select <4 x i1> %2, <4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> zeroinitializer
299  %4 = add <4 x i32> %3, <i32 -8, i32 -9, i32 -10, i32 -11>
300  %.not = trunc <4 x i32> %3 to <4 x i1>
301  %5 = icmp sgt <4 x i32> %4, <i32 2147483640, i32 2147483639, i32 2147483638, i32 2147483637>
302  %6 = select <4 x i1> %.not, <4 x i1> %5, <4 x i1> zeroinitializer
303  %7 = bitcast <4 x i1> %6 to i4
304  %8 = icmp eq i4 %7, 0
305  ret i1 %8
306}
307