xref: /llvm-project/llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink.ll (revision f1ec0d12bb0843f0deab83ef2b5cf1339cbc4f0b)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mcpu=corei7 %s | FileCheck %s --check-prefixes=CHECK,CHECK-SSE2
3; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mcpu=bdver2 %s | FileCheck %s --check-prefixes=CHECK,CHECK-XOP
4; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mcpu=core-avx2 %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX2
5; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mcpu=skylake-avx512 %s | FileCheck %s --check-prefixes=CHECK,CHECK-AVX,CHECK-AVX512BW
6
; Module-level target description shared by every function in this file: an
; explicit data layout string and an x86_64 Darwin triple. The individual RUN
; lines select the CPU with -mcpu.
7target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
8target triple = "x86_64-apple-darwin10.9.0"
9
; test_8bit: a zero-index (splat) shuffle of %tmp is defined in the entry
; block, returned as-is on the if_true path, and used as the shift amount of a
; <16 x i8> shl on the if_false path.
; Expected output per check prefix:
;   - CHECK-SSE2 and CHECK-AVX: a second copy of the splat shuffle appears in
;     if_false and feeds the shl; the original remains in the entry block
;     because if_true still returns it.
;   - CHECK-XOP: the IR is unchanged and the shl consumes the entry-block
;     shuffle directly.
; NOTE(review): presumably the copy is skipped where the target has a native
; per-element variable shift for this element width - confirm against the X86
; target hooks.
10define <16 x i8> @test_8bit(<16 x i8> %lhs, <16 x i8> %tmp, i1 %tst) {
11; CHECK-SSE2-LABEL: @test_8bit(
12; CHECK-SSE2-NEXT:    [[MASK:%.*]] = shufflevector <16 x i8> [[TMP:%.*]], <16 x i8> undef, <16 x i32> zeroinitializer
13; CHECK-SSE2-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
14; CHECK-SSE2:       if_true:
15; CHECK-SSE2-NEXT:    ret <16 x i8> [[MASK]]
16; CHECK-SSE2:       if_false:
17; CHECK-SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[TMP]], <16 x i8> undef, <16 x i32> zeroinitializer
18; CHECK-SSE2-NEXT:    [[RES:%.*]] = shl <16 x i8> [[LHS:%.*]], [[TMP1]]
19; CHECK-SSE2-NEXT:    ret <16 x i8> [[RES]]
20;
21; CHECK-XOP-LABEL: @test_8bit(
22; CHECK-XOP-NEXT:    [[MASK:%.*]] = shufflevector <16 x i8> [[TMP:%.*]], <16 x i8> undef, <16 x i32> zeroinitializer
23; CHECK-XOP-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
24; CHECK-XOP:       if_true:
25; CHECK-XOP-NEXT:    ret <16 x i8> [[MASK]]
26; CHECK-XOP:       if_false:
27; CHECK-XOP-NEXT:    [[RES:%.*]] = shl <16 x i8> [[LHS:%.*]], [[MASK]]
28; CHECK-XOP-NEXT:    ret <16 x i8> [[RES]]
29;
30; CHECK-AVX-LABEL: @test_8bit(
31; CHECK-AVX-NEXT:    [[MASK:%.*]] = shufflevector <16 x i8> [[TMP:%.*]], <16 x i8> undef, <16 x i32> zeroinitializer
32; CHECK-AVX-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
33; CHECK-AVX:       if_true:
34; CHECK-AVX-NEXT:    ret <16 x i8> [[MASK]]
35; CHECK-AVX:       if_false:
36; CHECK-AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[TMP]], <16 x i8> undef, <16 x i32> zeroinitializer
37; CHECK-AVX-NEXT:    [[RES:%.*]] = shl <16 x i8> [[LHS:%.*]], [[TMP1]]
38; CHECK-AVX-NEXT:    ret <16 x i8> [[RES]]
39;
40  %mask = shufflevector <16 x i8> %tmp, <16 x i8> undef, <16 x i32> zeroinitializer
41  br i1 %tst, label %if_true, label %if_false
42
43if_true:
44  ret <16 x i8> %mask
45
46if_false:
47  %res = shl <16 x i8> %lhs, %mask
48  ret <16 x i8> %res
49}
50
; test_16bit: same control-flow shape as the 8-bit case, but with <8 x i16>
; vectors. The splat shuffle of %tmp is defined in the entry block, returned on
; if_true, and used as the shl shift amount on if_false.
; Expected output per check prefix:
;   - CHECK-SSE2 and CHECK-AVX2: the splat shuffle is duplicated into if_false
;     and the shl uses the copy.
;   - CHECK-XOP and CHECK-AVX512BW: the IR is unchanged and the shl uses the
;     entry-block shuffle.
; This function uses the AVX2/AVX512BW-specific prefixes instead of the shared
; CHECK-AVX prefix because those two configurations differ here.
51define <8 x i16> @test_16bit(<8 x i16> %lhs, <8 x i16> %tmp, i1 %tst) {
52; CHECK-SSE2-LABEL: @test_16bit(
53; CHECK-SSE2-NEXT:    [[MASK:%.*]] = shufflevector <8 x i16> [[TMP:%.*]], <8 x i16> undef, <8 x i32> zeroinitializer
54; CHECK-SSE2-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
55; CHECK-SSE2:       if_true:
56; CHECK-SSE2-NEXT:    ret <8 x i16> [[MASK]]
57; CHECK-SSE2:       if_false:
58; CHECK-SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP]], <8 x i16> undef, <8 x i32> zeroinitializer
59; CHECK-SSE2-NEXT:    [[RES:%.*]] = shl <8 x i16> [[LHS:%.*]], [[TMP1]]
60; CHECK-SSE2-NEXT:    ret <8 x i16> [[RES]]
61;
62; CHECK-XOP-LABEL: @test_16bit(
63; CHECK-XOP-NEXT:    [[MASK:%.*]] = shufflevector <8 x i16> [[TMP:%.*]], <8 x i16> undef, <8 x i32> zeroinitializer
64; CHECK-XOP-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
65; CHECK-XOP:       if_true:
66; CHECK-XOP-NEXT:    ret <8 x i16> [[MASK]]
67; CHECK-XOP:       if_false:
68; CHECK-XOP-NEXT:    [[RES:%.*]] = shl <8 x i16> [[LHS:%.*]], [[MASK]]
69; CHECK-XOP-NEXT:    ret <8 x i16> [[RES]]
70;
71; CHECK-AVX2-LABEL: @test_16bit(
72; CHECK-AVX2-NEXT:    [[MASK:%.*]] = shufflevector <8 x i16> [[TMP:%.*]], <8 x i16> undef, <8 x i32> zeroinitializer
73; CHECK-AVX2-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
74; CHECK-AVX2:       if_true:
75; CHECK-AVX2-NEXT:    ret <8 x i16> [[MASK]]
76; CHECK-AVX2:       if_false:
77; CHECK-AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP]], <8 x i16> undef, <8 x i32> zeroinitializer
78; CHECK-AVX2-NEXT:    [[RES:%.*]] = shl <8 x i16> [[LHS:%.*]], [[TMP1]]
79; CHECK-AVX2-NEXT:    ret <8 x i16> [[RES]]
80;
81; CHECK-AVX512BW-LABEL: @test_16bit(
82; CHECK-AVX512BW-NEXT:    [[MASK:%.*]] = shufflevector <8 x i16> [[TMP:%.*]], <8 x i16> undef, <8 x i32> zeroinitializer
83; CHECK-AVX512BW-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
84; CHECK-AVX512BW:       if_true:
85; CHECK-AVX512BW-NEXT:    ret <8 x i16> [[MASK]]
86; CHECK-AVX512BW:       if_false:
87; CHECK-AVX512BW-NEXT:    [[RES:%.*]] = shl <8 x i16> [[LHS:%.*]], [[MASK]]
88; CHECK-AVX512BW-NEXT:    ret <8 x i16> [[RES]]
89;
90  %mask = shufflevector <8 x i16> %tmp, <8 x i16> undef, <8 x i32> zeroinitializer
91  br i1 %tst, label %if_true, label %if_false
92
93if_true:
94  ret <8 x i16> %mask
95
96if_false:
97  %res = shl <8 x i16> %lhs, %mask
98  ret <8 x i16> %res
99}
100
; test_notsplat: negative test. The shuffle mask <0, 1, 1, 0> selects more than
; one source lane, so the shift amount is not a splat. The single shared CHECK
; prefix expects identical output for every RUN configuration: the shuffle
; stays in the entry block and the shl in if_false uses it directly, with no
; copy created.
101define <4 x i32> @test_notsplat(<4 x i32> %lhs, <4 x i32> %tmp, i1 %tst) {
102; CHECK-LABEL: @test_notsplat(
103; CHECK-NEXT:    [[MASK:%.*]] = shufflevector <4 x i32> [[TMP:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
104; CHECK-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
105; CHECK:       if_true:
106; CHECK-NEXT:    ret <4 x i32> [[MASK]]
107; CHECK:       if_false:
108; CHECK-NEXT:    [[RES:%.*]] = shl <4 x i32> [[LHS:%.*]], [[MASK]]
109; CHECK-NEXT:    ret <4 x i32> [[RES]]
110;
111  %mask = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
112  br i1 %tst, label %if_true, label %if_false
113
114if_true:
115  ret <4 x i32> %mask
116
117if_false:
118  %res = shl <4 x i32> %lhs, %mask
119  ret <4 x i32> %res
120}
121
; test_32bit: <4 x i32> variant using ashr. The shuffle mask is
; <0, undef, 0, 0>: every defined lane selects element 0 and one lane is left
; undefined in the input IR. The check lines expect that lane to be printed as
; `poison`.
; Expected output per check prefix:
;   - CHECK-SSE2: the shuffle (with the same mask) is duplicated into if_false
;     and the ashr uses the copy.
;   - CHECK-XOP and CHECK-AVX: the IR is unchanged and the ashr uses the
;     entry-block shuffle.
122define <4 x i32> @test_32bit(<4 x i32> %lhs, <4 x i32> %tmp, i1 %tst) {
123; CHECK-SSE2-LABEL: @test_32bit(
124; CHECK-SSE2-NEXT:    [[MASK:%.*]] = shufflevector <4 x i32> [[TMP:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 poison, i32 0, i32 0>
125; CHECK-SSE2-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
126; CHECK-SSE2:       if_true:
127; CHECK-SSE2-NEXT:    ret <4 x i32> [[MASK]]
128; CHECK-SSE2:       if_false:
129; CHECK-SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP]], <4 x i32> undef, <4 x i32> <i32 0, i32 poison, i32 0, i32 0>
130; CHECK-SSE2-NEXT:    [[RES:%.*]] = ashr <4 x i32> [[LHS:%.*]], [[TMP1]]
131; CHECK-SSE2-NEXT:    ret <4 x i32> [[RES]]
132;
133; CHECK-XOP-LABEL: @test_32bit(
134; CHECK-XOP-NEXT:    [[MASK:%.*]] = shufflevector <4 x i32> [[TMP:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 poison, i32 0, i32 0>
135; CHECK-XOP-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
136; CHECK-XOP:       if_true:
137; CHECK-XOP-NEXT:    ret <4 x i32> [[MASK]]
138; CHECK-XOP:       if_false:
139; CHECK-XOP-NEXT:    [[RES:%.*]] = ashr <4 x i32> [[LHS:%.*]], [[MASK]]
140; CHECK-XOP-NEXT:    ret <4 x i32> [[RES]]
141;
142; CHECK-AVX-LABEL: @test_32bit(
143; CHECK-AVX-NEXT:    [[MASK:%.*]] = shufflevector <4 x i32> [[TMP:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 poison, i32 0, i32 0>
144; CHECK-AVX-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
145; CHECK-AVX:       if_true:
146; CHECK-AVX-NEXT:    ret <4 x i32> [[MASK]]
147; CHECK-AVX:       if_false:
148; CHECK-AVX-NEXT:    [[RES:%.*]] = ashr <4 x i32> [[LHS:%.*]], [[MASK]]
149; CHECK-AVX-NEXT:    ret <4 x i32> [[RES]]
150;
151  %mask = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
152  br i1 %tst, label %if_true, label %if_false
153
154if_true:
155  ret <4 x i32> %mask
156
157if_false:
158  %res = ashr <4 x i32> %lhs, %mask
159  ret <4 x i32> %res
160}
161
; test_64bit: <2 x i64> variant using lshr. The zero-index splat shuffle of
; %tmp is defined in the entry block, returned on if_true, and used as the
; lshr shift amount on if_false.
; Expected output per check prefix:
;   - CHECK-SSE2: the splat shuffle is duplicated into if_false and the lshr
;     uses the copy.
;   - CHECK-XOP and CHECK-AVX: the IR is unchanged and the lshr uses the
;     entry-block shuffle.
162define <2 x i64> @test_64bit(<2 x i64> %lhs, <2 x i64> %tmp, i1 %tst) {
163; CHECK-SSE2-LABEL: @test_64bit(
164; CHECK-SSE2-NEXT:    [[MASK:%.*]] = shufflevector <2 x i64> [[TMP:%.*]], <2 x i64> undef, <2 x i32> zeroinitializer
165; CHECK-SSE2-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
166; CHECK-SSE2:       if_true:
167; CHECK-SSE2-NEXT:    ret <2 x i64> [[MASK]]
168; CHECK-SSE2:       if_false:
169; CHECK-SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP]], <2 x i64> undef, <2 x i32> zeroinitializer
170; CHECK-SSE2-NEXT:    [[RES:%.*]] = lshr <2 x i64> [[LHS:%.*]], [[TMP1]]
171; CHECK-SSE2-NEXT:    ret <2 x i64> [[RES]]
172;
173; CHECK-XOP-LABEL: @test_64bit(
174; CHECK-XOP-NEXT:    [[MASK:%.*]] = shufflevector <2 x i64> [[TMP:%.*]], <2 x i64> undef, <2 x i32> zeroinitializer
175; CHECK-XOP-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
176; CHECK-XOP:       if_true:
177; CHECK-XOP-NEXT:    ret <2 x i64> [[MASK]]
178; CHECK-XOP:       if_false:
179; CHECK-XOP-NEXT:    [[RES:%.*]] = lshr <2 x i64> [[LHS:%.*]], [[MASK]]
180; CHECK-XOP-NEXT:    ret <2 x i64> [[RES]]
181;
182; CHECK-AVX-LABEL: @test_64bit(
183; CHECK-AVX-NEXT:    [[MASK:%.*]] = shufflevector <2 x i64> [[TMP:%.*]], <2 x i64> undef, <2 x i32> zeroinitializer
184; CHECK-AVX-NEXT:    br i1 [[TST:%.*]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
185; CHECK-AVX:       if_true:
186; CHECK-AVX-NEXT:    ret <2 x i64> [[MASK]]
187; CHECK-AVX:       if_false:
188; CHECK-AVX-NEXT:    [[RES:%.*]] = lshr <2 x i64> [[LHS:%.*]], [[MASK]]
189; CHECK-AVX-NEXT:    ret <2 x i64> [[RES]]
190;
191  %mask = shufflevector <2 x i64> %tmp, <2 x i64> undef, <2 x i32> zeroinitializer
192  br i1 %tst, label %if_true, label %if_false
193
194if_true:
195  ret <2 x i64> %mask
196
197if_false:
198  %res = lshr <2 x i64> %lhs, %mask
199  ret <2 x i64> %res
200}
201
; funnel_splatvar: loop test for a funnel shift whose shift amount is a splat
; of the scalar %rot. The entry block builds the splat with an insertelement
; into lane 0 followed by a zero-index shufflevector. The vector.body loop
; loads <8 x i32> from %arr at %index, calls llvm.fshl.v8i32 with the loaded
; vector as both data operands (a rotate-left by the splat amount), and stores
; the result back to the same address. %index advances by 8 per iteration and
; the loop exits once it equals 65536.
; Expected output per check prefix:
;   - CHECK-SSE2: the insertelement stays in entry, but the splat shufflevector
;     is moved into vector.body immediately before the fshl call. No shuffle
;     remains in entry because the loop was its only user.
;   - CHECK-XOP and CHECK-AVX: the IR is unchanged and both the insertelement
;     and the shufflevector remain in entry.
202define void @funnel_splatvar(ptr nocapture %arr, i32 %rot) {
203; CHECK-SSE2-LABEL: @funnel_splatvar(
204; CHECK-SSE2-NEXT:  entry:
205; CHECK-SSE2-NEXT:    [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> undef, i32 [[ROT:%.*]], i32 0
206; CHECK-SSE2-NEXT:    br label [[VECTOR_BODY:%.*]]
207; CHECK-SSE2:       vector.body:
208; CHECK-SSE2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
209; CHECK-SSE2-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[INDEX]]
210; CHECK-SSE2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[T0]], align 4
211; CHECK-SSE2-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
212; CHECK-SSE2-NEXT:    [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[TMP0]])
213; CHECK-SSE2-NEXT:    store <8 x i32> [[T2]], ptr [[T0]], align 4
214; CHECK-SSE2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
215; CHECK-SSE2-NEXT:    [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
216; CHECK-SSE2-NEXT:    br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
217; CHECK-SSE2:       for.cond.cleanup:
218; CHECK-SSE2-NEXT:    ret void
219;
220; CHECK-XOP-LABEL: @funnel_splatvar(
221; CHECK-XOP-NEXT:  entry:
222; CHECK-XOP-NEXT:    [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> undef, i32 [[ROT:%.*]], i32 0
223; CHECK-XOP-NEXT:    [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
224; CHECK-XOP-NEXT:    br label [[VECTOR_BODY:%.*]]
225; CHECK-XOP:       vector.body:
226; CHECK-XOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
227; CHECK-XOP-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[INDEX]]
228; CHECK-XOP-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[T0]], align 4
229; CHECK-XOP-NEXT:    [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[BROADCAST_SPLAT16]])
230; CHECK-XOP-NEXT:    store <8 x i32> [[T2]], ptr [[T0]], align 4
231; CHECK-XOP-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
232; CHECK-XOP-NEXT:    [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
233; CHECK-XOP-NEXT:    br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
234; CHECK-XOP:       for.cond.cleanup:
235; CHECK-XOP-NEXT:    ret void
236;
237; CHECK-AVX-LABEL: @funnel_splatvar(
238; CHECK-AVX-NEXT:  entry:
239; CHECK-AVX-NEXT:    [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> undef, i32 [[ROT:%.*]], i32 0
240; CHECK-AVX-NEXT:    [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
241; CHECK-AVX-NEXT:    br label [[VECTOR_BODY:%.*]]
242; CHECK-AVX:       vector.body:
243; CHECK-AVX-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
244; CHECK-AVX-NEXT:    [[T0:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[INDEX]]
245; CHECK-AVX-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[T0]], align 4
246; CHECK-AVX-NEXT:    [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[BROADCAST_SPLAT16]])
247; CHECK-AVX-NEXT:    store <8 x i32> [[T2]], ptr [[T0]], align 4
248; CHECK-AVX-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
249; CHECK-AVX-NEXT:    [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
250; CHECK-AVX-NEXT:    br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
251; CHECK-AVX:       for.cond.cleanup:
252; CHECK-AVX-NEXT:    ret void
253;
254entry:
255  %broadcast.splatinsert15 = insertelement <8 x i32> undef, i32 %rot, i32 0
256  %broadcast.splat16 = shufflevector <8 x i32> %broadcast.splatinsert15, <8 x i32> undef, <8 x i32> zeroinitializer
257  br label %vector.body
258
259vector.body:
260  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
261  %t0 = getelementptr inbounds i32, ptr %arr, i64 %index
262  %wide.load = load <8 x i32>, ptr %t0, align 4
263  %t2 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %wide.load, <8 x i32> %wide.load, <8 x i32> %broadcast.splat16)
264  store <8 x i32> %t2, ptr %t0, align 4
265  %index.next = add i64 %index, 8
266  %t3 = icmp eq i64 %index.next, 65536
267  br i1 %t3, label %for.cond.cleanup, label %vector.body
268
269for.cond.cleanup:
270  ret void
271}
272
; Declaration of the <8 x i32> funnel-shift-left intrinsic called by
; @funnel_splatvar; its third operand is the per-lane shift amount.
273declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>)
274