xref: /llvm-project/llvm/test/Transforms/CodeGenPrepare/X86/vec-shift.ll (revision f1ec0d12bb0843f0deab83ef2b5cf1339cbc4f0b)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx -S < %s | FileCheck %s --check-prefixes=AVX1
3; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx2 -S < %s | FileCheck %s --check-prefixes=AVX2
4; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx512bw -S < %s | FileCheck %s --check-prefixes=AVX512BW
5; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx,+xop -S < %s | FileCheck %s --check-prefixes=XOP
6; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx2,+xop -S < %s | FileCheck %s --check-prefixes=XOP
7; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=x86_64-- -mattr=+avx -S -enable-debugify < %s 2>&1 | FileCheck %s -check-prefix=DEBUG
8
9define <4 x i32> @vector_variable_shift_right_v4i32(<4 x i1> %cond, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
10; AVX1-LABEL: @vector_variable_shift_right_v4i32(
11; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
12; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
13; AVX1-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
14; AVX1-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SPLAT1]]
15; AVX1-NEXT:    [[TMP2:%.*]] = lshr <4 x i32> [[Z]], [[SPLAT2]]
16; AVX1-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[COND]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]
17; AVX1-NEXT:    ret <4 x i32> [[TMP3]]
18;
19; AVX2-LABEL: @vector_variable_shift_right_v4i32(
20; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
21; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
22; AVX2-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
23; AVX2-NEXT:    [[SH:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SEL]]
24; AVX2-NEXT:    ret <4 x i32> [[SH]]
25;
26; AVX512BW-LABEL: @vector_variable_shift_right_v4i32(
27; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
28; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
29; AVX512BW-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
30; AVX512BW-NEXT:    [[SH:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SEL]]
31; AVX512BW-NEXT:    ret <4 x i32> [[SH]]
32;
33; XOP-LABEL: @vector_variable_shift_right_v4i32(
34; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
35; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
36; XOP-NEXT:    [[SEL:%.*]] = select <4 x i1> [[COND:%.*]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
37; XOP-NEXT:    [[SH:%.*]] = lshr <4 x i32> [[Z:%.*]], [[SEL]]
38; XOP-NEXT:    ret <4 x i32> [[SH]]
39;
40  %splat1 = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
41  %splat2 = shufflevector <4 x i32> %y, <4 x i32> undef, <4 x i32> zeroinitializer
42  %sel = select <4 x i1> %cond, <4 x i32> %splat1, <4 x i32> %splat2
43  %sh = lshr <4 x i32> %z, %sel
44  ret <4 x i32> %sh
45}
46
47define <16 x i16> @vector_variable_shift_right_v16i16(<16 x i1> %cond, <16 x i16> %x, <16 x i16> %y, <16 x i16> %z) {
48; AVX1-LABEL: @vector_variable_shift_right_v16i16(
49; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
50; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
51; AVX1-NEXT:    [[SEL:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i16> [[SPLAT1]], <16 x i16> [[SPLAT2]]
52; AVX1-NEXT:    [[TMP1:%.*]] = lshr <16 x i16> [[Z:%.*]], [[SPLAT1]]
53; AVX1-NEXT:    [[TMP2:%.*]] = lshr <16 x i16> [[Z]], [[SPLAT2]]
54; AVX1-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[COND]], <16 x i16> [[TMP1]], <16 x i16> [[TMP2]]
55; AVX1-NEXT:    ret <16 x i16> [[TMP3]]
56;
57; AVX2-LABEL: @vector_variable_shift_right_v16i16(
58; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
59; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
60; AVX2-NEXT:    [[SEL:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i16> [[SPLAT1]], <16 x i16> [[SPLAT2]]
61; AVX2-NEXT:    [[TMP1:%.*]] = lshr <16 x i16> [[Z:%.*]], [[SPLAT1]]
62; AVX2-NEXT:    [[TMP2:%.*]] = lshr <16 x i16> [[Z]], [[SPLAT2]]
63; AVX2-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[COND]], <16 x i16> [[TMP1]], <16 x i16> [[TMP2]]
64; AVX2-NEXT:    ret <16 x i16> [[TMP3]]
65;
66; AVX512BW-LABEL: @vector_variable_shift_right_v16i16(
67; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
68; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
69; AVX512BW-NEXT:    [[SEL:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i16> [[SPLAT1]], <16 x i16> [[SPLAT2]]
70; AVX512BW-NEXT:    [[SH:%.*]] = lshr <16 x i16> [[Z:%.*]], [[SEL]]
71; AVX512BW-NEXT:    ret <16 x i16> [[SH]]
72;
73; XOP-LABEL: @vector_variable_shift_right_v16i16(
74; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <16 x i16> [[X:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
75; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <16 x i16> [[Y:%.*]], <16 x i16> undef, <16 x i32> zeroinitializer
76; XOP-NEXT:    [[SEL:%.*]] = select <16 x i1> [[COND:%.*]], <16 x i16> [[SPLAT1]], <16 x i16> [[SPLAT2]]
77; XOP-NEXT:    [[SH:%.*]] = lshr <16 x i16> [[Z:%.*]], [[SEL]]
78; XOP-NEXT:    ret <16 x i16> [[SH]]
79;
80  %splat1 = shufflevector <16 x i16> %x, <16 x i16> undef, <16 x i32> zeroinitializer
81  %splat2 = shufflevector <16 x i16> %y, <16 x i16> undef, <16 x i32> zeroinitializer
82  %sel = select <16 x i1> %cond, <16 x i16> %splat1, <16 x i16> %splat2
83  %sh = lshr <16 x i16> %z, %sel
84  ret <16 x i16> %sh
85}
86
87define <32 x i8> @vector_variable_shift_right_v32i8(<32 x i1> %cond, <32 x i8> %x, <32 x i8> %y, <32 x i8> %z) {
88; AVX1-LABEL: @vector_variable_shift_right_v32i8(
89; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <32 x i8> [[X:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
90; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <32 x i8> [[Y:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
91; AVX1-NEXT:    [[SEL:%.*]] = select <32 x i1> [[COND:%.*]], <32 x i8> [[SPLAT1]], <32 x i8> [[SPLAT2]]
92; AVX1-NEXT:    [[TMP1:%.*]] = lshr <32 x i8> [[Z:%.*]], [[SPLAT1]]
93; AVX1-NEXT:    [[TMP2:%.*]] = lshr <32 x i8> [[Z]], [[SPLAT2]]
94; AVX1-NEXT:    [[TMP3:%.*]] = select <32 x i1> [[COND]], <32 x i8> [[TMP1]], <32 x i8> [[TMP2]]
95; AVX1-NEXT:    ret <32 x i8> [[TMP3]]
96;
97; AVX2-LABEL: @vector_variable_shift_right_v32i8(
98; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <32 x i8> [[X:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
99; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <32 x i8> [[Y:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
100; AVX2-NEXT:    [[SEL:%.*]] = select <32 x i1> [[COND:%.*]], <32 x i8> [[SPLAT1]], <32 x i8> [[SPLAT2]]
101; AVX2-NEXT:    [[TMP1:%.*]] = lshr <32 x i8> [[Z:%.*]], [[SPLAT1]]
102; AVX2-NEXT:    [[TMP2:%.*]] = lshr <32 x i8> [[Z]], [[SPLAT2]]
103; AVX2-NEXT:    [[TMP3:%.*]] = select <32 x i1> [[COND]], <32 x i8> [[TMP1]], <32 x i8> [[TMP2]]
104; AVX2-NEXT:    ret <32 x i8> [[TMP3]]
105;
106; AVX512BW-LABEL: @vector_variable_shift_right_v32i8(
107; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <32 x i8> [[X:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
108; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <32 x i8> [[Y:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
109; AVX512BW-NEXT:    [[SEL:%.*]] = select <32 x i1> [[COND:%.*]], <32 x i8> [[SPLAT1]], <32 x i8> [[SPLAT2]]
110; AVX512BW-NEXT:    [[TMP1:%.*]] = lshr <32 x i8> [[Z:%.*]], [[SPLAT1]]
111; AVX512BW-NEXT:    [[TMP2:%.*]] = lshr <32 x i8> [[Z]], [[SPLAT2]]
112; AVX512BW-NEXT:    [[TMP3:%.*]] = select <32 x i1> [[COND]], <32 x i8> [[TMP1]], <32 x i8> [[TMP2]]
113; AVX512BW-NEXT:    ret <32 x i8> [[TMP3]]
114;
115; XOP-LABEL: @vector_variable_shift_right_v32i8(
116; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <32 x i8> [[X:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
117; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <32 x i8> [[Y:%.*]], <32 x i8> undef, <32 x i32> zeroinitializer
118; XOP-NEXT:    [[SEL:%.*]] = select <32 x i1> [[COND:%.*]], <32 x i8> [[SPLAT1]], <32 x i8> [[SPLAT2]]
119; XOP-NEXT:    [[SH:%.*]] = lshr <32 x i8> [[Z:%.*]], [[SEL]]
120; XOP-NEXT:    ret <32 x i8> [[SH]]
121;
122  %splat1 = shufflevector <32 x i8> %x, <32 x i8> undef, <32 x i32> zeroinitializer
123  %splat2 = shufflevector <32 x i8> %y, <32 x i8> undef, <32 x i32> zeroinitializer
124  %sel = select <32 x i1> %cond, <32 x i8> %splat1, <32 x i8> %splat2
125  %sh = lshr <32 x i8> %z, %sel
126  ret <32 x i8> %sh
127}
128
129; PR37428 - https://bugs.llvm.org/show_bug.cgi?id=37428
130
131define void @vector_variable_shift_left_loop(ptr nocapture %arr, ptr nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1, i32 %x) {
132; AVX1-LABEL: @vector_variable_shift_left_loop(
133; AVX1-NEXT:  entry:
134; AVX1-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0
135; AVX1-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[COUNT]] to i64
136; AVX1-NEXT:    br i1 [[CMP16]], label [[VECTOR_PH:%.*]], label [[EXIT:%.*]]
137; AVX1:       vector.ph:
138; AVX1-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
139; AVX1-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> undef, i32 [[AMT0:%.*]], i32 0
140; AVX1-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
141; AVX1-NEXT:    [[SPLATINSERT20:%.*]] = insertelement <4 x i32> undef, i32 [[AMT1:%.*]], i32 0
142; AVX1-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
143; AVX1-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
144; AVX1-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer
145; AVX1-NEXT:    br label [[VECTOR_BODY:%.*]]
146; AVX1:       vector.body:
147; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
148; AVX1-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[CONTROL:%.*]], i64 [[INDEX]]
149; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
150; AVX1-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
151; AVX1-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
152; AVX1-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
153; AVX1-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP4]]
154; AVX1-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
155; AVX1-NEXT:    [[TMP7:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP6]]
156; AVX1-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP5]], <4 x i32> [[TMP7]]
157; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[INDEX]]
158; AVX1-NEXT:    store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
159; AVX1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
160; AVX1-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
161; AVX1-NEXT:    br i1 [[TMP11]], label [[EXIT]], label [[VECTOR_BODY]]
162; AVX1:       exit:
163; AVX1-NEXT:    ret void
164;
165; AVX2-LABEL: @vector_variable_shift_left_loop(
166; AVX2-NEXT:  entry:
167; AVX2-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0
168; AVX2-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[COUNT]] to i64
169; AVX2-NEXT:    br i1 [[CMP16]], label [[VECTOR_PH:%.*]], label [[EXIT:%.*]]
170; AVX2:       vector.ph:
171; AVX2-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
172; AVX2-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> undef, i32 [[AMT0:%.*]], i32 0
173; AVX2-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
174; AVX2-NEXT:    [[SPLATINSERT20:%.*]] = insertelement <4 x i32> undef, i32 [[AMT1:%.*]], i32 0
175; AVX2-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
176; AVX2-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
177; AVX2-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer
178; AVX2-NEXT:    br label [[VECTOR_BODY:%.*]]
179; AVX2:       vector.body:
180; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
181; AVX2-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[CONTROL:%.*]], i64 [[INDEX]]
182; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
183; AVX2-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
184; AVX2-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
185; AVX2-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP3]]
186; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[INDEX]]
187; AVX2-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP5]], align 4
188; AVX2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
189; AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
190; AVX2-NEXT:    br i1 [[TMP7]], label [[EXIT]], label [[VECTOR_BODY]]
191; AVX2:       exit:
192; AVX2-NEXT:    ret void
193;
194; AVX512BW-LABEL: @vector_variable_shift_left_loop(
195; AVX512BW-NEXT:  entry:
196; AVX512BW-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0
197; AVX512BW-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[COUNT]] to i64
198; AVX512BW-NEXT:    br i1 [[CMP16]], label [[VECTOR_PH:%.*]], label [[EXIT:%.*]]
199; AVX512BW:       vector.ph:
200; AVX512BW-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
201; AVX512BW-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> undef, i32 [[AMT0:%.*]], i32 0
202; AVX512BW-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
203; AVX512BW-NEXT:    [[SPLATINSERT20:%.*]] = insertelement <4 x i32> undef, i32 [[AMT1:%.*]], i32 0
204; AVX512BW-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
205; AVX512BW-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
206; AVX512BW-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer
207; AVX512BW-NEXT:    br label [[VECTOR_BODY:%.*]]
208; AVX512BW:       vector.body:
209; AVX512BW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
210; AVX512BW-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[CONTROL:%.*]], i64 [[INDEX]]
211; AVX512BW-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
212; AVX512BW-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
213; AVX512BW-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
214; AVX512BW-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP3]]
215; AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[INDEX]]
216; AVX512BW-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP5]], align 4
217; AVX512BW-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
218; AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
219; AVX512BW-NEXT:    br i1 [[TMP7]], label [[EXIT]], label [[VECTOR_BODY]]
220; AVX512BW:       exit:
221; AVX512BW-NEXT:    ret void
222;
223; XOP-LABEL: @vector_variable_shift_left_loop(
224; XOP-NEXT:  entry:
225; XOP-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0
226; XOP-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[COUNT]] to i64
227; XOP-NEXT:    br i1 [[CMP16]], label [[VECTOR_PH:%.*]], label [[EXIT:%.*]]
228; XOP:       vector.ph:
229; XOP-NEXT:    [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
230; XOP-NEXT:    [[SPLATINSERT18:%.*]] = insertelement <4 x i32> undef, i32 [[AMT0:%.*]], i32 0
231; XOP-NEXT:    [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLATINSERT18]], <4 x i32> undef, <4 x i32> zeroinitializer
232; XOP-NEXT:    [[SPLATINSERT20:%.*]] = insertelement <4 x i32> undef, i32 [[AMT1:%.*]], i32 0
233; XOP-NEXT:    [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLATINSERT20]], <4 x i32> undef, <4 x i32> zeroinitializer
234; XOP-NEXT:    [[SPLATINSERT22:%.*]] = insertelement <4 x i32> undef, i32 [[X:%.*]], i32 0
235; XOP-NEXT:    [[SPLAT3:%.*]] = shufflevector <4 x i32> [[SPLATINSERT22]], <4 x i32> undef, <4 x i32> zeroinitializer
236; XOP-NEXT:    br label [[VECTOR_BODY:%.*]]
237; XOP:       vector.body:
238; XOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
239; XOP-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[CONTROL:%.*]], i64 [[INDEX]]
240; XOP-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1
241; XOP-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
242; XOP-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]]
243; XOP-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[SPLAT3]], [[TMP3]]
244; XOP-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[INDEX]]
245; XOP-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP5]], align 4
246; XOP-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
247; XOP-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
248; XOP-NEXT:    br i1 [[TMP7]], label [[EXIT]], label [[VECTOR_BODY]]
249; XOP:       exit:
250; XOP-NEXT:    ret void
251;
252entry:
253  %cmp16 = icmp sgt i32 %count, 0
254  %wide.trip.count = zext i32 %count to i64
255  br i1 %cmp16, label %vector.ph, label %exit
256
257vector.ph:
258  %n.vec = and i64 %wide.trip.count, 4294967292
259  %splatinsert18 = insertelement <4 x i32> undef, i32 %amt0, i32 0
260  %splat1 = shufflevector <4 x i32> %splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer
261  %splatinsert20 = insertelement <4 x i32> undef, i32 %amt1, i32 0
262  %splat2 = shufflevector <4 x i32> %splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
263  %splatinsert22 = insertelement <4 x i32> undef, i32 %x, i32 0
264  %splat3 = shufflevector <4 x i32> %splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
265  br label %vector.body
266
267vector.body:
268  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
269  %0 = getelementptr inbounds i8, ptr %control, i64 %index
270  %wide.load = load <4 x i8>, ptr %0, align 1
271  %1 = icmp eq <4 x i8> %wide.load, zeroinitializer
272  %2 = select <4 x i1> %1, <4 x i32> %splat1, <4 x i32> %splat2
273  %3 = shl <4 x i32> %splat3, %2
274  %4 = getelementptr inbounds i32, ptr %arr, i64 %index
275  store <4 x i32> %3, ptr %4, align 4
276  %index.next = add i64 %index, 4
277  %5 = icmp eq i64 %index.next, %n.vec
278  br i1 %5, label %exit, label %vector.body
279
280exit:
281  ret void
282}
283
284; PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426
285; If we don't have real vector shift instructions (AVX1), convert the funnel
286; shift into 2 funnel shifts and sink the splat shuffles into the loop.
287
288define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) {
289; AVX1-LABEL: @fancierRotate2(
290; AVX1-NEXT:  entry:
291; AVX1-NEXT:    [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
292; AVX1-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
293; AVX1-NEXT:    [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
294; AVX1-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
295; AVX1-NEXT:    br label [[LOOP:%.*]]
296; AVX1:       loop:
297; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
298; AVX1-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, ptr [[CONTROL:%.*]], i64 [[INDEX]]
299; AVX1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[T0]], align 1
300; AVX1-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
301; AVX1-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
302; AVX1-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[INDEX]]
303; AVX1-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, ptr [[T4]], align 4
304; AVX1-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
305; AVX1-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[TMP0]])
306; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
307; AVX1-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[TMP2]])
308; AVX1-NEXT:    [[TMP4:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[TMP1]], <8 x i32> [[TMP3]]
309; AVX1-NEXT:    store <8 x i32> [[TMP4]], ptr [[T4]], align 4
310; AVX1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
311; AVX1-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
312; AVX1-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
313; AVX1:       exit:
314; AVX1-NEXT:    ret void
315;
316; AVX2-LABEL: @fancierRotate2(
317; AVX2-NEXT:  entry:
318; AVX2-NEXT:    [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
319; AVX2-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
320; AVX2-NEXT:    [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
321; AVX2-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
322; AVX2-NEXT:    br label [[LOOP:%.*]]
323; AVX2:       loop:
324; AVX2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
325; AVX2-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, ptr [[CONTROL:%.*]], i64 [[INDEX]]
326; AVX2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[T0]], align 1
327; AVX2-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
328; AVX2-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
329; AVX2-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[INDEX]]
330; AVX2-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, ptr [[T4]], align 4
331; AVX2-NEXT:    [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
332; AVX2-NEXT:    store <8 x i32> [[ROT]], ptr [[T4]], align 4
333; AVX2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
334; AVX2-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
335; AVX2-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
336; AVX2:       exit:
337; AVX2-NEXT:    ret void
338;
339; AVX512BW-LABEL: @fancierRotate2(
340; AVX512BW-NEXT:  entry:
341; AVX512BW-NEXT:    [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
342; AVX512BW-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
343; AVX512BW-NEXT:    [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
344; AVX512BW-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
345; AVX512BW-NEXT:    br label [[LOOP:%.*]]
346; AVX512BW:       loop:
347; AVX512BW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
348; AVX512BW-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, ptr [[CONTROL:%.*]], i64 [[INDEX]]
349; AVX512BW-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[T0]], align 1
350; AVX512BW-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
351; AVX512BW-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
352; AVX512BW-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[INDEX]]
353; AVX512BW-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, ptr [[T4]], align 4
354; AVX512BW-NEXT:    [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
355; AVX512BW-NEXT:    store <8 x i32> [[ROT]], ptr [[T4]], align 4
356; AVX512BW-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
357; AVX512BW-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
358; AVX512BW-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
359; AVX512BW:       exit:
360; AVX512BW-NEXT:    ret void
361;
362; XOP-LABEL: @fancierRotate2(
363; XOP-NEXT:  entry:
364; XOP-NEXT:    [[I0:%.*]] = insertelement <8 x i32> undef, i32 [[ROT0:%.*]], i32 0
365; XOP-NEXT:    [[S0:%.*]] = shufflevector <8 x i32> [[I0]], <8 x i32> undef, <8 x i32> zeroinitializer
366; XOP-NEXT:    [[I1:%.*]] = insertelement <8 x i32> undef, i32 [[ROT1:%.*]], i32 0
367; XOP-NEXT:    [[S1:%.*]] = shufflevector <8 x i32> [[I1]], <8 x i32> undef, <8 x i32> zeroinitializer
368; XOP-NEXT:    br label [[LOOP:%.*]]
369; XOP:       loop:
370; XOP-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
371; XOP-NEXT:    [[T0:%.*]] = getelementptr inbounds i8, ptr [[CONTROL:%.*]], i64 [[INDEX]]
372; XOP-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[T0]], align 1
373; XOP-NEXT:    [[T2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer
374; XOP-NEXT:    [[SHAMT:%.*]] = select <8 x i1> [[T2]], <8 x i32> [[S0]], <8 x i32> [[S1]]
375; XOP-NEXT:    [[T4:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i64 [[INDEX]]
376; XOP-NEXT:    [[WIDE_LOAD21:%.*]] = load <8 x i32>, ptr [[T4]], align 4
377; XOP-NEXT:    [[ROT:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD21]], <8 x i32> [[WIDE_LOAD21]], <8 x i32> [[SHAMT]])
378; XOP-NEXT:    store <8 x i32> [[ROT]], ptr [[T4]], align 4
379; XOP-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
380; XOP-NEXT:    [[T7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
381; XOP-NEXT:    br i1 [[T7]], label [[EXIT:%.*]], label [[LOOP]]
382; XOP:       exit:
383; XOP-NEXT:    ret void
384;
385entry:
386  %i0 = insertelement <8 x i32> undef, i32 %rot0, i32 0
387  %s0 = shufflevector <8 x i32> %i0, <8 x i32> undef, <8 x i32> zeroinitializer
388  %i1 = insertelement <8 x i32> undef, i32 %rot1, i32 0
389  %s1 = shufflevector <8 x i32> %i1, <8 x i32> undef, <8 x i32> zeroinitializer
390  br label %loop
391
392loop:
393  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
394  %t0 = getelementptr inbounds i8, ptr %control, i64 %index
395  %wide.load = load <8 x i8>, ptr %t0, align 1
396  %t2 = icmp eq <8 x i8> %wide.load, zeroinitializer
397  %shamt = select <8 x i1> %t2, <8 x i32> %s0, <8 x i32> %s1
398  %t4 = getelementptr inbounds i32, ptr %arr, i64 %index
399  %wide.load21 = load <8 x i32>, ptr %t4, align 4
400  %rot = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %wide.load21, <8 x i32> %wide.load21, <8 x i32> %shamt)
401  store <8 x i32> %rot, ptr %t4, align 4
402  %index.next = add i64 %index, 8
403  %t7 = icmp eq i64 %index.next, 1024
404  br i1 %t7, label %exit, label %loop
405
406exit:
407  ret void
408}
409
410declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>) #1
411
412; Check that every instruction inserted by -passes='require<profile-summary>,function(codegenprepare)' has a debug location.
413; DEBUG: CheckModuleDebugify: PASS
414