xref: /llvm-project/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll (revision 7f3428d3ed71d87a2088b77b6cab9f3d86544234)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -passes=loop-vectorize,simplifycfg -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
3; RUN: opt < %s -passes=loop-vectorize,simplifycfg -mcpu=knl -force-vector-width=2 -force-target-max-vector-interleave=1 -S | FileCheck %s -check-prefix=FVW2
4
5; With a force-vector-width, it is sometimes more profitable to generate
6; scalarized and predicated stores instead of masked scatter. Disable
7; interleaving to simplify CHECKs in that scenario.
8
9target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
10target triple = "x86_64-pc_linux"
11
12; The source code:
13;
14;void foo1(ptr __restrict__ in, ptr __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
15;
16;  for (int i=0; i < SIZE; ++i) {
17;    if (trigger[i] > 0) {
18;      out[i] = in[index[i]] + (float) 0.5;
19;    }
20;  }
21;}
22
23; Function Attrs: nounwind uwtable
24define void @foo1(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr noalias %index) {
25; AVX512-LABEL: @foo1(
26; AVX512-NEXT:  entry:
27; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
28; AVX512:       vector.body:
29; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
30; AVX512-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX1]], 0
31; AVX512-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
32; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
33; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP2]], align 4
34; AVX512-NEXT:    [[TMP3:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer
35; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[INDEX:%.*]], i64 [[TMP0]]
36; AVX512-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
37; AVX512-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP3]], <16 x i32> poison)
38; AVX512-NEXT:    [[TMP6:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64>
39; AVX512-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <16 x i64> [[TMP6]]
40; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP7]], i32 4, <16 x i1> [[TMP3]], <16 x float> poison)
41; AVX512-NEXT:    [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01)
42; AVX512-NEXT:    [[TMP9:%.*]] = getelementptr float, ptr [[OUT:%.*]], i64 [[TMP0]]
43; AVX512-NEXT:    [[TMP10:%.*]] = getelementptr float, ptr [[TMP9]], i32 0
44; AVX512-NEXT:    call void @llvm.masked.store.v16f32.p0(<16 x float> [[TMP8]], ptr [[TMP10]], i32 4, <16 x i1> [[TMP3]])
45; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
46; AVX512-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
47; AVX512-NEXT:    br i1 [[TMP11]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
48; AVX512:       for.end:
49; AVX512-NEXT:    ret void
50;
51; FVW2-LABEL: @foo1(
52; FVW2-NEXT:  entry:
53; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
54; FVW2:       vector.body:
55; FVW2-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
56; FVW2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX1]], 0
57; FVW2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
58; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
59; FVW2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
60; FVW2-NEXT:    [[TMP3:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
61; FVW2-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[INDEX:%.*]], i64 [[TMP0]]
62; FVW2-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
63; FVW2-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr [[TMP5]], i32 4, <2 x i1> [[TMP3]], <2 x i32> poison)
64; FVW2-NEXT:    [[TMP6:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
65; FVW2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <2 x i64> [[TMP6]]
66; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP7]], i32 4, <2 x i1> [[TMP3]], <2 x float> poison)
67; FVW2-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01)
68; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr float, ptr [[OUT:%.*]], i64 [[TMP0]]
69; FVW2-NEXT:    [[TMP10:%.*]] = getelementptr float, ptr [[TMP9]], i32 0
70; FVW2-NEXT:    call void @llvm.masked.store.v2f32.p0(<2 x float> [[TMP8]], ptr [[TMP10]], i32 4, <2 x i1> [[TMP3]])
71; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
72; FVW2-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
73; FVW2-NEXT:    br i1 [[TMP11]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
74; FVW2:       for.end:
75; FVW2-NEXT:    ret void
76;
77entry:
78  br label %for.body
79
80for.body:
81  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
82  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
83  %0 = load i32, ptr %arrayidx, align 4
84  %cmp1 = icmp sgt i32 %0, 0
85  br i1 %cmp1, label %if.then, label %for.inc
86
87if.then:
88  %arrayidx3 = getelementptr inbounds i32, ptr %index, i64 %indvars.iv
89  %1 = load i32, ptr %arrayidx3, align 4
90  %idxprom4 = sext i32 %1 to i64
91  %arrayidx5 = getelementptr inbounds float, ptr %in, i64 %idxprom4
92  %2 = load float, ptr %arrayidx5, align 4
93  %add = fadd float %2, 5.000000e-01
94  %arrayidx7 = getelementptr inbounds float, ptr %out, i64 %indvars.iv
95  store float %add, ptr %arrayidx7, align 4
96  br label %for.inc
97
98for.inc:
99  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
100  %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
101  br i1 %exitcond.not, label %for.end, label %for.body
102
103for.end:
104  ret void
105}
106
107; The source code
108;void foo2 (In * __restrict__ in, ptr __restrict__ out, int * __restrict__ trigger) {
109;
110;  for (int i=0; i<SIZE; i += 16) {
111;    if (trigger[i] > 0) {
112;      out[i] = in[i].b + (float) 0.5;
113;    }
114;  }
115;}
116
117%struct.In = type { float, float }
118
119define void @foo2(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr noalias %index) #0 {
120; AVX512-LABEL: @foo2(
121; AVX512-NEXT:  entry:
122; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
123; AVX512:       vector.body:
124; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
125; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
126; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
127; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> splat (i1 true), <16 x i32> poison)
128; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
129; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
130; AVX512-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison)
131; AVX512-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], splat (float 5.000000e-01)
132; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <16 x i64> [[VEC_IND]]
133; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
134; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
135; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256)
136; AVX512-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
137; AVX512-NEXT:    br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
138; AVX512:       for.end:
139; AVX512-NEXT:    ret void
140;
141; FVW2-LABEL: @foo2(
142; FVW2-NEXT:  entry:
143; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
144; FVW2:       vector.body:
145; FVW2-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
146; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
147; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
148; FVW2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
149; FVW2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
150; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
151; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
152; FVW2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
153; FVW2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
154; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
155; FVW2-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
156; FVW2-NEXT:    [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
157; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
158; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison)
159; FVW2-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01)
160; FVW2-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
161; FVW2-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
162; FVW2:       pred.store.if:
163; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]]
164; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
165; FVW2-NEXT:    store float [[TMP13]], ptr [[TMP12]], align 4
166; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
167; FVW2:       pred.store.continue:
168; FVW2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
169; FVW2-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
170; FVW2:       pred.store.if2:
171; FVW2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP1]]
172; FVW2-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
173; FVW2-NEXT:    store float [[TMP16]], ptr [[TMP15]], align 4
174; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE3]]
175; FVW2:       pred.store.continue3:
176; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
177; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 32)
178; FVW2-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
179; FVW2-NEXT:    br i1 [[TMP17]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
180; FVW2:       for.end:
181; FVW2-NEXT:    ret void
182;
183entry:
184  br label %for.body
185
186for.body:
187  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
188  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
189  %0 = load i32, ptr %arrayidx, align 4
190  %cmp1 = icmp sgt i32 %0, 0
191  br i1 %cmp1, label %if.then, label %for.inc
192
193if.then:
194  %b = getelementptr inbounds %struct.In, ptr %in, i64 %indvars.iv, i32 1
195  %1 = load float, ptr %b, align 4
196  %add = fadd float %1, 5.000000e-01
197  %arrayidx5 = getelementptr inbounds float, ptr %out, i64 %indvars.iv
198  store float %add, ptr %arrayidx5, align 4
199  br label %for.inc
200
201for.inc:
202  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
203  %cmp = icmp ult i64 %indvars.iv, 4080
204  br i1 %cmp, label %for.body, label %for.end
205
206for.end:
207  ret void
208}
209
210; The source code
211;struct Out {
212;  float a;
213;  float b;
214;};
215;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
216;
217;  for (int i=0; i<SIZE; i += 16) {
218;    if (trigger[i] > 0) {
219;      out[i].b = in[i].b + (float) 0.5;
220;    }
221;  }
222;}
223
224%struct.Out = type { float, float }
225
226define void @foo3(ptr noalias %in, ptr noalias %out, ptr noalias %trigger) {
227; AVX512-LABEL: @foo3(
228; AVX512-NEXT:  entry:
229; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
230; AVX512:       vector.body:
231; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
232; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
233; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
234; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> splat (i1 true), <16 x i32> poison)
235; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
236; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
237; AVX512-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison)
238; AVX512-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER1]], splat (float 5.000000e-01)
239; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], ptr [[OUT:%.*]], <16 x i64> [[VEC_IND]], i32 1
240; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
241; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
242; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256)
243; AVX512-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
244; AVX512-NEXT:    br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
245; AVX512:       for.end:
246; AVX512-NEXT:    ret void
247;
248; FVW2-LABEL: @foo3(
249; FVW2-NEXT:  entry:
250; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
251; FVW2:       vector.body:
252; FVW2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
253; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
254; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
255; FVW2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
256; FVW2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
257; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
258; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
259; FVW2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
260; FVW2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
261; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
262; FVW2-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
263; FVW2-NEXT:    [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
264; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
265; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison)
266; FVW2-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01)
267; FVW2-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
268; FVW2-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
269; FVW2:       pred.store.if:
270; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], ptr [[OUT:%.*]], i64 [[TMP0]], i32 1
271; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
272; FVW2-NEXT:    store float [[TMP13]], ptr [[TMP12]], align 4
273; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
274; FVW2:       pred.store.continue:
275; FVW2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
276; FVW2-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
277; FVW2:       pred.store.if1:
278; FVW2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_OUT]], ptr [[OUT]], i64 [[TMP1]], i32 1
279; FVW2-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
280; FVW2-NEXT:    store float [[TMP16]], ptr [[TMP15]], align 4
281; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE2]]
282; FVW2:       pred.store.continue2:
283; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
284; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 32)
285; FVW2-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
286; FVW2-NEXT:    br i1 [[TMP17]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
287; FVW2:       for.end:
288; FVW2-NEXT:    ret void
289;
290entry:
291  br label %for.body
292
293for.body:
294  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
295  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
296  %0 = load i32, ptr %arrayidx, align 4
297  %cmp1 = icmp sgt i32 %0, 0
298  br i1 %cmp1, label %if.then, label %for.inc
299
300if.then:
301  %b = getelementptr inbounds %struct.In, ptr %in, i64 %indvars.iv, i32 1
302  %1 = load float, ptr %b, align 4
303  %add = fadd float %1, 5.000000e-01
304  %b6 = getelementptr inbounds %struct.Out, ptr %out, i64 %indvars.iv, i32 1
305  store float %add, ptr %b6, align 4
306  br label %for.inc
307
308for.inc:
309  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
310  %cmp = icmp ult i64 %indvars.iv, 4080
311  br i1 %cmp, label %for.body, label %for.end
312
313for.end:
314  ret void
315}
316declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
317
318; The same as @foo2 but scatter/gather argument is a vecotr of ptrs with addresspace 1
319
320define void @foo2_addrspace(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out, ptr noalias %trigger, ptr noalias %index) #0 {
321; AVX512-LABEL: @foo2_addrspace(
322; AVX512-NEXT:  entry:
323; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
324; AVX512:       vector.body:
325; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
326; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
327; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
328; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> splat (i1 true), <16 x i32> poison)
329; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
330; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr addrspace(1) [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
331; AVX512-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1(<16 x ptr addrspace(1)> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison)
332; AVX512-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], splat (float 5.000000e-01)
333; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT:%.*]], <16 x i64> [[VEC_IND]]
334; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1(<16 x float> [[TMP3]], <16 x ptr addrspace(1)> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
335; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
336; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256)
337; AVX512-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
338; AVX512-NEXT:    br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
339; AVX512:       for.end:
340; AVX512-NEXT:    ret void
341;
342; FVW2-LABEL: @foo2_addrspace(
343; FVW2-NEXT:  entry:
344; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
345; FVW2:       vector.body:
346; FVW2-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
347; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
348; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
349; FVW2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
350; FVW2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
351; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
352; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
353; FVW2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
354; FVW2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
355; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
356; FVW2-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
357; FVW2-NEXT:    [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
358; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr addrspace(1) [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
359; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1(<2 x ptr addrspace(1)> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison)
360; FVW2-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01)
361; FVW2-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
362; FVW2-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
363; FVW2:       pred.store.if:
364; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT:%.*]], i64 [[TMP0]]
365; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
366; FVW2-NEXT:    store float [[TMP13]], ptr addrspace(1) [[TMP12]], align 4
367; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
368; FVW2:       pred.store.continue:
369; FVW2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
370; FVW2-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
371; FVW2:       pred.store.if2:
372; FVW2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
373; FVW2-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
374; FVW2-NEXT:    store float [[TMP16]], ptr addrspace(1) [[TMP15]], align 4
375; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE3]]
376; FVW2:       pred.store.continue3:
377; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
378; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 32)
379; FVW2-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
380; FVW2-NEXT:    br i1 [[TMP17]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
381; FVW2:       for.end:
382; FVW2-NEXT:    ret void
383;
384entry:
385  br label %for.body
386
387for.body:
388  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
389  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
390  %0 = load i32, ptr %arrayidx, align 4
391  %cmp1 = icmp sgt i32 %0, 0
392  br i1 %cmp1, label %if.then, label %for.inc
393
394if.then:
395  %b = getelementptr inbounds %struct.In, ptr addrspace(1) %in, i64 %indvars.iv, i32 1
396  %1 = load float, ptr addrspace(1) %b, align 4
397  %add = fadd float %1, 5.000000e-01
398  %arrayidx5 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %indvars.iv
399  store float %add, ptr addrspace(1) %arrayidx5, align 4
400  br label %for.inc
401
402for.inc:
403  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
404  %cmp = icmp ult i64 %indvars.iv, 4080
405  br i1 %cmp, label %for.body, label %for.end
406
407for.end:
408  ret void
409}
410
411; Same as foo2_addrspace but here only the input has the non-default address space.
412
413define void @foo2_addrspace2(ptr addrspace(1) noalias %in, ptr addrspace(0) noalias %out, ptr noalias %trigger, ptr noalias %index) {
414; AVX512-LABEL: @foo2_addrspace2(
415; AVX512-NEXT:  entry:
416; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
417; AVX512:       vector.body:
418; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
419; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
420; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
421; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> splat (i1 true), <16 x i32> poison)
422; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
423; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr addrspace(1) [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
424; AVX512-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1(<16 x ptr addrspace(1)> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison)
425; AVX512-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], splat (float 5.000000e-01)
426; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <16 x i64> [[VEC_IND]]
427; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[TMP3]], <16 x ptr> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
428; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
429; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256)
430; AVX512-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
431; AVX512-NEXT:    br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
432; AVX512:       for.end:
433; AVX512-NEXT:    ret void
434;
435; FVW2-LABEL: @foo2_addrspace2(
436; FVW2-NEXT:  entry:
437; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
438; FVW2:       vector.body:
439; FVW2-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
440; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
441; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
442; FVW2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
443; FVW2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
444; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
445; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
446; FVW2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
447; FVW2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
448; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
449; FVW2-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
450; FVW2-NEXT:    [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
451; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr addrspace(1) [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
452; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1(<2 x ptr addrspace(1)> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison)
453; FVW2-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01)
454; FVW2-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
455; FVW2-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
456; FVW2:       pred.store.if:
457; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]]
458; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
459; FVW2-NEXT:    store float [[TMP13]], ptr [[TMP12]], align 4
460; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
461; FVW2:       pred.store.continue:
462; FVW2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
463; FVW2-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
464; FVW2:       pred.store.if2:
465; FVW2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP1]]
466; FVW2-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
467; FVW2-NEXT:    store float [[TMP16]], ptr [[TMP15]], align 4
468; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE3]]
469; FVW2:       pred.store.continue3:
470; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
471; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 32)
472; FVW2-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
473; FVW2-NEXT:    br i1 [[TMP17]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
474; FVW2:       for.end:
475; FVW2-NEXT:    ret void
476;
477entry:
478  br label %for.body
479
480for.body:
481  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
482  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
483  %0 = load i32, ptr %arrayidx, align 4
484  %cmp1 = icmp sgt i32 %0, 0
485  br i1 %cmp1, label %if.then, label %for.inc
486
487if.then:
488  %b = getelementptr inbounds %struct.In, ptr addrspace(1) %in, i64 %indvars.iv, i32 1
489  %1 = load float, ptr addrspace(1) %b, align 4
490  %add = fadd float %1, 5.000000e-01
491  %arrayidx5 = getelementptr inbounds float, ptr %out, i64 %indvars.iv
492  store float %add, ptr %arrayidx5, align 4
493  br label %for.inc
494
495for.inc:
496  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
497  %cmp = icmp ult i64 %indvars.iv, 4080
498  br i1 %cmp, label %for.body, label %for.end
499
500for.end:
501  ret void
502}
503
504; Same as foo2_addrspace but here only the output has the non-default address space.
505
506define void @foo2_addrspace3(ptr addrspace(0) noalias %in, ptr addrspace(1) noalias %out, ptr noalias %trigger, ptr noalias %index) {
507; AVX512-LABEL: @foo2_addrspace3(
508; AVX512-NEXT:  entry:
509; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
510; AVX512:       vector.body:
511; AVX512-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
512; AVX512-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
513; AVX512-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
514; AVX512-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP0]], i32 4, <16 x i1> splat (i1 true), <16 x i32> poison)
515; AVX512-NEXT:    [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
516; AVX512-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
517; AVX512-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> poison)
518; AVX512-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], splat (float 5.000000e-01)
519; AVX512-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT:%.*]], <16 x i64> [[VEC_IND]]
520; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p1(<16 x float> [[TMP3]], <16 x ptr addrspace(1)> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
521; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
522; AVX512-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 256)
523; AVX512-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
524; AVX512-NEXT:    br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
525; AVX512:       for.end:
526; AVX512-NEXT:    ret void
527;
528; FVW2-LABEL: @foo2_addrspace3(
529; FVW2-NEXT:  entry:
530; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
531; FVW2:       vector.body:
532; FVW2-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
533; FVW2-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE3]] ]
534; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX1]], 16
535; FVW2-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
536; FVW2-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
537; FVW2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER:%.*]], i64 [[TMP0]]
538; FVW2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[TMP1]]
539; FVW2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4
540; FVW2-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4
541; FVW2-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
542; FVW2-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP5]], i32 1
543; FVW2-NEXT:    [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP7]], zeroinitializer
544; FVW2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], ptr [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
545; FVW2-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP9]], i32 4, <2 x i1> [[TMP8]], <2 x float> poison)
546; FVW2-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], splat (float 5.000000e-01)
547; FVW2-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
548; FVW2-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
549; FVW2:       pred.store.if:
550; FVW2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT:%.*]], i64 [[TMP0]]
551; FVW2-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0
552; FVW2-NEXT:    store float [[TMP13]], ptr addrspace(1) [[TMP12]], align 4
553; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE]]
554; FVW2:       pred.store.continue:
555; FVW2-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
556; FVW2-NEXT:    br i1 [[TMP14]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
557; FVW2:       pred.store.if2:
558; FVW2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[OUT]], i64 [[TMP1]]
559; FVW2-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP10]], i32 1
560; FVW2-NEXT:    store float [[TMP16]], ptr addrspace(1) [[TMP15]], align 4
561; FVW2-NEXT:    br label [[PRED_STORE_CONTINUE3]]
562; FVW2:       pred.store.continue3:
563; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
564; FVW2-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 32)
565; FVW2-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
566; FVW2-NEXT:    br i1 [[TMP17]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
567; FVW2:       for.end:
568; FVW2-NEXT:    ret void
569;
570entry:
571  br label %for.body
572
573for.body:
574  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
575  %arrayidx = getelementptr inbounds i32, ptr %trigger, i64 %indvars.iv
576  %0 = load i32, ptr %arrayidx, align 4
577  %cmp1 = icmp sgt i32 %0, 0
578  br i1 %cmp1, label %if.then, label %for.inc
579
580if.then:
581  %b = getelementptr inbounds %struct.In, ptr %in, i64 %indvars.iv, i32 1
582  %1 = load float, ptr %b, align 4
583  %add = fadd float %1, 5.000000e-01
584  %arrayidx5 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %indvars.iv
585  store float %add, ptr addrspace(1) %arrayidx5, align 4
586  br label %for.inc
587
588for.inc:
589  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 16
590  %cmp = icmp ult i64 %indvars.iv, 4080
591  br i1 %cmp, label %for.body, label %for.end
592
593for.end:
594  ret void
595}
596
597; Using gathers is not profitable for this function. PR48429.
598define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %ptr, ptr nocapture noalias %dest) {
599; AVX512-LABEL: @test_gather_not_profitable_pr48429(
600; AVX512-NEXT:  entry:
601; AVX512-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[D:%.*]] to i64
602; AVX512-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, ptr [[PTR:%.*]], i64 [[IDX_EXT]]
603; AVX512-NEXT:    [[CMP_NOT10:%.*]] = icmp eq i32 [[D]], 0
604; AVX512-NEXT:    br i1 [[CMP_NOT10]], label [[FOR_END:%.*]], label [[ITER_CHECK:%.*]]
605; AVX512:       iter.check:
606; AVX512-NEXT:    [[MUL:%.*]] = sub nsw i32 0, [[D]]
607; AVX512-NEXT:    [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
608; AVX512-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 2
609; AVX512-NEXT:    [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4
610; AVX512-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
611; AVX512-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
612; AVX512-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8
613; AVX512-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
614; AVX512:       vector.memcheck:
615; AVX512-NEXT:    [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2
616; AVX512-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4
617; AVX512-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP5]], 2
618; AVX512-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 6
619; AVX512-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 8
620; AVX512-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DEST:%.*]], i64 [[TMP8]]
621; AVX512-NEXT:    [[TMP9:%.*]] = shl nuw i64 [[TMP6]], 2
622; AVX512-NEXT:    [[TMP10:%.*]] = add i64 [[TMP9]], 4
623; AVX512-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP10]]
624; AVX512-NEXT:    [[TMP11:%.*]] = mul nsw i64 [[IDX_EXT]], -4
625; AVX512-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP11]]
626; AVX512-NEXT:    [[TMP12:%.*]] = sub i64 [[TMP10]], [[TMP4]]
627; AVX512-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP12]]
628; AVX512-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DEST]], [[SCEVGEP1]]
629; AVX512-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[PTR]], [[SCEVGEP]]
630; AVX512-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
631; AVX512-NEXT:    [[BOUND04:%.*]] = icmp ult ptr [[DEST]], [[SCEVGEP3]]
632; AVX512-NEXT:    [[BOUND15:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP]]
633; AVX512-NEXT:    [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]]
634; AVX512-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]]
635; AVX512-NEXT:    br i1 [[CONFLICT_RDX]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
636; AVX512:       vector.main.loop.iter.check:
637; AVX512-NEXT:    [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[TMP3]], 16
638; AVX512-NEXT:    br i1 [[MIN_ITERS_CHECK7]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
639; AVX512:       vector.ph:
640; AVX512-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
641; AVX512-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
642; AVX512-NEXT:    [[TMP13:%.*]] = mul i64 [[N_VEC]], 64
643; AVX512-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP13]]
644; AVX512-NEXT:    br label [[VECTOR_BODY:%.*]]
645; AVX512:       vector.body:
646; AVX512-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
647; AVX512-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
648; AVX512-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <16 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448, i64 512, i64 576, i64 640, i64 704, i64 768, i64 832, i64 896, i64 960>
649; AVX512-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
650; AVX512-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 0
651; AVX512-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP15]]
652; AVX512-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM]]
653; AVX512-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
654; AVX512-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x float>, ptr [[TMP18]], align 4, !alias.scope [[META8:![0-9]+]]
655; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD]], <16 x ptr> [[TMP14]], i32 4, <16 x i1> splat (i1 true)), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
656; AVX512-NEXT:    [[TMP19:%.*]] = getelementptr float, ptr [[TMP16]], i32 0
657; AVX512-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x float>, ptr [[TMP19]], align 4, !alias.scope [[META15:![0-9]+]]
658; AVX512-NEXT:    [[TMP20:%.*]] = getelementptr inbounds float, <16 x ptr> [[TMP14]], i64 1
659; AVX512-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> [[WIDE_LOAD8]], <16 x ptr> [[TMP20]], i32 4, <16 x i1> splat (i1 true)), !alias.scope [[META11]], !noalias [[META13]]
660; AVX512-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
661; AVX512-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 1024
662; AVX512-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
663; AVX512-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
664; AVX512:       middle.block:
665; AVX512-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
666; AVX512-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
667; AVX512:       vec.epilog.iter.check:
668; AVX512-NEXT:    [[TMP23:%.*]] = mul i64 [[N_VEC]], 4
669; AVX512-NEXT:    [[IND_END12:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP23]]
670; AVX512-NEXT:    [[TMP38:%.*]] = mul i64 [[N_VEC]], 64
671; AVX512-NEXT:    [[IND_END15:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP38]]
672; AVX512-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
673; AVX512-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
674; AVX512-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
675; AVX512:       vec.epilog.ph:
676; AVX512-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
677; AVX512-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[DEST]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
678; AVX512-NEXT:    [[N_MOD_VF9:%.*]] = urem i64 [[TMP3]], 8
679; AVX512-NEXT:    [[N_VEC10:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF9]]
680; AVX512-NEXT:    [[TMP24:%.*]] = mul i64 [[N_VEC10]], 4
681; AVX512-NEXT:    [[IND_END11:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP24]]
682; AVX512-NEXT:    [[TMP25:%.*]] = mul i64 [[N_VEC10]], 64
683; AVX512-NEXT:    [[IND_END14:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP25]]
684; AVX512-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
685; AVX512:       vec.epilog.vector.body:
686; AVX512-NEXT:    [[POINTER_PHI19:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[PTR_IND20:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
687; AVX512-NEXT:    [[INDEX18:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT24:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
688; AVX512-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[POINTER_PHI19]], <8 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448>
689; AVX512-NEXT:    [[OFFSET_IDX21:%.*]] = mul i64 [[INDEX18]], 4
690; AVX512-NEXT:    [[TMP27:%.*]] = add i64 [[OFFSET_IDX21]], 0
691; AVX512-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP27]]
692; AVX512-NEXT:    [[TMP29:%.*]] = getelementptr inbounds float, ptr [[TMP28]], i64 [[IDXPROM]]
693; AVX512-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP29]], i32 0
694; AVX512-NEXT:    [[WIDE_LOAD17:%.*]] = load <8 x float>, ptr [[TMP30]], align 4, !alias.scope [[META17:![0-9]+]]
695; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD17]], <8 x ptr> [[TMP26]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20:![0-9]+]], !noalias [[META22:![0-9]+]]
696; AVX512-NEXT:    [[TMP31:%.*]] = getelementptr float, ptr [[TMP28]], i32 0
697; AVX512-NEXT:    [[WIDE_LOAD18:%.*]] = load <8 x float>, ptr [[TMP31]], align 4, !alias.scope [[META24:![0-9]+]]
698; AVX512-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, <8 x ptr> [[TMP26]], i64 1
699; AVX512-NEXT:    call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> [[WIDE_LOAD18]], <8 x ptr> [[TMP32]], i32 4, <8 x i1> splat (i1 true)), !alias.scope [[META20]], !noalias [[META22]]
700; AVX512-NEXT:    [[INDEX_NEXT24]] = add nuw i64 [[INDEX18]], 8
701; AVX512-NEXT:    [[PTR_IND20]] = getelementptr i8, ptr [[POINTER_PHI19]], i64 512
702; AVX512-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC10]]
703; AVX512-NEXT:    br i1 [[TMP33]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
704; AVX512:       vec.epilog.middle.block:
705; AVX512-NEXT:    [[CMP_N17:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC10]]
706; AVX512-NEXT:    br i1 [[CMP_N17]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
707; AVX512:       vec.epilog.scalar.ph:
708; AVX512-NEXT:    [[BC_RESUME_VAL13:%.*]] = phi ptr [ [[IND_END11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ], [ [[PTR]], [[ITER_CHECK]] ], [ [[IND_END12]], [[VEC_EPILOG_ITER_CHECK]] ]
709; AVX512-NEXT:    [[BC_RESUME_VAL16:%.*]] = phi ptr [ [[IND_END14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ], [ [[DEST]], [[ITER_CHECK]] ], [ [[IND_END15]], [[VEC_EPILOG_ITER_CHECK]] ]
710; AVX512-NEXT:    br label [[FOR_BODY:%.*]]
711; AVX512:       for.body:
712; AVX512-NEXT:    [[PTR_ADDR_012:%.*]] = phi ptr [ [[BC_RESUME_VAL13]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
713; AVX512-NEXT:    [[DEST_ADDR_011:%.*]] = phi ptr [ [[BC_RESUME_VAL16]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ]
714; AVX512-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 [[IDXPROM]]
715; AVX512-NEXT:    [[TMP34:%.*]] = load float, ptr [[ARRAYIDX]], align 4
716; AVX512-NEXT:    store float [[TMP34]], ptr [[DEST_ADDR_011]], align 4
717; AVX512-NEXT:    [[TMP35:%.*]] = load float, ptr [[PTR_ADDR_012]], align 4
718; AVX512-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 1
719; AVX512-NEXT:    store float [[TMP35]], ptr [[ARRAYIDX5]], align 4
720; AVX512-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 1
721; AVX512-NEXT:    [[ADD_PTR6]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 16
722; AVX512-NEXT:    [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[ADD_PTR]]
723; AVX512-NEXT:    br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
724; AVX512:       for.end:
725; AVX512-NEXT:    ret void
726;
727; FVW2-LABEL: @test_gather_not_profitable_pr48429(
728; FVW2-NEXT:  entry:
729; FVW2-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[D:%.*]] to i64
730; FVW2-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds float, ptr [[PTR:%.*]], i64 [[IDX_EXT]]
731; FVW2-NEXT:    [[CMP_NOT10:%.*]] = icmp eq i32 [[D]], 0
732; FVW2-NEXT:    br i1 [[CMP_NOT10]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]]
733; FVW2:       for.body.lr.ph:
734; FVW2-NEXT:    [[MUL:%.*]] = sub nsw i32 0, [[D]]
735; FVW2-NEXT:    [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
736; FVW2-NEXT:    [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 2
737; FVW2-NEXT:    [[TMP1:%.*]] = add nsw i64 [[TMP0]], -4
738; FVW2-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
739; FVW2-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
740; FVW2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 2
741; FVW2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
742; FVW2:       vector.memcheck:
743; FVW2-NEXT:    [[TMP4:%.*]] = shl nsw i64 [[IDX_EXT]], 2
744; FVW2-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], -4
745; FVW2-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP5]], 2
746; FVW2-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 6
747; FVW2-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 8
748; FVW2-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DEST:%.*]], i64 [[TMP8]]
749; FVW2-NEXT:    [[TMP9:%.*]] = shl nuw i64 [[TMP6]], 2
750; FVW2-NEXT:    [[TMP10:%.*]] = add i64 [[TMP9]], 4
751; FVW2-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP10]]
752; FVW2-NEXT:    [[TMP11:%.*]] = mul nsw i64 [[IDX_EXT]], -4
753; FVW2-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP11]]
754; FVW2-NEXT:    [[TMP12:%.*]] = sub i64 [[TMP10]], [[TMP4]]
755; FVW2-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP12]]
756; FVW2-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DEST]], [[SCEVGEP1]]
757; FVW2-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[PTR]], [[SCEVGEP]]
758; FVW2-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
759; FVW2-NEXT:    [[BOUND04:%.*]] = icmp ult ptr [[DEST]], [[SCEVGEP3]]
760; FVW2-NEXT:    [[BOUND15:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP]]
761; FVW2-NEXT:    [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]]
762; FVW2-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]]
763; FVW2-NEXT:    br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
764; FVW2:       vector.ph:
765; FVW2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2
766; FVW2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
767; FVW2-NEXT:    [[TMP13:%.*]] = mul i64 [[N_VEC]], 4
768; FVW2-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP13]]
769; FVW2-NEXT:    [[TMP14:%.*]] = mul i64 [[N_VEC]], 64
770; FVW2-NEXT:    [[IND_END7:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP14]]
771; FVW2-NEXT:    br label [[VECTOR_BODY:%.*]]
772; FVW2:       vector.body:
773; FVW2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
774; FVW2-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
775; FVW2-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 0
776; FVW2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP15]]
777; FVW2-NEXT:    [[OFFSET_IDX9:%.*]] = mul i64 [[INDEX]], 64
778; FVW2-NEXT:    [[TMP17:%.*]] = add i64 [[OFFSET_IDX9]], 0
779; FVW2-NEXT:    [[TMP18:%.*]] = add i64 [[OFFSET_IDX9]], 64
780; FVW2-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP17]]
781; FVW2-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP18]]
782; FVW2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM]]
783; FVW2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP21]], i32 0
784; FVW2-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP22]], align 4, !alias.scope [[META8:![0-9]+]]
785; FVW2-NEXT:    [[TMP23:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0
786; FVW2-NEXT:    store float [[TMP23]], ptr [[TMP19]], align 4, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
787; FVW2-NEXT:    [[TMP24:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1
788; FVW2-NEXT:    store float [[TMP24]], ptr [[TMP20]], align 4, !alias.scope [[META11]], !noalias [[META13]]
789; FVW2-NEXT:    [[TMP25:%.*]] = getelementptr float, ptr [[TMP16]], i32 0
790; FVW2-NEXT:    [[WIDE_LOAD10:%.*]] = load <2 x float>, ptr [[TMP25]], align 4, !alias.scope [[META15:![0-9]+]]
791; FVW2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 1
792; FVW2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 1
793; FVW2-NEXT:    [[TMP28:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 0
794; FVW2-NEXT:    store float [[TMP28]], ptr [[TMP26]], align 4, !alias.scope [[META11]], !noalias [[META13]]
795; FVW2-NEXT:    [[TMP29:%.*]] = extractelement <2 x float> [[WIDE_LOAD10]], i32 1
796; FVW2-NEXT:    store float [[TMP29]], ptr [[TMP27]], align 4, !alias.scope [[META11]], !noalias [[META13]]
797; FVW2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
798; FVW2-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
799; FVW2-NEXT:    br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
800; FVW2:       middle.block:
801; FVW2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
802; FVW2-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[SCALAR_PH]]
803; FVW2:       scalar.ph:
804; FVW2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ], [ [[PTR]], [[FOR_BODY_LR_PH]] ]
805; FVW2-NEXT:    [[BC_RESUME_VAL8:%.*]] = phi ptr [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[DEST]], [[VECTOR_MEMCHECK]] ], [ [[DEST]], [[FOR_BODY_LR_PH]] ]
806; FVW2-NEXT:    br label [[FOR_BODY:%.*]]
807; FVW2:       for.body:
808; FVW2-NEXT:    [[PTR_ADDR_012:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
809; FVW2-NEXT:    [[DEST_ADDR_011:%.*]] = phi ptr [ [[BC_RESUME_VAL8]], [[SCALAR_PH]] ], [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ]
810; FVW2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 [[IDXPROM]]
811; FVW2-NEXT:    [[TMP31:%.*]] = load float, ptr [[ARRAYIDX]], align 4
812; FVW2-NEXT:    store float [[TMP31]], ptr [[DEST_ADDR_011]], align 4
813; FVW2-NEXT:    [[TMP32:%.*]] = load float, ptr [[PTR_ADDR_012]], align 4
814; FVW2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 1
815; FVW2-NEXT:    store float [[TMP32]], ptr [[ARRAYIDX5]], align 4
816; FVW2-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[PTR_ADDR_012]], i64 1
817; FVW2-NEXT:    [[ADD_PTR6]] = getelementptr inbounds float, ptr [[DEST_ADDR_011]], i64 16
818; FVW2-NEXT:    [[CMP_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR]], [[ADD_PTR]]
819; FVW2-NEXT:    br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
820; FVW2:       for.end:
821; FVW2-NEXT:    ret void
822;
823entry:
824  %idx.ext = sext i32 %d to i64
825  %add.ptr = getelementptr inbounds float, ptr %ptr, i64 %idx.ext
826  %cmp.not10 = icmp eq i32 %d, 0
827  br i1 %cmp.not10, label %for.end, label %for.body.lr.ph
828
829for.body.lr.ph:
830  %mul = sub nsw i32 0, %d
831  %idxprom = sext i32 %mul to i64
832  br label %for.body
833
834for.body:
835  %ptr.addr.012 = phi ptr [ %ptr, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
836  %dest.addr.011 = phi ptr [ %dest, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
837  %arrayidx = getelementptr inbounds float, ptr %ptr.addr.012, i64 %idxprom
838  %0 = load float, ptr %arrayidx, align 4
839  store float %0, ptr %dest.addr.011, align 4
840  %1 = load float, ptr %ptr.addr.012, align 4
841  %arrayidx5 = getelementptr inbounds float, ptr %dest.addr.011, i64 1
842  store float %1, ptr %arrayidx5, align 4
843  %incdec.ptr = getelementptr inbounds float, ptr %ptr.addr.012, i64 1
844  %add.ptr6 = getelementptr inbounds float, ptr %dest.addr.011, i64 16
845  %cmp.not = icmp eq ptr %incdec.ptr, %add.ptr
846  br i1 %cmp.not, label %for.end, label %for.body
847
848for.end:
849  ret void
850}
851