xref: /llvm-project/llvm/test/CodeGen/RISCV/rvv/dont-sink-splat-operands.ll (revision 9122c5235ec85ce0c0ad337e862b006e7b349d84)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+f -target-abi=lp64f \
3; RUN:   -mattr=+no-sink-splat-operands -riscv-v-vector-bits-min=128 \
4; RUN:   | FileCheck -check-prefix=NO-SINK %s
5; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+f -target-abi=lp64f \
6; RUN:   -mattr=-no-sink-splat-operands -riscv-v-vector-bits-min=128 \
7; RUN:   | FileCheck -check-prefix=SINK %s
8; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+f -target-abi=lp64f \
9; RUN:   -riscv-v-vector-bits-min=128 \
10; RUN:   | FileCheck -check-prefix=DEFAULT %s
11
12; Test that we don't sink splat operands when compiling with no-sink-splat-operands.
13; Each scalar register access requires an S2V transfer buffer entry. Using too many
14; of them limits performance.
15; FIXME: Not sinking is potentially bad for register pressure. We need a better heuristic.
16
; sink_splat_add: fixed-length <4 x i32> loop that adds a splat of the scalar
; %x to 1024 i32 elements of %a, four elements per iteration. The splat is
; built in the entry block, outside the loop that uses it.
;
; Expected code per check prefix:
;   * NO-SINK prefix - the splat is materialised once with vmv.v.x before the
;     loop and the loop body uses vadd.vv.
;   * SINK and DEFAULT prefixes - no vmv.v.x is emitted and the loop body uses
;     vadd.vx with the scalar register directly.
17define void @sink_splat_add(ptr nocapture %a, i32 signext %x) {
18; NO-SINK-LABEL: sink_splat_add:
19; NO-SINK:       # %bb.0: # %entry
20; NO-SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
21; NO-SINK-NEXT:    vmv.v.x v8, a1
22; NO-SINK-NEXT:    lui a1, 1
23; NO-SINK-NEXT:    add a1, a0, a1
24; NO-SINK-NEXT:  .LBB0_1: # %vector.body
25; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
26; NO-SINK-NEXT:    vle32.v v9, (a0)
27; NO-SINK-NEXT:    vadd.vv v9, v9, v8
28; NO-SINK-NEXT:    vse32.v v9, (a0)
29; NO-SINK-NEXT:    addi a0, a0, 16
30; NO-SINK-NEXT:    bne a0, a1, .LBB0_1
31; NO-SINK-NEXT:  # %bb.2: # %for.cond.cleanup
32; NO-SINK-NEXT:    ret
33;
34; SINK-LABEL: sink_splat_add:
35; SINK:       # %bb.0: # %entry
36; SINK-NEXT:    lui a2, 1
37; SINK-NEXT:    add a2, a0, a2
38; SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
39; SINK-NEXT:  .LBB0_1: # %vector.body
40; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
41; SINK-NEXT:    vle32.v v8, (a0)
42; SINK-NEXT:    vadd.vx v8, v8, a1
43; SINK-NEXT:    vse32.v v8, (a0)
44; SINK-NEXT:    addi a0, a0, 16
45; SINK-NEXT:    bne a0, a2, .LBB0_1
46; SINK-NEXT:  # %bb.2: # %for.cond.cleanup
47; SINK-NEXT:    ret
48;
49; DEFAULT-LABEL: sink_splat_add:
50; DEFAULT:       # %bb.0: # %entry
51; DEFAULT-NEXT:    lui a2, 1
52; DEFAULT-NEXT:    add a2, a0, a2
53; DEFAULT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
54; DEFAULT-NEXT:  .LBB0_1: # %vector.body
55; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
56; DEFAULT-NEXT:    vle32.v v8, (a0)
57; DEFAULT-NEXT:    vadd.vx v8, v8, a1
58; DEFAULT-NEXT:    vse32.v v8, (a0)
59; DEFAULT-NEXT:    addi a0, a0, 16
60; DEFAULT-NEXT:    bne a0, a2, .LBB0_1
61; DEFAULT-NEXT:  # %bb.2: # %for.cond.cleanup
62; DEFAULT-NEXT:    ret
63entry:
; Broadcast %x to all four lanes (insert into lane 0, then shuffle with an
; all-zero mask). This is the loop-invariant splat operand under test.
64  %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
65  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
66  br label %vector.body
67
68vector.body:                                      ; preds = %vector.body, %entry
69  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
70  %0 = getelementptr inbounds i32, ptr %a, i64 %index
; The ptr-to-ptr bitcasts below are no-ops.
71  %1 = bitcast ptr %0 to ptr
72  %wide.load = load <4 x i32>, ptr %1, align 4
; The only user of the splat; it lives in a different block than the splat.
73  %2 = add <4 x i32> %wide.load, %broadcast.splat
74  %3 = bitcast ptr %0 to ptr
75  store <4 x i32> %2, ptr %3, align 4
; Advance by four elements and stop once 1024 elements have been processed.
76  %index.next = add nuw i64 %index, 4
77  %4 = icmp eq i64 %index.next, 1024
78  br i1 %4, label %for.cond.cleanup, label %vector.body
79
80for.cond.cleanup:                                 ; preds = %vector.body
81  ret void
82}
83
84declare i64 @llvm.vscale.i64()
85
; sink_splat_add_scalable: scalable-vector variant of sink_splat_add. A
; <vscale x 4 x i32> vector loop adds a splat of %x to as many of the 1024 i32
; elements of %a as fit a whole number of vectors, and a scalar loop handles
; the remainder. The splat is built in vector.ph, outside vector.body.
;
; Expected code per check prefix:
;   * NO-SINK prefix - vmv.v.x in the vector preheader and vadd.vv in the
;     vector loop.
;   * SINK and DEFAULT prefixes - vadd.vx in the vector loop, no vmv.v.x.
; The scalar remainder loop (lw/add/sw) is the same for all three prefixes.
86define void @sink_splat_add_scalable(ptr nocapture %a, i32 signext %x) {
87; NO-SINK-LABEL: sink_splat_add_scalable:
88; NO-SINK:       # %bb.0: # %entry
89; NO-SINK-NEXT:    csrr a5, vlenb
90; NO-SINK-NEXT:    srli a3, a5, 1
91; NO-SINK-NEXT:    li a2, 1024
92; NO-SINK-NEXT:    bgeu a2, a3, .LBB1_2
93; NO-SINK-NEXT:  # %bb.1:
94; NO-SINK-NEXT:    li a2, 0
95; NO-SINK-NEXT:    j .LBB1_5
96; NO-SINK-NEXT:  .LBB1_2: # %vector.ph
97; NO-SINK-NEXT:    addi a2, a3, -1
98; NO-SINK-NEXT:    andi a4, a2, 1024
99; NO-SINK-NEXT:    xori a2, a4, 1024
100; NO-SINK-NEXT:    vsetvli a6, zero, e32, m2, ta, ma
101; NO-SINK-NEXT:    vmv.v.x v8, a1
102; NO-SINK-NEXT:    slli a5, a5, 1
103; NO-SINK-NEXT:    mv a6, a0
104; NO-SINK-NEXT:    mv a7, a2
105; NO-SINK-NEXT:  .LBB1_3: # %vector.body
106; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
107; NO-SINK-NEXT:    vl2re32.v v10, (a6)
108; NO-SINK-NEXT:    sub a7, a7, a3
109; NO-SINK-NEXT:    vadd.vv v10, v10, v8
110; NO-SINK-NEXT:    vs2r.v v10, (a6)
111; NO-SINK-NEXT:    add a6, a6, a5
112; NO-SINK-NEXT:    bnez a7, .LBB1_3
113; NO-SINK-NEXT:  # %bb.4: # %middle.block
114; NO-SINK-NEXT:    beqz a4, .LBB1_7
115; NO-SINK-NEXT:  .LBB1_5: # %for.body.preheader
116; NO-SINK-NEXT:    slli a2, a2, 2
117; NO-SINK-NEXT:    lui a3, 1
118; NO-SINK-NEXT:    add a2, a0, a2
119; NO-SINK-NEXT:    add a0, a0, a3
120; NO-SINK-NEXT:  .LBB1_6: # %for.body
121; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
122; NO-SINK-NEXT:    lw a3, 0(a2)
123; NO-SINK-NEXT:    add a3, a3, a1
124; NO-SINK-NEXT:    sw a3, 0(a2)
125; NO-SINK-NEXT:    addi a2, a2, 4
126; NO-SINK-NEXT:    bne a2, a0, .LBB1_6
127; NO-SINK-NEXT:  .LBB1_7: # %for.cond.cleanup
128; NO-SINK-NEXT:    ret
129;
130; SINK-LABEL: sink_splat_add_scalable:
131; SINK:       # %bb.0: # %entry
132; SINK-NEXT:    csrr a5, vlenb
133; SINK-NEXT:    srli a3, a5, 1
134; SINK-NEXT:    li a2, 1024
135; SINK-NEXT:    bgeu a2, a3, .LBB1_2
136; SINK-NEXT:  # %bb.1:
137; SINK-NEXT:    li a2, 0
138; SINK-NEXT:    j .LBB1_5
139; SINK-NEXT:  .LBB1_2: # %vector.ph
140; SINK-NEXT:    addi a2, a3, -1
141; SINK-NEXT:    andi a4, a2, 1024
142; SINK-NEXT:    xori a2, a4, 1024
143; SINK-NEXT:    slli a5, a5, 1
144; SINK-NEXT:    mv a6, a0
145; SINK-NEXT:    mv a7, a2
146; SINK-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
147; SINK-NEXT:  .LBB1_3: # %vector.body
148; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
149; SINK-NEXT:    vl2re32.v v8, (a6)
150; SINK-NEXT:    sub a7, a7, a3
151; SINK-NEXT:    vadd.vx v8, v8, a1
152; SINK-NEXT:    vs2r.v v8, (a6)
153; SINK-NEXT:    add a6, a6, a5
154; SINK-NEXT:    bnez a7, .LBB1_3
155; SINK-NEXT:  # %bb.4: # %middle.block
156; SINK-NEXT:    beqz a4, .LBB1_7
157; SINK-NEXT:  .LBB1_5: # %for.body.preheader
158; SINK-NEXT:    slli a2, a2, 2
159; SINK-NEXT:    lui a3, 1
160; SINK-NEXT:    add a2, a0, a2
161; SINK-NEXT:    add a0, a0, a3
162; SINK-NEXT:  .LBB1_6: # %for.body
163; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
164; SINK-NEXT:    lw a3, 0(a2)
165; SINK-NEXT:    add a3, a3, a1
166; SINK-NEXT:    sw a3, 0(a2)
167; SINK-NEXT:    addi a2, a2, 4
168; SINK-NEXT:    bne a2, a0, .LBB1_6
169; SINK-NEXT:  .LBB1_7: # %for.cond.cleanup
170; SINK-NEXT:    ret
171;
172; DEFAULT-LABEL: sink_splat_add_scalable:
173; DEFAULT:       # %bb.0: # %entry
174; DEFAULT-NEXT:    csrr a5, vlenb
175; DEFAULT-NEXT:    srli a3, a5, 1
176; DEFAULT-NEXT:    li a2, 1024
177; DEFAULT-NEXT:    bgeu a2, a3, .LBB1_2
178; DEFAULT-NEXT:  # %bb.1:
179; DEFAULT-NEXT:    li a2, 0
180; DEFAULT-NEXT:    j .LBB1_5
181; DEFAULT-NEXT:  .LBB1_2: # %vector.ph
182; DEFAULT-NEXT:    addi a2, a3, -1
183; DEFAULT-NEXT:    andi a4, a2, 1024
184; DEFAULT-NEXT:    xori a2, a4, 1024
185; DEFAULT-NEXT:    slli a5, a5, 1
186; DEFAULT-NEXT:    mv a6, a0
187; DEFAULT-NEXT:    mv a7, a2
188; DEFAULT-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
189; DEFAULT-NEXT:  .LBB1_3: # %vector.body
190; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
191; DEFAULT-NEXT:    vl2re32.v v8, (a6)
192; DEFAULT-NEXT:    sub a7, a7, a3
193; DEFAULT-NEXT:    vadd.vx v8, v8, a1
194; DEFAULT-NEXT:    vs2r.v v8, (a6)
195; DEFAULT-NEXT:    add a6, a6, a5
196; DEFAULT-NEXT:    bnez a7, .LBB1_3
197; DEFAULT-NEXT:  # %bb.4: # %middle.block
198; DEFAULT-NEXT:    beqz a4, .LBB1_7
199; DEFAULT-NEXT:  .LBB1_5: # %for.body.preheader
200; DEFAULT-NEXT:    slli a2, a2, 2
201; DEFAULT-NEXT:    lui a3, 1
202; DEFAULT-NEXT:    add a2, a0, a2
203; DEFAULT-NEXT:    add a0, a0, a3
204; DEFAULT-NEXT:  .LBB1_6: # %for.body
205; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
206; DEFAULT-NEXT:    lw a3, 0(a2)
207; DEFAULT-NEXT:    add a3, a3, a1
208; DEFAULT-NEXT:    sw a3, 0(a2)
209; DEFAULT-NEXT:    addi a2, a2, 4
210; DEFAULT-NEXT:    bne a2, a0, .LBB1_6
211; DEFAULT-NEXT:  .LBB1_7: # %for.cond.cleanup
212; DEFAULT-NEXT:    ret
213entry:
; %1 = vscale * 4, the element count of one <vscale x 4 x i32> vector.
; If a single vector would exceed the 1024-element trip count, bypass the
; vector loop and run everything in the scalar loop.
214  %0 = call i64 @llvm.vscale.i64()
215  %1 = shl i64 %0, 2
216  %min.iters.check = icmp ugt i64 %1, 1024
217  br i1 %min.iters.check, label %for.body.preheader, label %vector.ph
218
219vector.ph:                                        ; preds = %entry
; %n.vec is 1024 rounded down to a multiple of vscale * 4; the remaining
; %n.mod.vf elements are left for the scalar loop.
220  %2 = call i64 @llvm.vscale.i64()
221  %3 = shl i64 %2, 2
222  %n.mod.vf = urem i64 1024, %3
223  %n.vec = sub nsw i64 1024, %n.mod.vf
; Loop-invariant splat of %x, defined here rather than in vector.body. This is
; the splat operand under test.
224  %broadcast.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
225  %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; %5 = vscale * 4 again, used as the per-iteration index step.
226  %4 = call i64 @llvm.vscale.i64()
227  %5 = shl i64 %4, 2
228  br label %vector.body
229
230vector.body:                                      ; preds = %vector.body, %vector.ph
231  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
232  %6 = getelementptr inbounds i32, ptr %a, i64 %index
; The ptr-to-ptr bitcasts below are no-ops.
233  %7 = bitcast ptr %6 to ptr
234  %wide.load = load <vscale x 4 x i32>, ptr %7, align 4
; The only vector user of the splat.
235  %8 = add <vscale x 4 x i32> %wide.load, %broadcast.splat
236  %9 = bitcast ptr %6 to ptr
237  store <vscale x 4 x i32> %8, ptr %9, align 4
238  %index.next = add nuw i64 %index, %5
239  %10 = icmp eq i64 %index.next, %n.vec
240  br i1 %10, label %middle.block, label %vector.body
241
242middle.block:                                     ; preds = %vector.body
; Enter the scalar remainder loop only if the vector loop left elements over.
243  %cmp.n = icmp eq i64 %n.mod.vf, 0
244  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader
245
246for.body.preheader:                               ; preds = %entry, %middle.block
; Start index of the scalar loop: 0 when the vector loop was skipped,
; otherwise %n.vec.
247  %indvars.iv.ph = phi i64 [ 0, %entry ], [ %n.vec, %middle.block ]
248  br label %for.body
249
250for.cond.cleanup:                                 ; preds = %for.body, %middle.block
251  ret void
252
253for.body:                                         ; preds = %for.body.preheader, %for.body
; Scalar remainder loop: adds %x to one i32 element per iteration until index
; 1024 is reached.
254  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ]
255  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
256  %11 = load i32, ptr %arrayidx, align 4
257  %add = add i32 %11, %x
258  store i32 %add, ptr %arrayidx, align 4
259  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
260  %cmp.not = icmp eq i64 %indvars.iv.next, 1024
261  br i1 %cmp.not, label %for.cond.cleanup, label %for.body
262}
263
264declare <4 x i32> @llvm.vp.add.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
265
; sink_splat_vp_add: same loop shape as sink_splat_add, but the addition is the
; vector-predicated intrinsic llvm.vp.add with mask %m and explicit vector
; length %vl. The splat of %x is built in the entry block, outside the loop.
;
; Expected code per check prefix:
;   * NO-SINK prefix - vmv.v.x before the loop and a masked vadd.vv in the
;     loop.
;   * SINK and DEFAULT prefixes - a masked vadd.vx in the loop, no vmv.v.x.
; In every variant the loop switches VL with vsetvli to the %vl register for
; the masked add and back to 4 with vsetivli for the store.
266define void @sink_splat_vp_add(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
267; NO-SINK-LABEL: sink_splat_vp_add:
268; NO-SINK:       # %bb.0: # %entry
269; NO-SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
270; NO-SINK-NEXT:    vmv.v.x v8, a1
271; NO-SINK-NEXT:    lui a1, 1
272; NO-SINK-NEXT:    add a1, a0, a1
273; NO-SINK-NEXT:  .LBB2_1: # %vector.body
274; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
275; NO-SINK-NEXT:    vle32.v v9, (a0)
276; NO-SINK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
277; NO-SINK-NEXT:    vadd.vv v9, v9, v8, v0.t
278; NO-SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
279; NO-SINK-NEXT:    vse32.v v9, (a0)
280; NO-SINK-NEXT:    addi a0, a0, 16
281; NO-SINK-NEXT:    bne a0, a1, .LBB2_1
282; NO-SINK-NEXT:  # %bb.2: # %for.cond.cleanup
283; NO-SINK-NEXT:    ret
284;
285; SINK-LABEL: sink_splat_vp_add:
286; SINK:       # %bb.0: # %entry
287; SINK-NEXT:    lui a3, 1
288; SINK-NEXT:    add a3, a0, a3
289; SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
290; SINK-NEXT:  .LBB2_1: # %vector.body
291; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
292; SINK-NEXT:    vle32.v v8, (a0)
293; SINK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
294; SINK-NEXT:    vadd.vx v8, v8, a1, v0.t
295; SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
296; SINK-NEXT:    vse32.v v8, (a0)
297; SINK-NEXT:    addi a0, a0, 16
298; SINK-NEXT:    bne a0, a3, .LBB2_1
299; SINK-NEXT:  # %bb.2: # %for.cond.cleanup
300; SINK-NEXT:    ret
301;
302; DEFAULT-LABEL: sink_splat_vp_add:
303; DEFAULT:       # %bb.0: # %entry
304; DEFAULT-NEXT:    lui a3, 1
305; DEFAULT-NEXT:    add a3, a0, a3
306; DEFAULT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
307; DEFAULT-NEXT:  .LBB2_1: # %vector.body
308; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
309; DEFAULT-NEXT:    vle32.v v8, (a0)
310; DEFAULT-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
311; DEFAULT-NEXT:    vadd.vx v8, v8, a1, v0.t
312; DEFAULT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
313; DEFAULT-NEXT:    vse32.v v8, (a0)
314; DEFAULT-NEXT:    addi a0, a0, 16
315; DEFAULT-NEXT:    bne a0, a3, .LBB2_1
316; DEFAULT-NEXT:  # %bb.2: # %for.cond.cleanup
317; DEFAULT-NEXT:    ret
318entry:
; Broadcast %x to all four lanes; this loop-invariant splat is the operand
; under test.
319  %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
320  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
321  br label %vector.body
322
323vector.body:                                      ; preds = %vector.body, %entry
324  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
325  %0 = getelementptr inbounds i32, ptr %a, i64 %index
; The ptr-to-ptr bitcasts below are no-ops.
326  %1 = bitcast ptr %0 to ptr
327  %wide.load = load <4 x i32>, ptr %1, align 4
; Predicated add: the splat is the second vector operand, %m the mask and %vl
; the explicit vector length.
328  %2 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl)
329  %3 = bitcast ptr %0 to ptr
330  store <4 x i32> %2, ptr %3, align 4
; Advance by four elements and stop once 1024 elements have been processed.
331  %index.next = add nuw i64 %index, 4
332  %4 = icmp eq i64 %index.next, 1024
333  br i1 %4, label %for.cond.cleanup, label %vector.body
334
335for.cond.cleanup:                                 ; preds = %vector.body
336  ret void
337}
338
; sink_splat_fadd: floating-point counterpart of sink_splat_add. A fixed-length
; <4 x float> loop adds a splat of the scalar %x to 1024 float elements of %a,
; four per iteration. The splat is built in the entry block, outside the loop.
;
; Expected code per check prefix:
;   * NO-SINK prefix - the splat is materialised once with vfmv.v.f before the
;     loop and the loop body uses vfadd.vv.
;   * SINK and DEFAULT prefixes - no vfmv.v.f is emitted; the loop body uses
;     vfadd.vf with the scalar FP register directly.
339define void @sink_splat_fadd(ptr nocapture %a, float %x) {
340; NO-SINK-LABEL: sink_splat_fadd:
341; NO-SINK:       # %bb.0: # %entry
342; NO-SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
343; NO-SINK-NEXT:    vfmv.v.f v8, fa0
344; NO-SINK-NEXT:    lui a1, 1
345; NO-SINK-NEXT:    add a1, a0, a1
346; NO-SINK-NEXT:  .LBB3_1: # %vector.body
347; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
348; NO-SINK-NEXT:    vle32.v v9, (a0)
349; NO-SINK-NEXT:    vfadd.vv v9, v9, v8
350; NO-SINK-NEXT:    vse32.v v9, (a0)
351; NO-SINK-NEXT:    addi a0, a0, 16
352; NO-SINK-NEXT:    bne a0, a1, .LBB3_1
353; NO-SINK-NEXT:  # %bb.2: # %for.cond.cleanup
354; NO-SINK-NEXT:    ret
355;
356; SINK-LABEL: sink_splat_fadd:
357; SINK:       # %bb.0: # %entry
358; SINK-NEXT:    lui a1, 1
359; SINK-NEXT:    add a1, a0, a1
360; SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
361; SINK-NEXT:  .LBB3_1: # %vector.body
362; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
363; SINK-NEXT:    vle32.v v8, (a0)
364; SINK-NEXT:    vfadd.vf v8, v8, fa0
365; SINK-NEXT:    vse32.v v8, (a0)
366; SINK-NEXT:    addi a0, a0, 16
367; SINK-NEXT:    bne a0, a1, .LBB3_1
368; SINK-NEXT:  # %bb.2: # %for.cond.cleanup
369; SINK-NEXT:    ret
370;
371; DEFAULT-LABEL: sink_splat_fadd:
372; DEFAULT:       # %bb.0: # %entry
373; DEFAULT-NEXT:    lui a1, 1
374; DEFAULT-NEXT:    add a1, a0, a1
375; DEFAULT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
376; DEFAULT-NEXT:  .LBB3_1: # %vector.body
377; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
378; DEFAULT-NEXT:    vle32.v v8, (a0)
379; DEFAULT-NEXT:    vfadd.vf v8, v8, fa0
380; DEFAULT-NEXT:    vse32.v v8, (a0)
381; DEFAULT-NEXT:    addi a0, a0, 16
382; DEFAULT-NEXT:    bne a0, a1, .LBB3_1
383; DEFAULT-NEXT:  # %bb.2: # %for.cond.cleanup
384; DEFAULT-NEXT:    ret
385entry:
; Broadcast %x to all four lanes; this loop-invariant splat is the operand
; under test.
386  %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0
387  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
388  br label %vector.body
389
390vector.body:                                      ; preds = %vector.body, %entry
391  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
392  %0 = getelementptr inbounds float, ptr %a, i64 %index
; The ptr-to-ptr bitcasts below are no-ops.
393  %1 = bitcast ptr %0 to ptr
394  %wide.load = load <4 x float>, ptr %1, align 4
; The only user of the splat; it lives in a different block than the splat.
395  %2 = fadd <4 x float> %wide.load, %broadcast.splat
396  %3 = bitcast ptr %0 to ptr
397  store <4 x float> %2, ptr %3, align 4
; Advance by four elements and stop once 1024 elements have been processed.
398  %index.next = add nuw i64 %index, 4
399  %4 = icmp eq i64 %index.next, 1024
400  br i1 %4, label %for.cond.cleanup, label %vector.body
401
402for.cond.cleanup:                                 ; preds = %vector.body
403  ret void
404}
405
; sink_splat_fadd_scalable: scalable-vector variant of sink_splat_fadd. A
; <vscale x 2 x float> vector loop adds a splat of %x to as many of the 1024
; float elements of %a as fit a whole number of vectors, and a scalar loop
; handles the remainder. The splat is built in vector.ph, outside vector.body.
;
; Expected code per check prefix:
;   * NO-SINK prefix - vfmv.v.f in the vector preheader and vfadd.vv in the
;     vector loop.
;   * SINK and DEFAULT prefixes - vfadd.vf in the vector loop, no vfmv.v.f.
; The scalar remainder loop (flw/fadd.s/fsw) is the same for all three
; prefixes.
406define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
407; NO-SINK-LABEL: sink_splat_fadd_scalable:
408; NO-SINK:       # %bb.0: # %entry
409; NO-SINK-NEXT:    csrr a1, vlenb
410; NO-SINK-NEXT:    srli a3, a1, 2
411; NO-SINK-NEXT:    li a2, 1024
412; NO-SINK-NEXT:    bgeu a2, a3, .LBB4_2
413; NO-SINK-NEXT:  # %bb.1:
414; NO-SINK-NEXT:    li a2, 0
415; NO-SINK-NEXT:    j .LBB4_5
416; NO-SINK-NEXT:  .LBB4_2: # %vector.ph
417; NO-SINK-NEXT:    addi a2, a3, -1
418; NO-SINK-NEXT:    andi a4, a2, 1024
419; NO-SINK-NEXT:    xori a2, a4, 1024
420; NO-SINK-NEXT:    vsetvli a5, zero, e32, m1, ta, ma
421; NO-SINK-NEXT:    vfmv.v.f v8, fa0
422; NO-SINK-NEXT:    mv a5, a0
423; NO-SINK-NEXT:    mv a6, a2
424; NO-SINK-NEXT:  .LBB4_3: # %vector.body
425; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
426; NO-SINK-NEXT:    vl1re32.v v9, (a5)
427; NO-SINK-NEXT:    sub a6, a6, a3
428; NO-SINK-NEXT:    vfadd.vv v9, v9, v8
429; NO-SINK-NEXT:    vs1r.v v9, (a5)
430; NO-SINK-NEXT:    add a5, a5, a1
431; NO-SINK-NEXT:    bnez a6, .LBB4_3
432; NO-SINK-NEXT:  # %bb.4: # %middle.block
433; NO-SINK-NEXT:    beqz a4, .LBB4_7
434; NO-SINK-NEXT:  .LBB4_5: # %for.body.preheader
435; NO-SINK-NEXT:    slli a1, a2, 2
436; NO-SINK-NEXT:    lui a2, 1
437; NO-SINK-NEXT:    add a1, a0, a1
438; NO-SINK-NEXT:    add a0, a0, a2
439; NO-SINK-NEXT:  .LBB4_6: # %for.body
440; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
441; NO-SINK-NEXT:    flw fa5, 0(a1)
442; NO-SINK-NEXT:    fadd.s fa5, fa5, fa0
443; NO-SINK-NEXT:    fsw fa5, 0(a1)
444; NO-SINK-NEXT:    addi a1, a1, 4
445; NO-SINK-NEXT:    bne a1, a0, .LBB4_6
446; NO-SINK-NEXT:  .LBB4_7: # %for.cond.cleanup
447; NO-SINK-NEXT:    ret
448;
449; SINK-LABEL: sink_splat_fadd_scalable:
450; SINK:       # %bb.0: # %entry
451; SINK-NEXT:    csrr a1, vlenb
452; SINK-NEXT:    srli a3, a1, 2
453; SINK-NEXT:    li a2, 1024
454; SINK-NEXT:    bgeu a2, a3, .LBB4_2
455; SINK-NEXT:  # %bb.1:
456; SINK-NEXT:    li a2, 0
457; SINK-NEXT:    j .LBB4_5
458; SINK-NEXT:  .LBB4_2: # %vector.ph
459; SINK-NEXT:    addi a2, a3, -1
460; SINK-NEXT:    andi a4, a2, 1024
461; SINK-NEXT:    xori a2, a4, 1024
462; SINK-NEXT:    mv a5, a0
463; SINK-NEXT:    mv a6, a2
464; SINK-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
465; SINK-NEXT:  .LBB4_3: # %vector.body
466; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
467; SINK-NEXT:    vl1re32.v v8, (a5)
468; SINK-NEXT:    sub a6, a6, a3
469; SINK-NEXT:    vfadd.vf v8, v8, fa0
470; SINK-NEXT:    vs1r.v v8, (a5)
471; SINK-NEXT:    add a5, a5, a1
472; SINK-NEXT:    bnez a6, .LBB4_3
473; SINK-NEXT:  # %bb.4: # %middle.block
474; SINK-NEXT:    beqz a4, .LBB4_7
475; SINK-NEXT:  .LBB4_5: # %for.body.preheader
476; SINK-NEXT:    slli a1, a2, 2
477; SINK-NEXT:    lui a2, 1
478; SINK-NEXT:    add a1, a0, a1
479; SINK-NEXT:    add a0, a0, a2
480; SINK-NEXT:  .LBB4_6: # %for.body
481; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
482; SINK-NEXT:    flw fa5, 0(a1)
483; SINK-NEXT:    fadd.s fa5, fa5, fa0
484; SINK-NEXT:    fsw fa5, 0(a1)
485; SINK-NEXT:    addi a1, a1, 4
486; SINK-NEXT:    bne a1, a0, .LBB4_6
487; SINK-NEXT:  .LBB4_7: # %for.cond.cleanup
488; SINK-NEXT:    ret
489;
490; DEFAULT-LABEL: sink_splat_fadd_scalable:
491; DEFAULT:       # %bb.0: # %entry
492; DEFAULT-NEXT:    csrr a1, vlenb
493; DEFAULT-NEXT:    srli a3, a1, 2
494; DEFAULT-NEXT:    li a2, 1024
495; DEFAULT-NEXT:    bgeu a2, a3, .LBB4_2
496; DEFAULT-NEXT:  # %bb.1:
497; DEFAULT-NEXT:    li a2, 0
498; DEFAULT-NEXT:    j .LBB4_5
499; DEFAULT-NEXT:  .LBB4_2: # %vector.ph
500; DEFAULT-NEXT:    addi a2, a3, -1
501; DEFAULT-NEXT:    andi a4, a2, 1024
502; DEFAULT-NEXT:    xori a2, a4, 1024
503; DEFAULT-NEXT:    mv a5, a0
504; DEFAULT-NEXT:    mv a6, a2
505; DEFAULT-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
506; DEFAULT-NEXT:  .LBB4_3: # %vector.body
507; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
508; DEFAULT-NEXT:    vl1re32.v v8, (a5)
509; DEFAULT-NEXT:    sub a6, a6, a3
510; DEFAULT-NEXT:    vfadd.vf v8, v8, fa0
511; DEFAULT-NEXT:    vs1r.v v8, (a5)
512; DEFAULT-NEXT:    add a5, a5, a1
513; DEFAULT-NEXT:    bnez a6, .LBB4_3
514; DEFAULT-NEXT:  # %bb.4: # %middle.block
515; DEFAULT-NEXT:    beqz a4, .LBB4_7
516; DEFAULT-NEXT:  .LBB4_5: # %for.body.preheader
517; DEFAULT-NEXT:    slli a1, a2, 2
518; DEFAULT-NEXT:    lui a2, 1
519; DEFAULT-NEXT:    add a1, a0, a1
520; DEFAULT-NEXT:    add a0, a0, a2
521; DEFAULT-NEXT:  .LBB4_6: # %for.body
522; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
523; DEFAULT-NEXT:    flw fa5, 0(a1)
524; DEFAULT-NEXT:    fadd.s fa5, fa5, fa0
525; DEFAULT-NEXT:    fsw fa5, 0(a1)
526; DEFAULT-NEXT:    addi a1, a1, 4
527; DEFAULT-NEXT:    bne a1, a0, .LBB4_6
528; DEFAULT-NEXT:  .LBB4_7: # %for.cond.cleanup
529; DEFAULT-NEXT:    ret
530entry:
; %1 = vscale * 2, the element count of one <vscale x 2 x float> vector.
; If a single vector would exceed the 1024-element trip count, bypass the
; vector loop and run everything in the scalar loop.
531  %0 = call i64 @llvm.vscale.i64()
532  %1 = shl i64 %0, 1
533  %min.iters.check = icmp ugt i64 %1, 1024
534  br i1 %min.iters.check, label %for.body.preheader, label %vector.ph
535
536vector.ph:                                        ; preds = %entry
; %n.vec is 1024 rounded down to a multiple of vscale * 2; the remaining
; %n.mod.vf elements are left for the scalar loop.
537  %2 = call i64 @llvm.vscale.i64()
538  %3 = shl i64 %2, 1
539  %n.mod.vf = urem i64 1024, %3
540  %n.vec = sub nsw i64 1024, %n.mod.vf
; Loop-invariant splat of %x, defined here rather than in vector.body. This is
; the splat operand under test.
541  %broadcast.splatinsert = insertelement <vscale x 2 x float> poison, float %x, i32 0
542  %broadcast.splat = shufflevector <vscale x 2 x float> %broadcast.splatinsert, <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer
; %5 = vscale * 2 again, used as the per-iteration index step.
543  %4 = call i64 @llvm.vscale.i64()
544  %5 = shl i64 %4, 1
545  br label %vector.body
546
547vector.body:                                      ; preds = %vector.body, %vector.ph
548  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
549  %6 = getelementptr inbounds float, ptr %a, i64 %index
; The ptr-to-ptr bitcasts below are no-ops.
550  %7 = bitcast ptr %6 to ptr
551  %wide.load = load <vscale x 2 x float>, ptr %7, align 4
; The only vector user of the splat.
552  %8 = fadd <vscale x 2 x float> %wide.load, %broadcast.splat
553  %9 = bitcast ptr %6 to ptr
554  store <vscale x 2 x float> %8, ptr %9, align 4
555  %index.next = add nuw i64 %index, %5
556  %10 = icmp eq i64 %index.next, %n.vec
557  br i1 %10, label %middle.block, label %vector.body
558
559middle.block:                                     ; preds = %vector.body
; Enter the scalar remainder loop only if the vector loop left elements over.
560  %cmp.n = icmp eq i64 %n.mod.vf, 0
561  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader
562
563for.body.preheader:                               ; preds = %entry, %middle.block
; Start index of the scalar loop: 0 when the vector loop was skipped,
; otherwise %n.vec.
564  %indvars.iv.ph = phi i64 [ 0, %entry ], [ %n.vec, %middle.block ]
565  br label %for.body
566
567for.cond.cleanup:                                 ; preds = %for.body, %middle.block
568  ret void
569
570for.body:                                         ; preds = %for.body.preheader, %for.body
; Scalar remainder loop: adds %x to one float element per iteration until
; index 1024 is reached.
571  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ]
572  %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv
573  %11 = load float, ptr %arrayidx, align 4
; NOTE: despite its name, %mul holds the result of an fadd.
574  %mul = fadd float %11, %x
575  store float %mul, ptr %arrayidx, align 4
576  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
577  %cmp.not = icmp eq i64 %indvars.iv.next, 1024
578  br i1 %cmp.not, label %for.cond.cleanup, label %for.body
579}
580
; The overloaded type of this intrinsic is <4 x float>, so the mangling suffix
; is v4f32 (not v4i32, which names a <4 x i32> vector).
581declare <4 x float> @llvm.vp.fadd.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)
582
; sink_splat_vp_fadd: floating-point counterpart of sink_splat_vp_add. The
; addition is the vector-predicated intrinsic llvm.vp.fadd with mask %m and
; explicit vector length %vl. The splat of %x is built in the entry block,
; outside the loop that uses it.
;
; Expected code per check prefix:
;   * NO-SINK prefix - vfmv.v.f before the loop and a masked vfadd.vv in the
;     loop.
;   * SINK and DEFAULT prefixes - a masked vfadd.vf in the loop, no vfmv.v.f.
; In every variant the loop switches VL with vsetvli to the %vl register for
; the masked add and back to 4 with vsetivli for the store.
583define void @sink_splat_vp_fadd(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) {
584; NO-SINK-LABEL: sink_splat_vp_fadd:
585; NO-SINK:       # %bb.0: # %entry
586; NO-SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
587; NO-SINK-NEXT:    vfmv.v.f v8, fa0
588; NO-SINK-NEXT:    lui a2, 1
589; NO-SINK-NEXT:    add a2, a0, a2
590; NO-SINK-NEXT:  .LBB5_1: # %vector.body
591; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
592; NO-SINK-NEXT:    vle32.v v9, (a0)
593; NO-SINK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
594; NO-SINK-NEXT:    vfadd.vv v9, v9, v8, v0.t
595; NO-SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
596; NO-SINK-NEXT:    vse32.v v9, (a0)
597; NO-SINK-NEXT:    addi a0, a0, 16
598; NO-SINK-NEXT:    bne a0, a2, .LBB5_1
599; NO-SINK-NEXT:  # %bb.2: # %for.cond.cleanup
600; NO-SINK-NEXT:    ret
601;
602; SINK-LABEL: sink_splat_vp_fadd:
603; SINK:       # %bb.0: # %entry
604; SINK-NEXT:    lui a2, 1
605; SINK-NEXT:    add a2, a0, a2
606; SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
607; SINK-NEXT:  .LBB5_1: # %vector.body
608; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
609; SINK-NEXT:    vle32.v v8, (a0)
610; SINK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
611; SINK-NEXT:    vfadd.vf v8, v8, fa0, v0.t
612; SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
613; SINK-NEXT:    vse32.v v8, (a0)
614; SINK-NEXT:    addi a0, a0, 16
615; SINK-NEXT:    bne a0, a2, .LBB5_1
616; SINK-NEXT:  # %bb.2: # %for.cond.cleanup
617; SINK-NEXT:    ret
618;
619; DEFAULT-LABEL: sink_splat_vp_fadd:
620; DEFAULT:       # %bb.0: # %entry
621; DEFAULT-NEXT:    lui a2, 1
622; DEFAULT-NEXT:    add a2, a0, a2
623; DEFAULT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
624; DEFAULT-NEXT:  .LBB5_1: # %vector.body
625; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
626; DEFAULT-NEXT:    vle32.v v8, (a0)
627; DEFAULT-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
628; DEFAULT-NEXT:    vfadd.vf v8, v8, fa0, v0.t
629; DEFAULT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
630; DEFAULT-NEXT:    vse32.v v8, (a0)
631; DEFAULT-NEXT:    addi a0, a0, 16
632; DEFAULT-NEXT:    bne a0, a2, .LBB5_1
633; DEFAULT-NEXT:  # %bb.2: # %for.cond.cleanup
634; DEFAULT-NEXT:    ret
635entry:
; Broadcast %x to all four lanes; this loop-invariant splat is the operand
; under test.
636  %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0
637  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
638  br label %vector.body
639
640vector.body:                                      ; preds = %vector.body, %entry
641  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
642  %0 = getelementptr inbounds float, ptr %a, i64 %index
; The ptr-to-ptr bitcasts below are no-ops.
643  %1 = bitcast ptr %0 to ptr
644  %wide.load = load <4 x float>, ptr %1, align 4
; Predicated fadd: the splat is the second vector operand, %m the mask and %vl
; the explicit vector length.
645  %2 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl)
646  %3 = bitcast ptr %0 to ptr
647  store <4 x float> %2, ptr %3, align 4
; Advance by four elements and stop once 1024 elements have been processed.
648  %index.next = add nuw i64 %index, 4
649  %4 = icmp eq i64 %index.next, 1024
650  br i1 %4, label %for.cond.cleanup, label %vector.body
651
652for.cond.cleanup:                                 ; preds = %vector.body
653  ret void
654}
655