; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+v < %s | FileCheck --check-prefix=RV32 %s
; RUN: llc -mtriple=riscv64 -mattr=+m,+v < %s | FileCheck --check-prefix=RV64 %s

; FIXME: We can rematerialize "addi s0, a1, 32" (ideally along the edge
; %do_call -> %exit), and shrink wrap this routine
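;
; For illustration only (hand-written, not generated output; register choices
; and the exact mechanism are assumptions): shrink wrapping would confine the
; frame setup to the %do_call path, with the address either rematerialized or
; reloaded after the call instead of living in s0 the whole time. Roughly:
;   addi a2, a1, 32                  # caller-saved register, no s0 needed
;   vsetivli zero, 4, e32, m1, ta, ma
;   ...load/add/store via (a2)...
;   li a1, 57
;   beq a0, a1, .LBB0_2              # fall-through path never touches the stack
;   # %do_call:
;   addi sp, sp, -16                 # frame setup only where the call happens
;   sw ra, 12(sp)
;   sw a2, 8(sp)                     # keep the address available across the call
;   call foo
;   lw a2, 8(sp)                     # (or rematerialize it from a reloaded %p)
;   lw ra, 12(sp)
;   addi sp, sp, 16
; .LBB0_2: # %exit
;   ...load/add/store via (a2)...
;   ret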
define void @vecaddr_straightline(i32 zeroext %a, ptr %p) {
; RV32-LABEL: vecaddr_straightline:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
; RV32-NEXT:    .cfi_offset ra, -4
; RV32-NEXT:    .cfi_offset s0, -8
; RV32-NEXT:    addi s0, a1, 32
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vle32.v v8, (s0)
; RV32-NEXT:    vadd.vi v8, v8, 1
; RV32-NEXT:    li a1, 57
; RV32-NEXT:    vse32.v v8, (s0)
; RV32-NEXT:    beq a0, a1, .LBB0_2
; RV32-NEXT:  # %bb.1: # %do_call
; RV32-NEXT:    call foo
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:  .LBB0_2: # %exit
; RV32-NEXT:    vle32.v v8, (s0)
; RV32-NEXT:    vadd.vi v8, v8, 1
; RV32-NEXT:    vse32.v v8, (s0)
; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
; RV32-NEXT:    .cfi_restore ra
; RV32-NEXT:    .cfi_restore s0
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vecaddr_straightline:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    .cfi_def_cfa_offset 16
; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
; RV64-NEXT:    .cfi_offset ra, -8
; RV64-NEXT:    .cfi_offset s0, -16
; RV64-NEXT:    addi s0, a1, 32
; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT:    vle32.v v8, (s0)
; RV64-NEXT:    vadd.vi v8, v8, 1
; RV64-NEXT:    li a1, 57
; RV64-NEXT:    vse32.v v8, (s0)
; RV64-NEXT:    beq a0, a1, .LBB0_2
; RV64-NEXT:  # %bb.1: # %do_call
; RV64-NEXT:    call foo
; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT:  .LBB0_2: # %exit
; RV64-NEXT:    vle32.v v8, (s0)
; RV64-NEXT:    vadd.vi v8, v8, 1
; RV64-NEXT:    vse32.v v8, (s0)
; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
; RV64-NEXT:    .cfi_restore ra
; RV64-NEXT:    .cfi_restore s0
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %gep = getelementptr i8, ptr %p, i32 32
  %v1 = load <4 x i32>, ptr %gep
  %v2 = add <4 x i32> %v1, splat (i32 1)
  store <4 x i32> %v2, ptr %gep
  %cmp0 = icmp eq i32 %a, 57
  br i1 %cmp0, label %exit, label %do_call
do_call:
  call i32 @foo()
  br label %exit
exit:
  %v3 = load <4 x i32>, ptr %gep
  %v4 = add <4 x i32> %v3, splat (i32 1)
  store <4 x i32> %v4, ptr %gep
  ret void
}

; In this case, the second use of the address is inside a loop, so keeping it
; in a callee-saved register to avoid a remat is the profitable choice.
; FIXME: We can shrink wrap the frame setup around the loop
; and avoid it along the %bb.0 -> %exit edge.
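;
; For illustration only (hand-written, not generated output; register choices
; are assumptions): the frame would be created only when the loop is entered,
; so the %bb.0 -> %exit edge stays frame-free. Roughly:
;   addi a2, a1, 32
;   vsetivli zero, 4, e32, m1, ta, ma
;   ...load/add/store via (a2)...
;   li a1, 57
;   beq a0, a1, .LBB1_2              # early exit: no prologue executed
;   addi sp, sp, -16                 # prologue sunk to the loop preheader
;   sw ra, 12(sp)
;   sw s0, 8(sp)
;   mv s0, a2                        # callee-saved register holds the address in the loop
; .LBB1_1: # %do_call
;   call foo
;   ...load/add/store via (s0)...
;   bnez a0, .LBB1_1
;   lw s0, 8(sp)                     # epilogue only on the loop-exit path
;   lw ra, 12(sp)
;   addi sp, sp, 16
; .LBB1_2: # %exit
;   ret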
define void @vecaddr_loop(i32 zeroext %a, ptr %p) {
; RV32-LABEL: vecaddr_loop:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
; RV32-NEXT:    .cfi_offset ra, -4
; RV32-NEXT:    .cfi_offset s0, -8
; RV32-NEXT:    addi s0, a1, 32
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vle32.v v8, (s0)
; RV32-NEXT:    vadd.vi v8, v8, 1
; RV32-NEXT:    li a1, 57
; RV32-NEXT:    vse32.v v8, (s0)
; RV32-NEXT:    beq a0, a1, .LBB1_2
; RV32-NEXT:  .LBB1_1: # %do_call
; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
; RV32-NEXT:    call foo
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vle32.v v8, (s0)
; RV32-NEXT:    vadd.vi v8, v8, 1
; RV32-NEXT:    vse32.v v8, (s0)
; RV32-NEXT:    bnez a0, .LBB1_1
; RV32-NEXT:  .LBB1_2: # %exit
; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
; RV32-NEXT:    .cfi_restore ra
; RV32-NEXT:    .cfi_restore s0
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vecaddr_loop:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    .cfi_def_cfa_offset 16
; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
; RV64-NEXT:    .cfi_offset ra, -8
; RV64-NEXT:    .cfi_offset s0, -16
; RV64-NEXT:    addi s0, a1, 32
; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT:    vle32.v v8, (s0)
; RV64-NEXT:    vadd.vi v8, v8, 1
; RV64-NEXT:    li a1, 57
; RV64-NEXT:    vse32.v v8, (s0)
; RV64-NEXT:    beq a0, a1, .LBB1_2
; RV64-NEXT:  .LBB1_1: # %do_call
; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
; RV64-NEXT:    call foo
; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT:    vle32.v v8, (s0)
; RV64-NEXT:    vadd.vi v8, v8, 1
; RV64-NEXT:    vse32.v v8, (s0)
; RV64-NEXT:    bnez a0, .LBB1_1
; RV64-NEXT:  .LBB1_2: # %exit
; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
; RV64-NEXT:    .cfi_restore ra
; RV64-NEXT:    .cfi_restore s0
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %gep = getelementptr i8, ptr %p, i32 32
  %v1 = load <4 x i32>, ptr %gep
  %v2 = add <4 x i32> %v1, splat (i32 1)
  store <4 x i32> %v2, ptr %gep
  %cmp0 = icmp eq i32 %a, 57
  br i1 %cmp0, label %exit, label %do_call
do_call:
  %b = call i32 @foo()
  %v3 = load <4 x i32>, ptr %gep
  %v4 = add <4 x i32> %v3, splat (i32 1)
  store <4 x i32> %v4, ptr %gep

  %cmp1 = icmp eq i32 %b, 0
  br i1 %cmp1, label %exit, label %do_call
exit:
  ret void
}

declare zeroext i32 @foo()