; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+v < %s | FileCheck --check-prefix=RV32 %s
; RUN: llc -mtriple=riscv64 -mattr=+m,+v < %s | FileCheck --check-prefix=RV64 %s

; FIXME: We can rematerialize "addi s0, a1, 32" (ideally along the edge
; %do_call -> %exit), and shrink wrap this routine
define void @vecaddr_straightline(i32 zeroext %a, ptr %p) {
; RV32-LABEL: vecaddr_straightline:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
; RV32-NEXT:    .cfi_offset ra, -4
; RV32-NEXT:    .cfi_offset s0, -8
; RV32-NEXT:    addi s0, a1, 32
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vle32.v v8, (s0)
; RV32-NEXT:    vadd.vi v8, v8, 1
; RV32-NEXT:    li a1, 57
; RV32-NEXT:    vse32.v v8, (s0)
; RV32-NEXT:    beq a0, a1, .LBB0_2
; RV32-NEXT:  # %bb.1: # %do_call
; RV32-NEXT:    call foo
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:  .LBB0_2: # %exit
; RV32-NEXT:    vle32.v v8, (s0)
; RV32-NEXT:    vadd.vi v8, v8, 1
; RV32-NEXT:    vse32.v v8, (s0)
; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
; RV32-NEXT:    .cfi_restore ra
; RV32-NEXT:    .cfi_restore s0
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vecaddr_straightline:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    .cfi_def_cfa_offset 16
; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
; RV64-NEXT:    .cfi_offset ra, -8
; RV64-NEXT:    .cfi_offset s0, -16
; RV64-NEXT:    addi s0, a1, 32
; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT:    vle32.v v8, (s0)
; RV64-NEXT:    vadd.vi v8, v8, 1
; RV64-NEXT:    li a1, 57
; RV64-NEXT:    vse32.v v8, (s0)
; RV64-NEXT:    beq a0, a1, .LBB0_2
; RV64-NEXT:  # %bb.1: # %do_call
; RV64-NEXT:    call foo
; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT:  .LBB0_2: # %exit
; RV64-NEXT:    vle32.v v8, (s0)
; RV64-NEXT:    vadd.vi v8, v8, 1
; RV64-NEXT:    vse32.v v8, (s0)
; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
; RV64-NEXT:    .cfi_restore ra
; RV64-NEXT:    .cfi_restore s0
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %gep = getelementptr i8, ptr %p, i32 32
  %v1 = load <4 x i32>, ptr %gep
  %v2 = add <4 x i32> %v1, splat (i32 1)
  store <4 x i32> %v2, ptr %gep
  %cmp0 = icmp eq i32 %a, 57
  br i1 %cmp0, label %exit, label %do_call
do_call:
  call i32 @foo()
  br label %exit
exit:
  %v3 = load <4 x i32>, ptr %gep
  %v4 = add <4 x i32> %v3, splat (i32 1)
  store <4 x i32> %v4, ptr %gep
  ret void
}

; In this case, the second use is in a loop, so using a callee
; saved register to avoid a remat is the profitable choice.
; FIXME: We can shrink wrap the frame setup around the loop
; and avoid it along the %bb.0 -> %exit edge
define void @vecaddr_loop(i32 zeroext %a, ptr %p) {
; RV32-LABEL: vecaddr_loop:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
; RV32-NEXT:    .cfi_offset ra, -4
; RV32-NEXT:    .cfi_offset s0, -8
; RV32-NEXT:    addi s0, a1, 32
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vle32.v v8, (s0)
; RV32-NEXT:    vadd.vi v8, v8, 1
; RV32-NEXT:    li a1, 57
; RV32-NEXT:    vse32.v v8, (s0)
; RV32-NEXT:    beq a0, a1, .LBB1_2
; RV32-NEXT:  .LBB1_1: # %do_call
; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
; RV32-NEXT:    call foo
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vle32.v v8, (s0)
; RV32-NEXT:    vadd.vi v8, v8, 1
; RV32-NEXT:    vse32.v v8, (s0)
; RV32-NEXT:    bnez a0, .LBB1_1
; RV32-NEXT:  .LBB1_2: # %exit
; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
; RV32-NEXT:    .cfi_restore ra
; RV32-NEXT:    .cfi_restore s0
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vecaddr_loop:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    .cfi_def_cfa_offset 16
; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
; RV64-NEXT:    .cfi_offset ra, -8
; RV64-NEXT:    .cfi_offset s0, -16
; RV64-NEXT:    addi s0, a1, 32
; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT:    vle32.v v8, (s0)
; RV64-NEXT:    vadd.vi v8, v8, 1
; RV64-NEXT:    li a1, 57
; RV64-NEXT:    vse32.v v8, (s0)
; RV64-NEXT:    beq a0, a1, .LBB1_2
; RV64-NEXT:  .LBB1_1: # %do_call
; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
; RV64-NEXT:    call foo
; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT:    vle32.v v8, (s0)
; RV64-NEXT:    vadd.vi v8, v8, 1
; RV64-NEXT:    vse32.v v8, (s0)
; RV64-NEXT:    bnez a0, .LBB1_1
; RV64-NEXT:  .LBB1_2: # %exit
; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
; RV64-NEXT:    .cfi_restore ra
; RV64-NEXT:    .cfi_restore s0
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %gep = getelementptr i8, ptr %p, i32 32
  %v1 = load <4 x i32>, ptr %gep
  %v2 = add <4 x i32> %v1, splat (i32 1)
  store <4 x i32> %v2, ptr %gep
  %cmp0 = icmp eq i32 %a, 57
  br i1 %cmp0, label %exit, label %do_call
do_call:
  %b = call i32 @foo()
  %v3 = load <4 x i32>, ptr %gep
  %v4 = add <4 x i32> %v3, splat (i32 1)
  store <4 x i32> %v4, ptr %gep

  %cmp1 = icmp eq i32 %b, 0
  br i1 %cmp1, label %exit, label %do_call
exit:
  ret void
}

declare zeroext i32 @foo()