; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=riscv32 -mattr=+m \
; RUN:   | FileCheck %s --check-prefixes=RV32-BOTH,RV32
; RUN: llc < %s -mtriple=riscv64 -mattr=+m \
; RUN:   | FileCheck %s --check-prefixes=RV64-BOTH,RV64
; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+unaligned-scalar-mem \
; RUN:   | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+unaligned-scalar-mem \
; RUN:   | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST

; TODO: Due to the initial naive lowering implementation of memset.pattern in
; PreISelIntrinsicLowering, the generated code is not good.
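
; The tests below call @llvm.experimental.memset.pattern, which writes <count>
; copies of the pattern operand to successive pattern-sized slots starting at
; the destination pointer. As a rough sketch, the overloaded declaration looks
; like the following (parameter attributes omitted; the LangRef has the
; authoritative signature):
;   declare void @llvm.experimental.memset.pattern.p0.i128.i64(ptr, i128, i64, i1 immarg)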

define void @memset_1(ptr %a, i128 %value) nounwind {
; RV32-BOTH-LABEL: memset_1:
; RV32-BOTH:       # %bb.0: # %loadstoreloop.preheader
; RV32-BOTH-NEXT:    li a2, 0
; RV32-BOTH-NEXT:    lw a3, 0(a1)
; RV32-BOTH-NEXT:    lw a4, 4(a1)
; RV32-BOTH-NEXT:    lw a5, 8(a1)
; RV32-BOTH-NEXT:    lw a1, 12(a1)
; RV32-BOTH-NEXT:    li a6, 0
; RV32-BOTH-NEXT:  .LBB0_1: # %loadstoreloop
; RV32-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
; RV32-BOTH-NEXT:    slli a7, a2, 4
; RV32-BOTH-NEXT:    addi a2, a2, 1
; RV32-BOTH-NEXT:    add a7, a0, a7
; RV32-BOTH-NEXT:    seqz t0, a2
; RV32-BOTH-NEXT:    add a6, a6, t0
; RV32-BOTH-NEXT:    or t0, a2, a6
; RV32-BOTH-NEXT:    sw a3, 0(a7)
; RV32-BOTH-NEXT:    sw a4, 4(a7)
; RV32-BOTH-NEXT:    sw a5, 8(a7)
; RV32-BOTH-NEXT:    sw a1, 12(a7)
; RV32-BOTH-NEXT:    beqz t0, .LBB0_1
; RV32-BOTH-NEXT:  # %bb.2: # %split
; RV32-BOTH-NEXT:    ret
;
; RV64-BOTH-LABEL: memset_1:
; RV64-BOTH:       # %bb.0: # %loadstoreloop.preheader
; RV64-BOTH-NEXT:    addi a3, a0, 16
; RV64-BOTH-NEXT:  .LBB0_1: # %loadstoreloop
; RV64-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
; RV64-BOTH-NEXT:    sd a1, 0(a0)
; RV64-BOTH-NEXT:    sd a2, 8(a0)
; RV64-BOTH-NEXT:    addi a0, a0, 16
; RV64-BOTH-NEXT:    bne a0, a3, .LBB0_1
; RV64-BOTH-NEXT:  # %bb.2: # %split
; RV64-BOTH-NEXT:    ret
  tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 1, i1 0)
  ret void
}
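
; memset_1_noalign drops the align attribute from the destination pointer, so
; the lowering must assume a 1-byte-aligned destination: the plain RV32/RV64
; runs split the pattern with srli and store it byte by byte, while the
; RV32-FAST/RV64-FAST runs (+unaligned-scalar-mem) keep the word/doubleword
; stores used for the aligned case above.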
define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
; RV32-LABEL: memset_1_noalign:
; RV32:       # %bb.0: # %loadstoreloop.preheader
; RV32-NEXT:    addi sp, sp, -32
; RV32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
; RV32-NEXT:    li a2, 0
; RV32-NEXT:    li a3, 0
; RV32-NEXT:    lw a4, 4(a1)
; RV32-NEXT:    lw a5, 0(a1)
; RV32-NEXT:    lw a6, 8(a1)
; RV32-NEXT:    lw a1, 12(a1)
; RV32-NEXT:    srli a7, a4, 24
; RV32-NEXT:    srli t0, a4, 16
; RV32-NEXT:    srli t1, a4, 8
; RV32-NEXT:    srli t2, a5, 24
; RV32-NEXT:    srli t3, a5, 16
; RV32-NEXT:    srli t4, a5, 8
; RV32-NEXT:    srli t5, a6, 24
; RV32-NEXT:    srli t6, a6, 16
; RV32-NEXT:    srli s0, a6, 8
; RV32-NEXT:    srli s1, a1, 24
; RV32-NEXT:    srli s2, a1, 16
; RV32-NEXT:    srli s3, a1, 8
; RV32-NEXT:  .LBB1_1: # %loadstoreloop
; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
; RV32-NEXT:    slli s4, a2, 4
; RV32-NEXT:    addi a2, a2, 1
; RV32-NEXT:    add s4, a0, s4
; RV32-NEXT:    seqz s5, a2
; RV32-NEXT:    sb a4, 4(s4)
; RV32-NEXT:    sb t1, 5(s4)
; RV32-NEXT:    sb t0, 6(s4)
; RV32-NEXT:    sb a7, 7(s4)
; RV32-NEXT:    sb a5, 0(s4)
; RV32-NEXT:    sb t4, 1(s4)
; RV32-NEXT:    sb t3, 2(s4)
; RV32-NEXT:    sb t2, 3(s4)
; RV32-NEXT:    sb a6, 8(s4)
; RV32-NEXT:    sb s0, 9(s4)
; RV32-NEXT:    sb t6, 10(s4)
; RV32-NEXT:    sb t5, 11(s4)
; RV32-NEXT:    add a3, a3, s5
; RV32-NEXT:    or s5, a2, a3
; RV32-NEXT:    sb a1, 12(s4)
; RV32-NEXT:    sb s3, 13(s4)
; RV32-NEXT:    sb s2, 14(s4)
; RV32-NEXT:    sb s1, 15(s4)
; RV32-NEXT:    beqz s5, .LBB1_1
; RV32-NEXT:  # %bb.2: # %split
; RV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
; RV32-NEXT:    addi sp, sp, 32
; RV32-NEXT:    ret
;
; RV64-LABEL: memset_1_noalign:
; RV64:       # %bb.0: # %loadstoreloop.preheader
; RV64-NEXT:    addi sp, sp, -32
; RV64-NEXT:    sd s0, 24(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s1, 16(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s2, 8(sp) # 8-byte Folded Spill
; RV64-NEXT:    addi a3, a0, 16
; RV64-NEXT:    srli a4, a1, 56
; RV64-NEXT:    srli a5, a1, 48
; RV64-NEXT:    srli a6, a1, 40
; RV64-NEXT:    srli a7, a1, 32
; RV64-NEXT:    srli t0, a1, 24
; RV64-NEXT:    srli t1, a1, 16
; RV64-NEXT:    srli t2, a1, 8
; RV64-NEXT:    srli t3, a2, 56
; RV64-NEXT:    srli t4, a2, 48
; RV64-NEXT:    srli t5, a2, 40
; RV64-NEXT:    srli t6, a2, 32
; RV64-NEXT:    srli s0, a2, 24
; RV64-NEXT:    srli s1, a2, 16
; RV64-NEXT:    srli s2, a2, 8
; RV64-NEXT:  .LBB1_1: # %loadstoreloop
; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
; RV64-NEXT:    sb a7, 4(a0)
; RV64-NEXT:    sb a6, 5(a0)
; RV64-NEXT:    sb a5, 6(a0)
; RV64-NEXT:    sb a4, 7(a0)
; RV64-NEXT:    sb a1, 0(a0)
; RV64-NEXT:    sb t2, 1(a0)
; RV64-NEXT:    sb t1, 2(a0)
; RV64-NEXT:    sb t0, 3(a0)
; RV64-NEXT:    sb t6, 12(a0)
; RV64-NEXT:    sb t5, 13(a0)
; RV64-NEXT:    sb t4, 14(a0)
; RV64-NEXT:    sb t3, 15(a0)
; RV64-NEXT:    sb a2, 8(a0)
; RV64-NEXT:    sb s2, 9(a0)
; RV64-NEXT:    sb s1, 10(a0)
; RV64-NEXT:    sb s0, 11(a0)
; RV64-NEXT:    addi a0, a0, 16
; RV64-NEXT:    bne a0, a3, .LBB1_1
; RV64-NEXT:  # %bb.2: # %split
; RV64-NEXT:    ld s0, 24(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s1, 16(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s2, 8(sp) # 8-byte Folded Reload
; RV64-NEXT:    addi sp, sp, 32
; RV64-NEXT:    ret
;
; RV32-FAST-LABEL: memset_1_noalign:
; RV32-FAST:       # %bb.0: # %loadstoreloop.preheader
; RV32-FAST-NEXT:    li a2, 0
; RV32-FAST-NEXT:    lw a3, 0(a1)
; RV32-FAST-NEXT:    lw a4, 4(a1)
; RV32-FAST-NEXT:    lw a5, 8(a1)
; RV32-FAST-NEXT:    lw a1, 12(a1)
; RV32-FAST-NEXT:    li a6, 0
; RV32-FAST-NEXT:  .LBB1_1: # %loadstoreloop
; RV32-FAST-NEXT:    # =>This Inner Loop Header: Depth=1
; RV32-FAST-NEXT:    slli a7, a2, 4
; RV32-FAST-NEXT:    addi a2, a2, 1
; RV32-FAST-NEXT:    add a7, a0, a7
; RV32-FAST-NEXT:    seqz t0, a2
; RV32-FAST-NEXT:    add a6, a6, t0
; RV32-FAST-NEXT:    or t0, a2, a6
; RV32-FAST-NEXT:    sw a3, 0(a7)
; RV32-FAST-NEXT:    sw a4, 4(a7)
; RV32-FAST-NEXT:    sw a5, 8(a7)
; RV32-FAST-NEXT:    sw a1, 12(a7)
; RV32-FAST-NEXT:    beqz t0, .LBB1_1
; RV32-FAST-NEXT:  # %bb.2: # %split
; RV32-FAST-NEXT:    ret
;
; RV64-FAST-LABEL: memset_1_noalign:
; RV64-FAST:       # %bb.0: # %loadstoreloop.preheader
; RV64-FAST-NEXT:    addi a3, a0, 16
; RV64-FAST-NEXT:  .LBB1_1: # %loadstoreloop
; RV64-FAST-NEXT:    # =>This Inner Loop Header: Depth=1
; RV64-FAST-NEXT:    sd a1, 0(a0)
; RV64-FAST-NEXT:    sd a2, 8(a0)
; RV64-FAST-NEXT:    addi a0, a0, 16
; RV64-FAST-NEXT:    bne a0, a3, .LBB1_1
; RV64-FAST-NEXT:  # %bb.2: # %split
; RV64-FAST-NEXT:    ret
  tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 1, i1 0)
  ret void
}

define void @memset_4(ptr %a, i128 %value) nounwind {
; RV32-BOTH-LABEL: memset_4:
; RV32-BOTH:       # %bb.0: # %loadstoreloop.preheader
; RV32-BOTH-NEXT:    li a2, 0
; RV32-BOTH-NEXT:    lw a3, 0(a1)
; RV32-BOTH-NEXT:    lw a4, 4(a1)
; RV32-BOTH-NEXT:    lw a5, 8(a1)
; RV32-BOTH-NEXT:    lw a1, 12(a1)
; RV32-BOTH-NEXT:    li a6, 0
; RV32-BOTH-NEXT:  .LBB2_1: # %loadstoreloop
; RV32-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
; RV32-BOTH-NEXT:    slli a7, a2, 4
; RV32-BOTH-NEXT:    addi a2, a2, 1
; RV32-BOTH-NEXT:    seqz t0, a2
; RV32-BOTH-NEXT:    sltiu t1, a2, 4
; RV32-BOTH-NEXT:    add a6, a6, t0
; RV32-BOTH-NEXT:    seqz t0, a6
; RV32-BOTH-NEXT:    and t0, t0, t1
; RV32-BOTH-NEXT:    add a7, a0, a7
; RV32-BOTH-NEXT:    sw a3, 0(a7)
; RV32-BOTH-NEXT:    sw a4, 4(a7)
; RV32-BOTH-NEXT:    sw a5, 8(a7)
; RV32-BOTH-NEXT:    sw a1, 12(a7)
; RV32-BOTH-NEXT:    bnez t0, .LBB2_1
; RV32-BOTH-NEXT:  # %bb.2: # %split
; RV32-BOTH-NEXT:    ret
;
; RV64-BOTH-LABEL: memset_4:
; RV64-BOTH:       # %bb.0: # %loadstoreloop.preheader
; RV64-BOTH-NEXT:    addi a3, a0, 64
; RV64-BOTH-NEXT:  .LBB2_1: # %loadstoreloop
; RV64-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
; RV64-BOTH-NEXT:    sd a1, 0(a0)
; RV64-BOTH-NEXT:    sd a2, 8(a0)
; RV64-BOTH-NEXT:    addi a0, a0, 16
; RV64-BOTH-NEXT:    bne a0, a3, .LBB2_1
; RV64-BOTH-NEXT:  # %bb.2: # %split
; RV64-BOTH-NEXT:    ret
  tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 4, i1 0)
  ret void
}
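
; memset_x passes the store count as a runtime value rather than a constant,
; so the lowering additionally guards against a zero trip count before
; entering the loop; on RV32 the i64 count lives in the a2/a3 register pair
; and the 64-bit loop-exit compare is decomposed across several branches.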
define void @memset_x(ptr %a, i128 %value, i64 %x) nounwind {
; RV32-BOTH-LABEL: memset_x:
; RV32-BOTH:       # %bb.0:
; RV32-BOTH-NEXT:    or a4, a2, a3
; RV32-BOTH-NEXT:    beqz a4, .LBB3_5
; RV32-BOTH-NEXT:  # %bb.1: # %loadstoreloop.preheader
; RV32-BOTH-NEXT:    li a4, 0
; RV32-BOTH-NEXT:    lw a5, 0(a1)
; RV32-BOTH-NEXT:    lw a6, 4(a1)
; RV32-BOTH-NEXT:    lw a7, 8(a1)
; RV32-BOTH-NEXT:    lw a1, 12(a1)
; RV32-BOTH-NEXT:    li t0, 0
; RV32-BOTH-NEXT:    j .LBB3_3
; RV32-BOTH-NEXT:  .LBB3_2: # %loadstoreloop
; RV32-BOTH-NEXT:    # in Loop: Header=BB3_3 Depth=1
; RV32-BOTH-NEXT:    sltu t1, t0, a3
; RV32-BOTH-NEXT:    beqz t1, .LBB3_5
; RV32-BOTH-NEXT:  .LBB3_3: # %loadstoreloop
; RV32-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
; RV32-BOTH-NEXT:    slli t1, a4, 4
; RV32-BOTH-NEXT:    addi a4, a4, 1
; RV32-BOTH-NEXT:    seqz t2, a4
; RV32-BOTH-NEXT:    add t0, t0, t2
; RV32-BOTH-NEXT:    add t1, a0, t1
; RV32-BOTH-NEXT:    sw a5, 0(t1)
; RV32-BOTH-NEXT:    sw a6, 4(t1)
; RV32-BOTH-NEXT:    sw a7, 8(t1)
; RV32-BOTH-NEXT:    sw a1, 12(t1)
; RV32-BOTH-NEXT:    bne t0, a3, .LBB3_2
; RV32-BOTH-NEXT:  # %bb.4: # in Loop: Header=BB3_3 Depth=1
; RV32-BOTH-NEXT:    sltu t1, a4, a2
; RV32-BOTH-NEXT:    bnez t1, .LBB3_3
; RV32-BOTH-NEXT:  .LBB3_5: # %split
; RV32-BOTH-NEXT:    ret
;
; RV64-BOTH-LABEL: memset_x:
; RV64-BOTH:       # %bb.0:
; RV64-BOTH-NEXT:    beqz a3, .LBB3_3
; RV64-BOTH-NEXT:  # %bb.1: # %loadstoreloop.preheader
; RV64-BOTH-NEXT:    li a4, 0
; RV64-BOTH-NEXT:  .LBB3_2: # %loadstoreloop
; RV64-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
; RV64-BOTH-NEXT:    sd a1, 0(a0)
; RV64-BOTH-NEXT:    sd a2, 8(a0)
; RV64-BOTH-NEXT:    addi a4, a4, 1
; RV64-BOTH-NEXT:    addi a0, a0, 16
; RV64-BOTH-NEXT:    bltu a4, a3, .LBB3_2
; RV64-BOTH-NEXT:  .LBB3_3: # %split
; RV64-BOTH-NEXT:    ret
  tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 %x, i1 0)
  ret void
}