; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2

; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+m \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV32
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+m -mcpu=sifive-p670 \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV64P670
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+m -mcpu=spacemit-x60 \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV64X60
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+m \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV64


; test1
;
; Codegen test for an already-vectorized (scalable, <vscale x 16 x i8>) rounded
; byte-average kernel: for each of %i_height rows and %i_width columns,
;   dst[x] = (src1[x] + src2[x] + 1) >> 1
; (the +1 / lshr-by-1 chain in the scalar IR below; the vector body is expected
; to lower to vaaddu.vv with vxrm set to round-to-nearest-up).  The preheader
; computes runtime guards — pointer-overlap checks between dst and each source
; row range, plus a negative-stride check (or of the strides tested slt 0) and
; a minimum-iteration-count check — and each row dispatches to either the
; vector body or the scalar fallback loop based on %brmerge.
;
; Checks cover generic RV32/RV64 scheduling plus the sifive-p670 and
; spacemit-x60 scheduling models.  Do not edit the CHECK lines by hand;
; regenerate them with utils/update_llc_test_checks.py.
define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_dst_stride, ptr nocapture noundef readonly %src1, i32 noundef signext %i_src1_stride, ptr nocapture noundef readonly %src2, i32 noundef signext %i_src2_stride, i32 noundef signext %i_width, i32 noundef signext %i_height) {
; RV32-LABEL: test1:
; RV32:       # %bb.0: # %entry
; RV32-NEXT:    blez a7, .LBB0_17
; RV32-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
; RV32-NEXT:    blez a6, .LBB0_17
; RV32-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    addi t0, a7, -1
; RV32-NEXT:    csrr t2, vlenb
; RV32-NEXT:    mul t3, a1, t0
; RV32-NEXT:    mul t4, a3, t0
; RV32-NEXT:    mul t5, a5, t0
; RV32-NEXT:    slli t1, t2, 1
; RV32-NEXT:    li t6, 32
; RV32-NEXT:    mv t0, t1
; RV32-NEXT:    bnez zero, .LBB0_4
; RV32-NEXT:  # %bb.3: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    li t0, 32
; RV32-NEXT:  .LBB0_4: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
; RV32-NEXT:    .cfi_offset s0, -4
; RV32-NEXT:    .cfi_offset s1, -8
; RV32-NEXT:    .cfi_offset s2, -12
; RV32-NEXT:    add t3, a0, t3
; RV32-NEXT:    add t4, a2, t4
; RV32-NEXT:    add s0, a4, t5
; RV32-NEXT:    bltu t6, t1, .LBB0_6
; RV32-NEXT:  # %bb.5: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    li t1, 32
; RV32-NEXT:  .LBB0_6: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    add t3, t3, a6
; RV32-NEXT:    add t5, t4, a6
; RV32-NEXT:    add t4, s0, a6
; RV32-NEXT:    beqz zero, .LBB0_8
; RV32-NEXT:  # %bb.7: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    mv t1, t0
; RV32-NEXT:  .LBB0_8: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    li t0, 0
; RV32-NEXT:    sltu t5, a0, t5
; RV32-NEXT:    sltu t6, a2, t3
; RV32-NEXT:    and t5, t5, t6
; RV32-NEXT:    sltu t4, a0, t4
; RV32-NEXT:    sltu t3, a4, t3
; RV32-NEXT:    and t3, t4, t3
; RV32-NEXT:    or t4, a1, a3
; RV32-NEXT:    slti t4, t4, 0
; RV32-NEXT:    or t4, t5, t4
; RV32-NEXT:    or t5, a1, a5
; RV32-NEXT:    sltu t1, a6, t1
; RV32-NEXT:    slti t5, t5, 0
; RV32-NEXT:    or t3, t3, t5
; RV32-NEXT:    or t3, t4, t3
; RV32-NEXT:    or t1, t1, t3
; RV32-NEXT:    andi t1, t1, 1
; RV32-NEXT:    slli t2, t2, 1
; RV32-NEXT:    j .LBB0_10
; RV32-NEXT:  .LBB0_9: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV32-NEXT:    # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT:    add a0, a0, a1
; RV32-NEXT:    add a2, a2, a3
; RV32-NEXT:    addi t0, t0, 1
; RV32-NEXT:    add a4, a4, a5
; RV32-NEXT:    beq t0, a7, .LBB0_16
; RV32-NEXT:  .LBB0_10: # %for.cond1.preheader.us
; RV32-NEXT:    # =>This Loop Header: Depth=1
; RV32-NEXT:    # Child Loop BB0_13 Depth 2
; RV32-NEXT:    # Child Loop BB0_15 Depth 2
; RV32-NEXT:    beqz t1, .LBB0_12
; RV32-NEXT:  # %bb.11: # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT:    li t4, 0
; RV32-NEXT:    li t3, 0
; RV32-NEXT:    j .LBB0_15
; RV32-NEXT:  .LBB0_12: # %vector.ph
; RV32-NEXT:    # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT:    li t3, 0
; RV32-NEXT:    neg t4, t2
; RV32-NEXT:    and t4, t4, a6
; RV32-NEXT:    csrwi vxrm, 0
; RV32-NEXT:    li t6, 0
; RV32-NEXT:    li t5, 0
; RV32-NEXT:    vsetvli s0, zero, e8, m2, ta, ma
; RV32-NEXT:  .LBB0_13: # %vector.body
; RV32-NEXT:    # Parent Loop BB0_10 Depth=1
; RV32-NEXT:    # => This Inner Loop Header: Depth=2
; RV32-NEXT:    add s0, a2, t6
; RV32-NEXT:    add s1, a4, t6
; RV32-NEXT:    vl2r.v v8, (s0)
; RV32-NEXT:    add s0, a0, t6
; RV32-NEXT:    vl2r.v v10, (s1)
; RV32-NEXT:    add s1, t6, t2
; RV32-NEXT:    sltu t6, s1, t6
; RV32-NEXT:    add t5, t5, t6
; RV32-NEXT:    xor t6, s1, t4
; RV32-NEXT:    vaaddu.vv v8, v8, v10
; RV32-NEXT:    or s2, t6, t5
; RV32-NEXT:    vs2r.v v8, (s0)
; RV32-NEXT:    mv t6, s1
; RV32-NEXT:    bnez s2, .LBB0_13
; RV32-NEXT:  # %bb.14: # %middle.block
; RV32-NEXT:    # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT:    beq t4, a6, .LBB0_9
; RV32-NEXT:  .LBB0_15: # %for.body4.us
; RV32-NEXT:    # Parent Loop BB0_10 Depth=1
; RV32-NEXT:    # => This Inner Loop Header: Depth=2
; RV32-NEXT:    add t5, a2, t4
; RV32-NEXT:    add t6, a4, t4
; RV32-NEXT:    add s0, a0, t4
; RV32-NEXT:    lbu t5, 0(t5)
; RV32-NEXT:    lbu t6, 0(t6)
; RV32-NEXT:    addi t4, t4, 1
; RV32-NEXT:    seqz s1, t4
; RV32-NEXT:    add t3, t3, s1
; RV32-NEXT:    add t5, t5, t6
; RV32-NEXT:    xor t6, t4, a6
; RV32-NEXT:    addi t5, t5, 1
; RV32-NEXT:    srli t5, t5, 1
; RV32-NEXT:    or t6, t6, t3
; RV32-NEXT:    sb t5, 0(s0)
; RV32-NEXT:    bnez t6, .LBB0_15
; RV32-NEXT:    j .LBB0_9
; RV32-NEXT:  .LBB0_16:
; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
; RV32-NEXT:    .cfi_restore s0
; RV32-NEXT:    .cfi_restore s1
; RV32-NEXT:    .cfi_restore s2
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:  .LBB0_17: # %for.cond.cleanup
; RV32-NEXT:    ret
;
; RV64P670-LABEL: test1:
; RV64P670:       # %bb.0: # %entry
; RV64P670-NEXT:    csrwi vxrm, 0
; RV64P670-NEXT:    blez a7, .LBB0_12
; RV64P670-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
; RV64P670-NEXT:    blez a6, .LBB0_12
; RV64P670-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
; RV64P670-NEXT:    addi sp, sp, -48
; RV64P670-NEXT:    .cfi_def_cfa_offset 48
; RV64P670-NEXT:    sd s0, 40(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    sd s1, 32(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    sd s2, 24(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    sd s3, 16(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    .cfi_offset s0, -8
; RV64P670-NEXT:    .cfi_offset s1, -16
; RV64P670-NEXT:    .cfi_offset s2, -24
; RV64P670-NEXT:    .cfi_offset s3, -32
; RV64P670-NEXT:    .cfi_offset s4, -40
; RV64P670-NEXT:    addi s1, a7, -1
; RV64P670-NEXT:    add s0, a0, a6
; RV64P670-NEXT:    li t0, 0
; RV64P670-NEXT:    li t1, 0
; RV64P670-NEXT:    zext.w s1, s1
; RV64P670-NEXT:    mul t2, a1, s1
; RV64P670-NEXT:    add t4, s0, t2
; RV64P670-NEXT:    mul t2, a3, s1
; RV64P670-NEXT:    add s0, a2, a6
; RV64P670-NEXT:    mul s1, a5, s1
; RV64P670-NEXT:    add t3, s0, t2
; RV64P670-NEXT:    add s0, a4, a6
; RV64P670-NEXT:    csrr t2, vlenb
; RV64P670-NEXT:    add t5, s0, s1
; RV64P670-NEXT:    sltu s1, a0, t3
; RV64P670-NEXT:    sltu s0, a2, t4
; RV64P670-NEXT:    slli t3, t2, 1
; RV64P670-NEXT:    and s0, s0, s1
; RV64P670-NEXT:    or s1, a1, a3
; RV64P670-NEXT:    slti s1, s1, 0
; RV64P670-NEXT:    or t6, s0, s1
; RV64P670-NEXT:    sltu s1, a0, t5
; RV64P670-NEXT:    sltu s0, a4, t4
; RV64P670-NEXT:    mv t5, a0
; RV64P670-NEXT:    and s0, s0, s1
; RV64P670-NEXT:    or s1, a1, a5
; RV64P670-NEXT:    slti s1, s1, 0
; RV64P670-NEXT:    or s0, s0, s1
; RV64P670-NEXT:    li s1, 32
; RV64P670-NEXT:    maxu s1, t3, s1
; RV64P670-NEXT:    or s0, t6, s0
; RV64P670-NEXT:    sltu s1, a6, s1
; RV64P670-NEXT:    or s0, s0, s1
; RV64P670-NEXT:    andi t4, s0, 1
; RV64P670-NEXT:    j .LBB0_4
; RV64P670-NEXT:  .LBB0_3: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV64P670-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    add t5, t5, a1
; RV64P670-NEXT:    add a2, a2, a3
; RV64P670-NEXT:    add a4, a4, a5
; RV64P670-NEXT:    addiw t1, t1, 1
; RV64P670-NEXT:    addi t0, t0, 1
; RV64P670-NEXT:    beq t1, a7, .LBB0_11
; RV64P670-NEXT:  .LBB0_4: # %for.cond1.preheader.us
; RV64P670-NEXT:    # =>This Loop Header: Depth=1
; RV64P670-NEXT:    # Child Loop BB0_7 Depth 2
; RV64P670-NEXT:    # Child Loop BB0_10 Depth 2
; RV64P670-NEXT:    beqz t4, .LBB0_6
; RV64P670-NEXT:  # %bb.5: # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    li t6, 0
; RV64P670-NEXT:    j .LBB0_9
; RV64P670-NEXT:  .LBB0_6: # %vector.ph
; RV64P670-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    slli s1, t2, 28
; RV64P670-NEXT:    mv s2, a2
; RV64P670-NEXT:    mv s3, a4
; RV64P670-NEXT:    mv s4, t5
; RV64P670-NEXT:    sub s1, s1, t3
; RV64P670-NEXT:    vsetvli s0, zero, e8, m2, ta, ma
; RV64P670-NEXT:    and t6, s1, a6
; RV64P670-NEXT:    mv s1, t6
; RV64P670-NEXT:  .LBB0_7: # %vector.body
; RV64P670-NEXT:    # Parent Loop BB0_4 Depth=1
; RV64P670-NEXT:    # => This Inner Loop Header: Depth=2
; RV64P670-NEXT:    vl2r.v v8, (s2)
; RV64P670-NEXT:    sub s1, s1, t3
; RV64P670-NEXT:    add s2, s2, t3
; RV64P670-NEXT:    vl2r.v v10, (s3)
; RV64P670-NEXT:    add s3, s3, t3
; RV64P670-NEXT:    vaaddu.vv v8, v8, v10
; RV64P670-NEXT:    vs2r.v v8, (s4)
; RV64P670-NEXT:    add s4, s4, t3
; RV64P670-NEXT:    bnez s1, .LBB0_7
; RV64P670-NEXT:  # %bb.8: # %middle.block
; RV64P670-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    beq t6, a6, .LBB0_3
; RV64P670-NEXT:  .LBB0_9: # %for.body4.us.preheader
; RV64P670-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    mul s2, a1, t0
; RV64P670-NEXT:    add s0, a0, a6
; RV64P670-NEXT:    add s1, t5, t6
; RV64P670-NEXT:    add s4, a4, t6
; RV64P670-NEXT:    add t6, t6, a2
; RV64P670-NEXT:    add s2, s2, s0
; RV64P670-NEXT:  .LBB0_10: # %for.body4.us
; RV64P670-NEXT:    # Parent Loop BB0_4 Depth=1
; RV64P670-NEXT:    # => This Inner Loop Header: Depth=2
; RV64P670-NEXT:    lbu s3, 0(t6)
; RV64P670-NEXT:    lbu s0, 0(s4)
; RV64P670-NEXT:    addi s4, s4, 1
; RV64P670-NEXT:    addi t6, t6, 1
; RV64P670-NEXT:    add s0, s0, s3
; RV64P670-NEXT:    addi s0, s0, 1
; RV64P670-NEXT:    srli s0, s0, 1
; RV64P670-NEXT:    sb s0, 0(s1)
; RV64P670-NEXT:    addi s1, s1, 1
; RV64P670-NEXT:    bne s1, s2, .LBB0_10
; RV64P670-NEXT:    j .LBB0_3
; RV64P670-NEXT:  .LBB0_11:
; RV64P670-NEXT:    ld s0, 40(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    ld s2, 24(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    ld s3, 16(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    ld s4, 8(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    .cfi_restore s0
; RV64P670-NEXT:    .cfi_restore s1
; RV64P670-NEXT:    .cfi_restore s2
; RV64P670-NEXT:    .cfi_restore s3
; RV64P670-NEXT:    .cfi_restore s4
; RV64P670-NEXT:    addi sp, sp, 48
; RV64P670-NEXT:    .cfi_def_cfa_offset 0
; RV64P670-NEXT:  .LBB0_12: # %for.cond.cleanup
; RV64P670-NEXT:    ret
;
; RV64X60-LABEL: test1:
; RV64X60:       # %bb.0: # %entry
; RV64X60-NEXT:    csrwi vxrm, 0
; RV64X60-NEXT:    blez a7, .LBB0_12
; RV64X60-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
; RV64X60-NEXT:    blez a6, .LBB0_12
; RV64X60-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
; RV64X60-NEXT:    addi sp, sp, -48
; RV64X60-NEXT:    .cfi_def_cfa_offset 48
; RV64X60-NEXT:    sd s0, 40(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    sd s1, 32(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    sd s2, 24(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    sd s3, 16(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    .cfi_offset s0, -8
; RV64X60-NEXT:    .cfi_offset s1, -16
; RV64X60-NEXT:    .cfi_offset s2, -24
; RV64X60-NEXT:    .cfi_offset s3, -32
; RV64X60-NEXT:    .cfi_offset s4, -40
; RV64X60-NEXT:    li t0, 0
; RV64X60-NEXT:    li t1, 0
; RV64X60-NEXT:    addi t2, a7, -1
; RV64X60-NEXT:    add t4, a0, a6
; RV64X60-NEXT:    add t5, a2, a6
; RV64X60-NEXT:    add t3, a4, a6
; RV64X60-NEXT:    zext.w s0, t2
; RV64X60-NEXT:    mul s1, a1, s0
; RV64X60-NEXT:    add t4, t4, s1
; RV64X60-NEXT:    mul s1, a3, s0
; RV64X60-NEXT:    add t5, t5, s1
; RV64X60-NEXT:    csrr t2, vlenb
; RV64X60-NEXT:    mul s1, a5, s0
; RV64X60-NEXT:    add t3, t3, s1
; RV64X60-NEXT:    sltu s1, a0, t5
; RV64X60-NEXT:    sltu s0, a2, t4
; RV64X60-NEXT:    and t6, s1, s0
; RV64X60-NEXT:    li t5, 32
; RV64X60-NEXT:    sltu s1, a0, t3
; RV64X60-NEXT:    sltu s0, a4, t4
; RV64X60-NEXT:    and t3, s1, s0
; RV64X60-NEXT:    or s1, a1, a3
; RV64X60-NEXT:    slti s1, s1, 0
; RV64X60-NEXT:    or t4, t6, s1
; RV64X60-NEXT:    or s0, a1, a5
; RV64X60-NEXT:    slti s0, s0, 0
; RV64X60-NEXT:    or s0, t3, s0
; RV64X60-NEXT:    slli t3, t2, 1
; RV64X60-NEXT:    maxu s1, t3, t5
; RV64X60-NEXT:    or s0, t4, s0
; RV64X60-NEXT:    sltu s1, a6, s1
; RV64X60-NEXT:    or s0, s0, s1
; RV64X60-NEXT:    andi t4, s0, 1
; RV64X60-NEXT:    mv t5, a0
; RV64X60-NEXT:    j .LBB0_4
; RV64X60-NEXT:  .LBB0_3: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    add t5, t5, a1
; RV64X60-NEXT:    add a2, a2, a3
; RV64X60-NEXT:    add a4, a4, a5
; RV64X60-NEXT:    addiw t1, t1, 1
; RV64X60-NEXT:    addi t0, t0, 1
; RV64X60-NEXT:    beq t1, a7, .LBB0_11
; RV64X60-NEXT:  .LBB0_4: # %for.cond1.preheader.us
; RV64X60-NEXT:    # =>This Loop Header: Depth=1
; RV64X60-NEXT:    # Child Loop BB0_7 Depth 2
; RV64X60-NEXT:    # Child Loop BB0_10 Depth 2
; RV64X60-NEXT:    beqz t4, .LBB0_6
; RV64X60-NEXT:  # %bb.5: # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    li t6, 0
; RV64X60-NEXT:    j .LBB0_9
; RV64X60-NEXT:  .LBB0_6: # %vector.ph
; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    slli s1, t2, 28
; RV64X60-NEXT:    sub s1, s1, t3
; RV64X60-NEXT:    and t6, s1, a6
; RV64X60-NEXT:    mv s2, a2
; RV64X60-NEXT:    mv s3, a4
; RV64X60-NEXT:    mv s4, t5
; RV64X60-NEXT:    mv s1, t6
; RV64X60-NEXT:    vsetvli s0, zero, e8, m2, ta, ma
; RV64X60-NEXT:  .LBB0_7: # %vector.body
; RV64X60-NEXT:    # Parent Loop BB0_4 Depth=1
; RV64X60-NEXT:    # => This Inner Loop Header: Depth=2
; RV64X60-NEXT:    vl2r.v v8, (s2)
; RV64X60-NEXT:    vl2r.v v10, (s3)
; RV64X60-NEXT:    sub s1, s1, t3
; RV64X60-NEXT:    add s3, s3, t3
; RV64X60-NEXT:    vaaddu.vv v8, v8, v10
; RV64X60-NEXT:    vs2r.v v8, (s4)
; RV64X60-NEXT:    add s4, s4, t3
; RV64X60-NEXT:    add s2, s2, t3
; RV64X60-NEXT:    bnez s1, .LBB0_7
; RV64X60-NEXT:  # %bb.8: # %middle.block
; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    beq t6, a6, .LBB0_3
; RV64X60-NEXT:  .LBB0_9: # %for.body4.us.preheader
; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    mul s2, a1, t0
; RV64X60-NEXT:    add s1, a0, a6
; RV64X60-NEXT:    add s0, t5, t6
; RV64X60-NEXT:    add s2, s2, s1
; RV64X60-NEXT:    add s4, a4, t6
; RV64X60-NEXT:    add t6, t6, a2
; RV64X60-NEXT:  .LBB0_10: # %for.body4.us
; RV64X60-NEXT:    # Parent Loop BB0_4 Depth=1
; RV64X60-NEXT:    # => This Inner Loop Header: Depth=2
; RV64X60-NEXT:    lbu s3, 0(t6)
; RV64X60-NEXT:    lbu s1, 0(s4)
; RV64X60-NEXT:    add s1, s1, s3
; RV64X60-NEXT:    addi s1, s1, 1
; RV64X60-NEXT:    srli s1, s1, 1
; RV64X60-NEXT:    sb s1, 0(s0)
; RV64X60-NEXT:    addi s0, s0, 1
; RV64X60-NEXT:    addi s4, s4, 1
; RV64X60-NEXT:    addi t6, t6, 1
; RV64X60-NEXT:    bne s0, s2, .LBB0_10
; RV64X60-NEXT:    j .LBB0_3
; RV64X60-NEXT:  .LBB0_11:
; RV64X60-NEXT:    ld s0, 40(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    ld s2, 24(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    ld s3, 16(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    ld s4, 8(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    .cfi_restore s0
; RV64X60-NEXT:    .cfi_restore s1
; RV64X60-NEXT:    .cfi_restore s2
; RV64X60-NEXT:    .cfi_restore s3
; RV64X60-NEXT:    .cfi_restore s4
; RV64X60-NEXT:    addi sp, sp, 48
; RV64X60-NEXT:    .cfi_def_cfa_offset 0
; RV64X60-NEXT:  .LBB0_12: # %for.cond.cleanup
; RV64X60-NEXT:    ret
;
; RV64-LABEL: test1:
; RV64:       # %bb.0: # %entry
; RV64-NEXT:    blez a7, .LBB0_14
; RV64-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
; RV64-NEXT:    blez a6, .LBB0_14
; RV64-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
; RV64-NEXT:    addi sp, sp, -48
; RV64-NEXT:    .cfi_def_cfa_offset 48
; RV64-NEXT:    sd s0, 40(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s1, 32(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s2, 24(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s3, 16(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
; RV64-NEXT:    .cfi_offset s0, -8
; RV64-NEXT:    .cfi_offset s1, -16
; RV64-NEXT:    .cfi_offset s2, -24
; RV64-NEXT:    .cfi_offset s3, -32
; RV64-NEXT:    .cfi_offset s4, -40
; RV64-NEXT:    addi t1, a7, -1
; RV64-NEXT:    add t5, a0, a6
; RV64-NEXT:    add s0, a2, a6
; RV64-NEXT:    add t6, a4, a6
; RV64-NEXT:    csrr t0, vlenb
; RV64-NEXT:    li t2, 32
; RV64-NEXT:    slli t1, t1, 32
; RV64-NEXT:    srli t3, t1, 32
; RV64-NEXT:    mul t1, a1, t3
; RV64-NEXT:    add t5, t5, t1
; RV64-NEXT:    mul t1, a3, t3
; RV64-NEXT:    add s0, s0, t1
; RV64-NEXT:    slli t1, t0, 1
; RV64-NEXT:    mul t3, a5, t3
; RV64-NEXT:    add t6, t6, t3
; RV64-NEXT:    mv t4, t1
; RV64-NEXT:    bltu t2, t1, .LBB0_4
; RV64-NEXT:  # %bb.3: # %for.cond1.preheader.us.preheader
; RV64-NEXT:    li t4, 32
; RV64-NEXT:  .LBB0_4: # %for.cond1.preheader.us.preheader
; RV64-NEXT:    li t2, 0
; RV64-NEXT:    li t3, 0
; RV64-NEXT:    sltu s0, a0, s0
; RV64-NEXT:    sltu s1, a2, t5
; RV64-NEXT:    and s0, s0, s1
; RV64-NEXT:    sltu t6, a0, t6
; RV64-NEXT:    sltu t5, a4, t5
; RV64-NEXT:    and t5, t6, t5
; RV64-NEXT:    or t6, a1, a3
; RV64-NEXT:    slti t6, t6, 0
; RV64-NEXT:    or t6, s0, t6
; RV64-NEXT:    or s0, a1, a5
; RV64-NEXT:    slti s0, s0, 0
; RV64-NEXT:    or t5, t5, s0
; RV64-NEXT:    or t5, t6, t5
; RV64-NEXT:    sltu t4, a6, t4
; RV64-NEXT:    or t4, t4, t5
; RV64-NEXT:    andi t4, t4, 1
; RV64-NEXT:    mv t5, a0
; RV64-NEXT:    j .LBB0_6
; RV64-NEXT:  .LBB0_5: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    add t5, t5, a1
; RV64-NEXT:    add a2, a2, a3
; RV64-NEXT:    add a4, a4, a5
; RV64-NEXT:    addiw t3, t3, 1
; RV64-NEXT:    addi t2, t2, 1
; RV64-NEXT:    beq t3, a7, .LBB0_13
; RV64-NEXT:  .LBB0_6: # %for.cond1.preheader.us
; RV64-NEXT:    # =>This Loop Header: Depth=1
; RV64-NEXT:    # Child Loop BB0_9 Depth 2
; RV64-NEXT:    # Child Loop BB0_12 Depth 2
; RV64-NEXT:    beqz t4, .LBB0_8
; RV64-NEXT:  # %bb.7: # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    li t6, 0
; RV64-NEXT:    j .LBB0_11
; RV64-NEXT:  .LBB0_8: # %vector.ph
; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    slli t6, t0, 28
; RV64-NEXT:    sub t6, t6, t1
; RV64-NEXT:    and t6, t6, a6
; RV64-NEXT:    csrwi vxrm, 0
; RV64-NEXT:    mv s0, a2
; RV64-NEXT:    mv s1, a4
; RV64-NEXT:    mv s2, t5
; RV64-NEXT:    mv s3, t6
; RV64-NEXT:    vsetvli s4, zero, e8, m2, ta, ma
; RV64-NEXT:  .LBB0_9: # %vector.body
; RV64-NEXT:    # Parent Loop BB0_6 Depth=1
; RV64-NEXT:    # => This Inner Loop Header: Depth=2
; RV64-NEXT:    vl2r.v v8, (s0)
; RV64-NEXT:    vl2r.v v10, (s1)
; RV64-NEXT:    sub s3, s3, t1
; RV64-NEXT:    add s1, s1, t1
; RV64-NEXT:    vaaddu.vv v8, v8, v10
; RV64-NEXT:    vs2r.v v8, (s2)
; RV64-NEXT:    add s2, s2, t1
; RV64-NEXT:    add s0, s0, t1
; RV64-NEXT:    bnez s3, .LBB0_9
; RV64-NEXT:  # %bb.10: # %middle.block
; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    beq t6, a6, .LBB0_5
; RV64-NEXT:  .LBB0_11: # %for.body4.us.preheader
; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    mul s1, a1, t2
; RV64-NEXT:    add s2, a0, a6
; RV64-NEXT:    add s0, t5, t6
; RV64-NEXT:    add s1, s2, s1
; RV64-NEXT:    add s2, a4, t6
; RV64-NEXT:    add t6, a2, t6
; RV64-NEXT:  .LBB0_12: # %for.body4.us
; RV64-NEXT:    # Parent Loop BB0_6 Depth=1
; RV64-NEXT:    # => This Inner Loop Header: Depth=2
; RV64-NEXT:    lbu s3, 0(t6)
; RV64-NEXT:    lbu s4, 0(s2)
; RV64-NEXT:    add s3, s3, s4
; RV64-NEXT:    addi s3, s3, 1
; RV64-NEXT:    srli s3, s3, 1
; RV64-NEXT:    sb s3, 0(s0)
; RV64-NEXT:    addi s0, s0, 1
; RV64-NEXT:    addi s2, s2, 1
; RV64-NEXT:    addi t6, t6, 1
; RV64-NEXT:    bne s0, s1, .LBB0_12
; RV64-NEXT:    j .LBB0_5
; RV64-NEXT:  .LBB0_13:
; RV64-NEXT:    ld s0, 40(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s2, 24(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s3, 16(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s4, 8(sp) # 8-byte Folded Reload
; RV64-NEXT:    .cfi_restore s0
; RV64-NEXT:    .cfi_restore s1
; RV64-NEXT:    .cfi_restore s2
; RV64-NEXT:    .cfi_restore s3
; RV64-NEXT:    .cfi_restore s4
; RV64-NEXT:    addi sp, sp, 48
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:  .LBB0_14: # %for.cond.cleanup
; RV64-NEXT:    ret
; Outer-loop guards: skip everything when i_height <= 0 or i_width <= 0.
entry:
  %cmp29 = icmp sgt i32 %i_height, 0
  br i1 %cmp29, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup

for.cond1.preheader.lr.ph:                        ; preds = %entry
  %cmp227 = icmp sgt i32 %i_width, 0
  %idx.ext = sext i32 %i_dst_stride to i64
  %idx.ext12 = sext i32 %i_src1_stride to i64
  %idx.ext14 = sext i32 %i_src2_stride to i64
  br i1 %cmp227, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup

; Computes the end of the last row touched through each pointer
; (base + stride*(i_height-1) + i_width), then the loop-invariant parts of the
; vectorizer's runtime guards: a minimum-trip-count check (umax(16*vscale, 32)
; vs. the row width), dst-vs-src overlap checks, and negative-stride checks
; (or of two strides tested slt 0).
for.cond1.preheader.us.preheader:                 ; preds = %for.cond1.preheader.lr.ph
  %wide.trip.count = zext nneg i32 %i_width to i64
  %0 = add nsw i32 %i_height, -1
  %1 = zext i32 %0 to i64
  %2 = mul nsw i64 %idx.ext, %1
  %3 = getelementptr i8, ptr %dst, i64 %2
  %scevgep = getelementptr i8, ptr %3, i64 %wide.trip.count
  %4 = mul nsw i64 %idx.ext12, %1
  %5 = getelementptr i8, ptr %src1, i64 %4
  %scevgep36 = getelementptr i8, ptr %5, i64 %wide.trip.count
  %6 = mul nsw i64 %idx.ext14, %1
  %7 = getelementptr i8, ptr %src2, i64 %6
  %scevgep37 = getelementptr i8, ptr %7, i64 %wide.trip.count
  %8 = tail call i64 @llvm.vscale.i64()
  %9 = shl nuw nsw i64 %8, 4
  %10 = tail call i64 @llvm.umax.i64(i64 %9, i64 32)
  %min.iters.check = icmp ugt i64 %10, %wide.trip.count
  %bound0 = icmp ult ptr %dst, %scevgep36
  %bound1 = icmp ult ptr %src1, %scevgep
  %found.conflict = and i1 %bound0, %bound1
  %11 = or i32 %i_dst_stride, %i_src1_stride
  %12 = icmp slt i32 %11, 0
  %13 = or i1 %found.conflict, %12
  %bound039 = icmp ult ptr %dst, %scevgep37
  %bound140 = icmp ult ptr %src2, %scevgep
  %found.conflict41 = and i1 %bound039, %bound140
  %14 = or i32 %i_dst_stride, %i_src2_stride
  %15 = icmp slt i32 %14, 0
  %16 = or i1 %found.conflict41, %15
  %conflict.rdx = or i1 %13, %16
  br label %for.cond1.preheader.us

; Outer (row) loop: each iteration processes one row, then advances the three
; pointers by their strides; %brmerge picks the scalar loop when any runtime
; guard fired.
for.cond1.preheader.us:                           ; preds = %for.cond1.preheader.us.preheader, %for.cond1.for.cond.cleanup3_crit_edge.us
  %y.033.us = phi i32 [ %inc17.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  %dst.addr.032.us = phi ptr [ %add.ptr.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %dst, %for.cond1.preheader.us.preheader ]
  %src1.addr.031.us = phi ptr [ %add.ptr13.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %src1, %for.cond1.preheader.us.preheader ]
  %src2.addr.030.us = phi ptr [ %add.ptr15.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %src2, %for.cond1.preheader.us.preheader ]
  %brmerge = select i1 %min.iters.check, i1 true, i1 %conflict.rdx
  br i1 %brmerge, label %for.body4.us.preheader, label %vector.ph

vector.ph:                                        ; preds = %for.cond1.preheader.us
  %17 = tail call i64 @llvm.vscale.i64()
  %.neg = mul nuw nsw i64 %17, 2147483632
  %n.vec = and i64 %.neg, %wide.trip.count
  %18 = tail call i64 @llvm.vscale.i64()
  %19 = shl nuw nsw i64 %18, 4
  br label %vector.body

; Vector inner loop: widen both byte loads to i16, compute (a + 1 + b) >> 1,
; truncate back to i8 and store -- the pattern that should lower to vaaddu.vv.
vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %20 = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %index
  %wide.load = load <vscale x 16 x i8>, ptr %20, align 1
  %21 = zext <vscale x 16 x i8> %wide.load to <vscale x 16 x i16>
  %22 = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %index
  %wide.load44 = load <vscale x 16 x i8>, ptr %22, align 1
  %23 = zext <vscale x 16 x i8> %wide.load44 to <vscale x 16 x i16>
  %24 = add nuw nsw <vscale x 16 x i16> %21, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
  %25 = add nuw nsw <vscale x 16 x i16> %24, %23
  %26 = lshr <vscale x 16 x i16> %25, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
  %27 = trunc <vscale x 16 x i16> %26 to <vscale x 16 x i8>
  %28 = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %index
  store <vscale x 16 x i8> %27, ptr %28, align 1
  %index.next = add nuw i64 %index, %19
  %29 = icmp eq i64 %index.next, %n.vec
  br i1 %29, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br i1 %cmp.n, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.preheader

; Scalar epilogue entry: starts at 0 when the vector body was skipped, or at
; %n.vec to finish the tail left over from the vector loop.
for.body4.us.preheader:                           ; preds = %for.cond1.preheader.us, %middle.block
  %indvars.iv.ph = phi i64 [ 0, %for.cond1.preheader.us ], [ %n.vec, %middle.block ]
  br label %for.body4.us

; Scalar inner loop: same rounded average, one byte per iteration.
for.body4.us:                                     ; preds = %for.body4.us.preheader, %for.body4.us
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4.us ], [ %indvars.iv.ph, %for.body4.us.preheader ]
  %arrayidx.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %indvars.iv
  %30 = load i8, ptr %arrayidx.us, align 1
  %conv.us = zext i8 %30 to i16
  %arrayidx6.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %indvars.iv
  %31 = load i8, ptr %arrayidx6.us, align 1
  %conv7.us = zext i8 %31 to i16
  %add.us = add nuw nsw i16 %conv.us, 1
  %add8.us = add nuw nsw i16 %add.us, %conv7.us
  %shr.us = lshr i16 %add8.us, 1
  %conv9.us = trunc nuw i16 %shr.us to i8
  %arrayidx11.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %indvars.iv
  store i8 %conv9.us, ptr %arrayidx11.us, align 1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us

; Row latch: step each pointer by its (sign-extended) stride.
for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us, %middle.block
  %add.ptr.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %idx.ext
  %add.ptr13.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %idx.ext12
  %add.ptr15.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %idx.ext14
  %inc17.us = add nuw nsw i32 %y.033.us, 1
  %exitcond35.not = icmp eq i32 %inc17.us, %i_height
  br i1 %exitcond35.not, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.lr.ph, %entry
  ret void
}