; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s \
; RUN:   | FileCheck %s --check-prefixes=SLOW,RV32-SLOW
; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s \
; RUN:   | FileCheck %s --check-prefixes=SLOW,RV64-SLOW
; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+unaligned-vector-mem -verify-machineinstrs < %s \
; RUN:   | FileCheck %s --check-prefixes=FAST,RV32-FAST
; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem -verify-machineinstrs < %s \
; RUN:   | FileCheck %s --check-prefixes=FAST,RV64-FAST

define <4 x i32> @load_v4i32_align1(ptr %ptr) {
; SLOW-LABEL: load_v4i32_align1:
; SLOW:       # %bb.0:
; SLOW-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; SLOW-NEXT:    vle8.v v8, (a0)
; SLOW-NEXT:    ret
;
; FAST-LABEL: load_v4i32_align1:
; FAST:       # %bb.0:
; FAST-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; FAST-NEXT:    vle32.v v8, (a0)
; FAST-NEXT:    ret
  %z = load <4 x i32>, ptr %ptr, align 1
  ret <4 x i32> %z
}

define <4 x i32> @load_v4i32_align2(ptr %ptr) {
; SLOW-LABEL: load_v4i32_align2:
; SLOW:       # %bb.0:
; SLOW-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; SLOW-NEXT:    vle8.v v8, (a0)
; SLOW-NEXT:    ret
;
; FAST-LABEL: load_v4i32_align2:
; FAST:       # %bb.0:
; FAST-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; FAST-NEXT:    vle32.v v8, (a0)
; FAST-NEXT:    ret
  %z = load <4 x i32>, ptr %ptr, align 2
  ret <4 x i32> %z
}

define void @store_v4i32_align1(<4 x i32> %x, ptr %ptr) {
; SLOW-LABEL: store_v4i32_align1:
; SLOW:       # %bb.0:
; SLOW-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; SLOW-NEXT:    vse8.v v8, (a0)
; SLOW-NEXT:    ret
;
; FAST-LABEL: store_v4i32_align1:
; FAST:       # %bb.0:
; FAST-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; FAST-NEXT:    vse32.v v8, (a0)
; FAST-NEXT:    ret
  store <4 x i32> %x, ptr %ptr, align 1
  ret void
}

define void @store_v4i32_align2(<4 x i32> %x, ptr %ptr) {
; SLOW-LABEL: store_v4i32_align2:
; SLOW:       # %bb.0:
; SLOW-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; SLOW-NEXT:    vse8.v v8, (a0)
; SLOW-NEXT:    ret
;
; FAST-LABEL: store_v4i32_align2:
; FAST:       # %bb.0:
; FAST-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; FAST-NEXT:    vse32.v v8, (a0)
; FAST-NEXT:    ret
  store <4 x i32> %x, ptr %ptr, align 2
  ret void
}

declare <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i16>)

define <2 x i16> @mgather_v2i16_align1(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i16> %passthru) {
; RV32-SLOW-LABEL: mgather_v2i16_align1:
; RV32-SLOW:       # %bb.0:
; RV32-SLOW-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; RV32-SLOW-NEXT:    vmv.x.s a0, v0
; RV32-SLOW-NEXT:    andi a1, a0, 1
; RV32-SLOW-NEXT:    beqz a1, .LBB4_2
; RV32-SLOW-NEXT:  # %bb.1: # %cond.load
; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; RV32-SLOW-NEXT:    vmv.x.s a1, v8
; RV32-SLOW-NEXT:    lbu a2, 1(a1)
; RV32-SLOW-NEXT:    lbu a1, 0(a1)
; RV32-SLOW-NEXT:    slli a2, a2, 8
; RV32-SLOW-NEXT:    or a1, a2, a1
; RV32-SLOW-NEXT:    vsetvli zero, zero, e16, m2, tu, ma
; RV32-SLOW-NEXT:    vmv.s.x v9, a1
; RV32-SLOW-NEXT:  .LBB4_2: # %else
; RV32-SLOW-NEXT:    andi a0, a0, 2
; RV32-SLOW-NEXT:    beqz a0, .LBB4_4
; RV32-SLOW-NEXT:  # %bb.3: # %cond.load1
; RV32-SLOW-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
; RV32-SLOW-NEXT:    vslidedown.vi v8, v8, 1
; RV32-SLOW-NEXT:    vmv.x.s a0, v8
; RV32-SLOW-NEXT:    lbu a1, 1(a0)
; RV32-SLOW-NEXT:    lbu a0, 0(a0)
; RV32-SLOW-NEXT:    slli a1, a1, 8
; RV32-SLOW-NEXT:    or a0, a1, a0
; RV32-SLOW-NEXT:    vmv.s.x v8, a0
; RV32-SLOW-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; RV32-SLOW-NEXT:    vslideup.vi v9, v8, 1
; RV32-SLOW-NEXT:  .LBB4_4: # %else2
; RV32-SLOW-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; RV32-SLOW-NEXT:    vmv1r.v v8, v9
; RV32-SLOW-NEXT:    ret
;
; RV64-SLOW-LABEL: mgather_v2i16_align1:
; RV64-SLOW:       # %bb.0:
; RV64-SLOW-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; RV64-SLOW-NEXT:    vmv.x.s a0, v0
; RV64-SLOW-NEXT:    andi a1, a0, 1
; RV64-SLOW-NEXT:    beqz a1, .LBB4_2
; RV64-SLOW-NEXT:  # %bb.1: # %cond.load
; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; RV64-SLOW-NEXT:    vmv.x.s a1, v8
; RV64-SLOW-NEXT:    lbu a2, 1(a1)
; RV64-SLOW-NEXT:    lbu a1, 0(a1)
; RV64-SLOW-NEXT:    slli a2, a2, 8
; RV64-SLOW-NEXT:    or a1, a2, a1
; RV64-SLOW-NEXT:    vsetvli zero, zero, e16, m2, tu, ma
; RV64-SLOW-NEXT:    vmv.s.x v9, a1
; RV64-SLOW-NEXT:  .LBB4_2: # %else
; RV64-SLOW-NEXT:    andi a0, a0, 2
; RV64-SLOW-NEXT:    beqz a0, .LBB4_4
; RV64-SLOW-NEXT:  # %bb.3: # %cond.load1
; RV64-SLOW-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV64-SLOW-NEXT:    vslidedown.vi v8, v8, 1
; RV64-SLOW-NEXT:    vmv.x.s a0, v8
; RV64-SLOW-NEXT:    lbu a1, 1(a0)
; RV64-SLOW-NEXT:    lbu a0, 0(a0)
; RV64-SLOW-NEXT:    slli a1, a1, 8
; RV64-SLOW-NEXT:    or a0, a1, a0
; RV64-SLOW-NEXT:    vmv.s.x v8, a0
; RV64-SLOW-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; RV64-SLOW-NEXT:    vslideup.vi v9, v8, 1
; RV64-SLOW-NEXT:  .LBB4_4: # %else2
; RV64-SLOW-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; RV64-SLOW-NEXT:    vmv1r.v v8, v9
; RV64-SLOW-NEXT:    ret
;
; RV32-FAST-LABEL: mgather_v2i16_align1:
; RV32-FAST:       # %bb.0:
; RV32-FAST-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
; RV32-FAST-NEXT:    vluxei32.v v9, (zero), v8, v0.t
; RV32-FAST-NEXT:    vmv1r.v v8, v9
; RV32-FAST-NEXT:    ret
;
; RV64-FAST-LABEL: mgather_v2i16_align1:
; RV64-FAST:       # %bb.0:
; RV64-FAST-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
; RV64-FAST-NEXT:    vluxei64.v v9, (zero), v8, v0.t
; RV64-FAST-NEXT:    vmv1r.v v8, v9
; RV64-FAST-NEXT:    ret
  %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> %ptrs, i32 1, <2 x i1> %m, <2 x i16> %passthru)
  ret <2 x i16> %v
}

declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>)

define <2 x i64> @mgather_v2i64_align4(<2 x ptr> %ptrs, <2 x i1> %m, <2 x i64> %passthru) {
; RV32-SLOW-LABEL: mgather_v2i64_align4:
; RV32-SLOW:       # %bb.0:
; RV32-SLOW-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; RV32-SLOW-NEXT:    vmv.x.s a0, v0
; RV32-SLOW-NEXT:    andi a1, a0, 1
; RV32-SLOW-NEXT:    beqz a1, .LBB5_2
; RV32-SLOW-NEXT:  # %bb.1: # %cond.load
; RV32-SLOW-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
; RV32-SLOW-NEXT:    vmv.x.s a1, v8
; RV32-SLOW-NEXT:    lw a2, 0(a1)
; RV32-SLOW-NEXT:    lw a1, 4(a1)
; RV32-SLOW-NEXT:    vslide1down.vx v9, v9, a2
; RV32-SLOW-NEXT:    vslide1down.vx v9, v9, a1
; RV32-SLOW-NEXT:  .LBB5_2: # %else
; RV32-SLOW-NEXT:    andi a0, a0, 2
; RV32-SLOW-NEXT:    beqz a0, .LBB5_4
; RV32-SLOW-NEXT:  # %bb.3: # %cond.load1
; RV32-SLOW-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
; RV32-SLOW-NEXT:    vslidedown.vi v8, v8, 1
; RV32-SLOW-NEXT:    vmv.x.s a0, v8
; RV32-SLOW-NEXT:    lw a1, 0(a0)
; RV32-SLOW-NEXT:    lw a0, 4(a0)
; RV32-SLOW-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
; RV32-SLOW-NEXT:    vslide1down.vx v8, v8, a1
; RV32-SLOW-NEXT:    vslide1down.vx v8, v8, a0
; RV32-SLOW-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-SLOW-NEXT:    vslideup.vi v9, v8, 1
; RV32-SLOW-NEXT:  .LBB5_4: # %else2
; RV32-SLOW-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; RV32-SLOW-NEXT:    vmv1r.v v8, v9
; RV32-SLOW-NEXT:    ret
;
; RV64-SLOW-LABEL: mgather_v2i64_align4:
; RV64-SLOW:       # %bb.0:
; RV64-SLOW-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; RV64-SLOW-NEXT:    vmv.x.s a0, v0
; RV64-SLOW-NEXT:    andi a1, a0, 1
; RV64-SLOW-NEXT:    beqz a1, .LBB5_2
; RV64-SLOW-NEXT:  # %bb.1: # %cond.load
; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m8, tu, ma
; RV64-SLOW-NEXT:    vmv.x.s a1, v8
; RV64-SLOW-NEXT:    lwu a2, 4(a1)
; RV64-SLOW-NEXT:    lwu a1, 0(a1)
; RV64-SLOW-NEXT:    slli a2, a2, 32
; RV64-SLOW-NEXT:    or a1, a2, a1
; RV64-SLOW-NEXT:    vmv.s.x v9, a1
; RV64-SLOW-NEXT:  .LBB5_2: # %else
; RV64-SLOW-NEXT:    andi a0, a0, 2
; RV64-SLOW-NEXT:    beqz a0, .LBB5_4
; RV64-SLOW-NEXT:  # %bb.3: # %cond.load1
; RV64-SLOW-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-SLOW-NEXT:    vslidedown.vi v8, v8, 1
; RV64-SLOW-NEXT:    vmv.x.s a0, v8
; RV64-SLOW-NEXT:    lwu a1, 4(a0)
; RV64-SLOW-NEXT:    lwu a0, 0(a0)
; RV64-SLOW-NEXT:    slli a1, a1, 32
; RV64-SLOW-NEXT:    or a0, a1, a0
; RV64-SLOW-NEXT:    vmv.s.x v8, a0
; RV64-SLOW-NEXT:    vslideup.vi v9, v8, 1
; RV64-SLOW-NEXT:  .LBB5_4: # %else2
; RV64-SLOW-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; RV64-SLOW-NEXT:    vmv1r.v v8, v9
; RV64-SLOW-NEXT:    ret
;
; RV32-FAST-LABEL: mgather_v2i64_align4:
; RV32-FAST:       # %bb.0:
; RV32-FAST-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
; RV32-FAST-NEXT:    vluxei32.v v9, (zero), v8, v0.t
; RV32-FAST-NEXT:    vmv.v.v v8, v9
; RV32-FAST-NEXT:    ret
;
; RV64-FAST-LABEL: mgather_v2i64_align4:
; RV64-FAST:       # %bb.0:
; RV64-FAST-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
; RV64-FAST-NEXT:    vluxei64.v v9, (zero), v8, v0.t
; RV64-FAST-NEXT:    vmv.v.v v8, v9
; RV64-FAST-NEXT:    ret
  %v = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %m, <2 x i64> %passthru)
  ret <2 x i64> %v
}

declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>)

define void @mscatter_v4i16_align1(<4 x i16> %val, <4 x ptr> %ptrs, <4 x i1> %m) {
; RV32-SLOW-LABEL: mscatter_v4i16_align1:
; RV32-SLOW:       # %bb.0:
; RV32-SLOW-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; RV32-SLOW-NEXT:    vmv.x.s a0, v0
; RV32-SLOW-NEXT:    andi a1, a0, 1
; RV32-SLOW-NEXT:    bnez a1, .LBB6_5
; RV32-SLOW-NEXT:  # %bb.1: # %else
; RV32-SLOW-NEXT:    andi a1, a0, 2
; RV32-SLOW-NEXT:    bnez a1, .LBB6_6
; RV32-SLOW-NEXT:  .LBB6_2: # %else2
; RV32-SLOW-NEXT:    andi a1, a0, 4
; RV32-SLOW-NEXT:    bnez a1, .LBB6_7
; RV32-SLOW-NEXT:  .LBB6_3: # %else4
; RV32-SLOW-NEXT:    andi a0, a0, 8
; RV32-SLOW-NEXT:    bnez a0, .LBB6_8
; RV32-SLOW-NEXT:  .LBB6_4: # %else6
; RV32-SLOW-NEXT:    ret
; RV32-SLOW-NEXT:  .LBB6_5: # %cond.store
; RV32-SLOW-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; RV32-SLOW-NEXT:    vmv.x.s a1, v8
; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; RV32-SLOW-NEXT:    vmv.x.s a2, v9
; RV32-SLOW-NEXT:    srli a3, a1, 8
; RV32-SLOW-NEXT:    sb a1, 0(a2)
; RV32-SLOW-NEXT:    sb a3, 1(a2)
; RV32-SLOW-NEXT:    andi a1, a0, 2
; RV32-SLOW-NEXT:    beqz a1, .LBB6_2
; RV32-SLOW-NEXT:  .LBB6_6: # %cond.store1
; RV32-SLOW-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
; RV32-SLOW-NEXT:    vslidedown.vi v10, v8, 1
; RV32-SLOW-NEXT:    vmv.x.s a1, v10
; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; RV32-SLOW-NEXT:    vslidedown.vi v10, v9, 1
; RV32-SLOW-NEXT:    vmv.x.s a2, v10
; RV32-SLOW-NEXT:    srli a3, a1, 8
; RV32-SLOW-NEXT:    sb a1, 0(a2)
; RV32-SLOW-NEXT:    sb a3, 1(a2)
; RV32-SLOW-NEXT:    andi a1, a0, 4
; RV32-SLOW-NEXT:    beqz a1, .LBB6_3
; RV32-SLOW-NEXT:  .LBB6_7: # %cond.store3
; RV32-SLOW-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
; RV32-SLOW-NEXT:    vslidedown.vi v10, v8, 2
; RV32-SLOW-NEXT:    vmv.x.s a1, v10
; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; RV32-SLOW-NEXT:    vslidedown.vi v10, v9, 2
; RV32-SLOW-NEXT:    vmv.x.s a2, v10
; RV32-SLOW-NEXT:    srli a3, a1, 8
; RV32-SLOW-NEXT:    sb a1, 0(a2)
; RV32-SLOW-NEXT:    sb a3, 1(a2)
; RV32-SLOW-NEXT:    andi a0, a0, 8
; RV32-SLOW-NEXT:    beqz a0, .LBB6_4
; RV32-SLOW-NEXT:  .LBB6_8: # %cond.store5
; RV32-SLOW-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
; RV32-SLOW-NEXT:    vslidedown.vi v8, v8, 3
; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; RV32-SLOW-NEXT:    vslidedown.vi v9, v9, 3
; RV32-SLOW-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; RV32-SLOW-NEXT:    vmv.x.s a0, v8
; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; RV32-SLOW-NEXT:    vmv.x.s a1, v9
; RV32-SLOW-NEXT:    srli a2, a0, 8
; RV32-SLOW-NEXT:    sb a0, 0(a1)
; RV32-SLOW-NEXT:    sb a2, 1(a1)
; RV32-SLOW-NEXT:    ret
;
; RV64-SLOW-LABEL: mscatter_v4i16_align1:
; RV64-SLOW:       # %bb.0:
; RV64-SLOW-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; RV64-SLOW-NEXT:    vmv.x.s a0, v0
; RV64-SLOW-NEXT:    andi a1, a0, 1
; RV64-SLOW-NEXT:    bnez a1, .LBB6_5
; RV64-SLOW-NEXT:  # %bb.1: # %else
; RV64-SLOW-NEXT:    andi a1, a0, 2
; RV64-SLOW-NEXT:    bnez a1, .LBB6_6
; RV64-SLOW-NEXT:  .LBB6_2: # %else2
; RV64-SLOW-NEXT:    andi a1, a0, 4
; RV64-SLOW-NEXT:    bnez a1, .LBB6_7
; RV64-SLOW-NEXT:  .LBB6_3: # %else4
; RV64-SLOW-NEXT:    andi a0, a0, 8
; RV64-SLOW-NEXT:    bnez a0, .LBB6_8
; RV64-SLOW-NEXT:  .LBB6_4: # %else6
; RV64-SLOW-NEXT:    ret
; RV64-SLOW-NEXT:  .LBB6_5: # %cond.store
; RV64-SLOW-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; RV64-SLOW-NEXT:    vmv.x.s a1, v8
; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; RV64-SLOW-NEXT:    vmv.x.s a2, v10
; RV64-SLOW-NEXT:    srli a3, a1, 8
; RV64-SLOW-NEXT:    sb a1, 0(a2)
; RV64-SLOW-NEXT:    sb a3, 1(a2)
; RV64-SLOW-NEXT:    andi a1, a0, 2
; RV64-SLOW-NEXT:    beqz a1, .LBB6_2
; RV64-SLOW-NEXT:  .LBB6_6: # %cond.store1
; RV64-SLOW-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
; RV64-SLOW-NEXT:    vslidedown.vi v9, v8, 1
; RV64-SLOW-NEXT:    vmv.x.s a1, v9
; RV64-SLOW-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV64-SLOW-NEXT:    vslidedown.vi v9, v10, 1
; RV64-SLOW-NEXT:    vmv.x.s a2, v9
; RV64-SLOW-NEXT:    srli a3, a1, 8
; RV64-SLOW-NEXT:    sb a1, 0(a2)
; RV64-SLOW-NEXT:    sb a3, 1(a2)
; RV64-SLOW-NEXT:    andi a1, a0, 4
; RV64-SLOW-NEXT:    beqz a1, .LBB6_3
; RV64-SLOW-NEXT:  .LBB6_7: # %cond.store3
; RV64-SLOW-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
; RV64-SLOW-NEXT:    vslidedown.vi v9, v8, 2
; RV64-SLOW-NEXT:    vmv.x.s a1, v9
; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
; RV64-SLOW-NEXT:    vslidedown.vi v12, v10, 2
; RV64-SLOW-NEXT:    vmv.x.s a2, v12
; RV64-SLOW-NEXT:    srli a3, a1, 8
; RV64-SLOW-NEXT:    sb a1, 0(a2)
; RV64-SLOW-NEXT:    sb a3, 1(a2)
; RV64-SLOW-NEXT:    andi a0, a0, 8
; RV64-SLOW-NEXT:    beqz a0, .LBB6_4
; RV64-SLOW-NEXT:  .LBB6_8: # %cond.store5
; RV64-SLOW-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
; RV64-SLOW-NEXT:    vslidedown.vi v8, v8, 3
; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
; RV64-SLOW-NEXT:    vslidedown.vi v10, v10, 3
; RV64-SLOW-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; RV64-SLOW-NEXT:    vmv.x.s a0, v8
; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
; RV64-SLOW-NEXT:    vmv.x.s a1, v10
; RV64-SLOW-NEXT:    srli a2, a0, 8
; RV64-SLOW-NEXT:    sb a0, 0(a1)
; RV64-SLOW-NEXT:    sb a2, 1(a1)
; RV64-SLOW-NEXT:    ret
;
; RV32-FAST-LABEL: mscatter_v4i16_align1:
; RV32-FAST:       # %bb.0:
; RV32-FAST-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; RV32-FAST-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
; RV32-FAST-NEXT:    ret
;
; RV64-FAST-LABEL: mscatter_v4i16_align1:
; RV64-FAST:       # %bb.0:
; RV64-FAST-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; RV64-FAST-NEXT:    vsoxei64.v v8, (zero), v10, v0.t
; RV64-FAST-NEXT:    ret
  call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %m)
  ret void
}

declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>)

define void @mscatter_v2i32_align2(<2 x i32> %val, <2 x ptr> %ptrs, <2 x i1> %m) {
; RV32-SLOW-LABEL: mscatter_v2i32_align2:
; RV32-SLOW:       # %bb.0:
; RV32-SLOW-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; RV32-SLOW-NEXT:    vmv.x.s a0, v0
; RV32-SLOW-NEXT:    andi a1, a0, 1
; RV32-SLOW-NEXT:    bnez a1, .LBB7_3
; RV32-SLOW-NEXT:  # %bb.1: # %else
; RV32-SLOW-NEXT:    andi a0, a0, 2
; RV32-SLOW-NEXT:    bnez a0, .LBB7_4
; RV32-SLOW-NEXT:  .LBB7_2: # %else2
; RV32-SLOW-NEXT:    ret
; RV32-SLOW-NEXT:  .LBB7_3: # %cond.store
; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; RV32-SLOW-NEXT:    vmv.x.s a1, v8
; RV32-SLOW-NEXT:    vmv.x.s a2, v9
; RV32-SLOW-NEXT:    srli a3, a1, 16
; RV32-SLOW-NEXT:    sh a1, 0(a2)
; RV32-SLOW-NEXT:    sh a3, 2(a2)
; RV32-SLOW-NEXT:    andi a0, a0, 2
; RV32-SLOW-NEXT:    beqz a0, .LBB7_2
; RV32-SLOW-NEXT:  .LBB7_4: # %cond.store1
; RV32-SLOW-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
; RV32-SLOW-NEXT:    vslidedown.vi v8, v8, 1
; RV32-SLOW-NEXT:    vslidedown.vi v9, v9, 1
; RV32-SLOW-NEXT:    vmv.x.s a0, v8
; RV32-SLOW-NEXT:    vmv.x.s a1, v9
; RV32-SLOW-NEXT:    srli a2, a0, 16
; RV32-SLOW-NEXT:    sh a0, 0(a1)
; RV32-SLOW-NEXT:    sh a2, 2(a1)
; RV32-SLOW-NEXT:    ret
;
; RV64-SLOW-LABEL: mscatter_v2i32_align2:
; RV64-SLOW:       # %bb.0:
; RV64-SLOW-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; RV64-SLOW-NEXT:    vmv.x.s a0, v0
; RV64-SLOW-NEXT:    andi a1, a0, 1
; RV64-SLOW-NEXT:    bnez a1, .LBB7_3
; RV64-SLOW-NEXT:  # %bb.1: # %else
; RV64-SLOW-NEXT:    andi a0, a0, 2
; RV64-SLOW-NEXT:    bnez a0, .LBB7_4
; RV64-SLOW-NEXT:  .LBB7_2: # %else2
; RV64-SLOW-NEXT:    ret
; RV64-SLOW-NEXT:  .LBB7_3: # %cond.store
; RV64-SLOW-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; RV64-SLOW-NEXT:    vmv.x.s a1, v8
; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; RV64-SLOW-NEXT:    vmv.x.s a2, v9
; RV64-SLOW-NEXT:    srli a3, a1, 16
; RV64-SLOW-NEXT:    sh a1, 0(a2)
; RV64-SLOW-NEXT:    sh a3, 2(a2)
; RV64-SLOW-NEXT:    andi a0, a0, 2
; RV64-SLOW-NEXT:    beqz a0, .LBB7_2
; RV64-SLOW-NEXT:  .LBB7_4: # %cond.store1
; RV64-SLOW-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
; RV64-SLOW-NEXT:    vslidedown.vi v8, v8, 1
; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
; RV64-SLOW-NEXT:    vslidedown.vi v9, v9, 1
; RV64-SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
; RV64-SLOW-NEXT:    vmv.x.s a0, v8
; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
; RV64-SLOW-NEXT:    vmv.x.s a1, v9
; RV64-SLOW-NEXT:    srli a2, a0, 16
; RV64-SLOW-NEXT:    sh a0, 0(a1)
; RV64-SLOW-NEXT:    sh a2, 2(a1)
; RV64-SLOW-NEXT:    ret
;
; RV32-FAST-LABEL: mscatter_v2i32_align2:
; RV32-FAST:       # %bb.0:
; RV32-FAST-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; RV32-FAST-NEXT:    vsoxei32.v v8, (zero), v9, v0.t
; RV32-FAST-NEXT:    ret
;
; RV64-FAST-LABEL: mscatter_v2i32_align2:
; RV64-FAST:       # %bb.0:
; RV64-FAST-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; RV64-FAST-NEXT:    vsoxei64.v v8, (zero), v9, v0.t
; RV64-FAST-NEXT:    ret
  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> %val, <2 x ptr> %ptrs, i32 2, <2 x i1> %m)
  ret void
}

declare <2 x i32> @llvm.masked.load.v2i32(ptr, i32, <2 x i1>, <2 x i32>)

define void @masked_load_v2i32_align1(ptr %a, <2 x i32> %m, ptr %res_ptr) nounwind {
; RV32-SLOW-LABEL: masked_load_v2i32_align1:
; RV32-SLOW:       # %bb.0:
; RV32-SLOW-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; RV32-SLOW-NEXT:    vmseq.vi v8, v8, 0
; RV32-SLOW-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
; RV32-SLOW-NEXT:    vmv.x.s a2, v8
; RV32-SLOW-NEXT:    andi a3, a2, 1
; RV32-SLOW-NEXT:    # implicit-def: $v8
; RV32-SLOW-NEXT:    beqz a3, .LBB8_2
; RV32-SLOW-NEXT:  # %bb.1: # %cond.load
; RV32-SLOW-NEXT:    lbu a3, 1(a0)
; RV32-SLOW-NEXT:    lbu a4, 0(a0)
; RV32-SLOW-NEXT:    lbu a5, 2(a0)
; RV32-SLOW-NEXT:    lbu a6, 3(a0)
; RV32-SLOW-NEXT:    slli a3, a3, 8
; RV32-SLOW-NEXT:    or a3, a3, a4
; RV32-SLOW-NEXT:    slli a5, a5, 16
; RV32-SLOW-NEXT:    slli a6, a6, 24
; RV32-SLOW-NEXT:    or a4, a6, a5
; RV32-SLOW-NEXT:    or a3, a4, a3
; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
; RV32-SLOW-NEXT:    vmv.v.x v8, a3
; RV32-SLOW-NEXT:  .LBB8_2: # %else
; RV32-SLOW-NEXT:    andi a2, a2, 2
; RV32-SLOW-NEXT:    beqz a2, .LBB8_4
; RV32-SLOW-NEXT:  # %bb.3: # %cond.load1
; RV32-SLOW-NEXT:    lbu a2, 5(a0)
; RV32-SLOW-NEXT:    lbu a3, 4(a0)
; RV32-SLOW-NEXT:    lbu a4, 6(a0)
; RV32-SLOW-NEXT:    lbu a0, 7(a0)
; RV32-SLOW-NEXT:    slli a2, a2, 8
; RV32-SLOW-NEXT:    or a2, a2, a3
; RV32-SLOW-NEXT:    slli a4, a4, 16
; RV32-SLOW-NEXT:    slli a0, a0, 24
; RV32-SLOW-NEXT:    or a0, a0, a4
; RV32-SLOW-NEXT:    or a0, a0, a2
; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
; RV32-SLOW-NEXT:    vmv.s.x v9, a0
; RV32-SLOW-NEXT:    vslideup.vi v8, v9, 1
; RV32-SLOW-NEXT:  .LBB8_4: # %else2
; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
; RV32-SLOW-NEXT:    vse32.v v8, (a1)
; RV32-SLOW-NEXT:    ret
;
; RV64-SLOW-LABEL: masked_load_v2i32_align1:
; RV64-SLOW:       # %bb.0:
; RV64-SLOW-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; RV64-SLOW-NEXT:    vmseq.vi v8, v8, 0
; RV64-SLOW-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
; RV64-SLOW-NEXT:    vmv.x.s a2, v8
; RV64-SLOW-NEXT:    andi a3, a2, 1
; RV64-SLOW-NEXT:    # implicit-def: $v8
; RV64-SLOW-NEXT:    beqz a3, .LBB8_2
; RV64-SLOW-NEXT:  # %bb.1: # %cond.load
; RV64-SLOW-NEXT:    lbu a3, 1(a0)
; RV64-SLOW-NEXT:    lbu a4, 0(a0)
; RV64-SLOW-NEXT:    lbu a5, 2(a0)
; RV64-SLOW-NEXT:    lb a6, 3(a0)
; RV64-SLOW-NEXT:    slli a3, a3, 8
; RV64-SLOW-NEXT:    or a3, a3, a4
; RV64-SLOW-NEXT:    slli a5, a5, 16
; RV64-SLOW-NEXT:    slli a6, a6, 24
; RV64-SLOW-NEXT:    or a4, a6, a5
; RV64-SLOW-NEXT:    or a3, a4, a3
; RV64-SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
; RV64-SLOW-NEXT:    vmv.v.x v8, a3
; RV64-SLOW-NEXT:  .LBB8_2: # %else
; RV64-SLOW-NEXT:    andi a2, a2, 2
; RV64-SLOW-NEXT:    beqz a2, .LBB8_4
; RV64-SLOW-NEXT:  # %bb.3: # %cond.load1
; RV64-SLOW-NEXT:    lbu a2, 5(a0)
; RV64-SLOW-NEXT:    lbu a3, 4(a0)
; RV64-SLOW-NEXT:    lbu a4, 6(a0)
; RV64-SLOW-NEXT:    lb a0, 7(a0)
; RV64-SLOW-NEXT:    slli a2, a2, 8
; RV64-SLOW-NEXT:    or a2, a2, a3
; RV64-SLOW-NEXT:    slli a4, a4, 16
; RV64-SLOW-NEXT:    slli a0, a0, 24
; RV64-SLOW-NEXT:    or a0, a0, a4
; RV64-SLOW-NEXT:    or a0, a0, a2
; RV64-SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
; RV64-SLOW-NEXT:    vmv.s.x v9, a0
; RV64-SLOW-NEXT:    vslideup.vi v8, v9, 1
; RV64-SLOW-NEXT:  .LBB8_4: # %else2
; RV64-SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
; RV64-SLOW-NEXT:    vse32.v v8, (a1)
; RV64-SLOW-NEXT:    ret
;
; FAST-LABEL: masked_load_v2i32_align1:
; FAST:       # %bb.0:
; FAST-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; FAST-NEXT:    vmseq.vi v0, v8, 0
; FAST-NEXT:    vle32.v v8, (a0), v0.t
; FAST-NEXT:    vse32.v v8, (a1)
; FAST-NEXT:    ret
  %mask = icmp eq <2 x i32> %m, zeroinitializer
  %load = call <2 x i32> @llvm.masked.load.v2i32(ptr %a, i32 1, <2 x i1> %mask, <2 x i32> undef)
  store <2 x i32> %load, ptr %res_ptr
  ret void
}

declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>)

define void @masked_store_v2i32_align2(<2 x i32> %val, ptr %a, <2 x i32> %m) nounwind {
; SLOW-LABEL: masked_store_v2i32_align2:
; SLOW:       # %bb.0:
; SLOW-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; SLOW-NEXT:    vmseq.vi v9, v9, 0
; SLOW-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
; SLOW-NEXT:    vmv.x.s a1, v9
; SLOW-NEXT:    andi a2, a1, 1
; SLOW-NEXT:    bnez a2, .LBB9_3
; SLOW-NEXT:  # %bb.1: # %else
; SLOW-NEXT:    andi a1, a1, 2
; SLOW-NEXT:    bnez a1, .LBB9_4
; SLOW-NEXT:  .LBB9_2: # %else2
; SLOW-NEXT:    ret
; SLOW-NEXT:  .LBB9_3: # %cond.store
; SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
; SLOW-NEXT:    vmv.x.s a2, v8
; SLOW-NEXT:    srli a3, a2, 16
; SLOW-NEXT:    sh a2, 0(a0)
; SLOW-NEXT:    sh a3, 2(a0)
; SLOW-NEXT:    andi a1, a1, 2
; SLOW-NEXT:    beqz a1, .LBB9_2
; SLOW-NEXT:  .LBB9_4: # %cond.store1
; SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
; SLOW-NEXT:    vslidedown.vi v8, v8, 1
; SLOW-NEXT:    vmv.x.s a1, v8
; SLOW-NEXT:    srli a2, a1, 16
; SLOW-NEXT:    sh a1, 4(a0)
; SLOW-NEXT:    sh a2, 6(a0)
; SLOW-NEXT:    ret
;
; FAST-LABEL: masked_store_v2i32_align2:
; FAST:       # %bb.0:
; FAST-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; FAST-NEXT:    vmseq.vi v0, v9, 0
; FAST-NEXT:    vse32.v v8, (a0), v0.t
; FAST-NEXT:    ret
  %mask = icmp eq <2 x i32> %m, zeroinitializer
  call void @llvm.masked.store.v2i32.p0(<2 x i32> %val, ptr %a, i32 2, <2 x i1> %mask)
  ret void
}