1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \ 3; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 4; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \ 5; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 6 7declare <2 x i16> @llvm.vp.bswap.v2i16(<2 x i16>, <2 x i1>, i32) 8 9define <2 x i16> @vp_bswap_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { 10; CHECK-LABEL: vp_bswap_v2i16: 11; CHECK: # %bb.0: 12; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma 13; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t 14; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t 15; CHECK-NEXT: vor.vv v8, v8, v9, v0.t 16; CHECK-NEXT: ret 17 %v = call <2 x i16> @llvm.vp.bswap.v2i16(<2 x i16> %va, <2 x i1> %m, i32 %evl) 18 ret <2 x i16> %v 19} 20 21define <2 x i16> @vp_bswap_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { 22; CHECK-LABEL: vp_bswap_v2i16_unmasked: 23; CHECK: # %bb.0: 24; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma 25; CHECK-NEXT: vsrl.vi v9, v8, 8 26; CHECK-NEXT: vsll.vi v8, v8, 8 27; CHECK-NEXT: vor.vv v8, v8, v9 28; CHECK-NEXT: ret 29 %v = call <2 x i16> @llvm.vp.bswap.v2i16(<2 x i16> %va, <2 x i1> splat (i1 true), i32 %evl) 30 ret <2 x i16> %v 31} 32 33declare <4 x i16> @llvm.vp.bswap.v4i16(<4 x i16>, <4 x i1>, i32) 34 35define <4 x i16> @vp_bswap_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { 36; CHECK-LABEL: vp_bswap_v4i16: 37; CHECK: # %bb.0: 38; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma 39; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t 40; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t 41; CHECK-NEXT: vor.vv v8, v8, v9, v0.t 42; CHECK-NEXT: ret 43 %v = call <4 x i16> @llvm.vp.bswap.v4i16(<4 x i16> %va, <4 x i1> %m, i32 %evl) 44 ret <4 x i16> %v 45} 46 47define <4 x i16> @vp_bswap_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { 48; CHECK-LABEL: vp_bswap_v4i16_unmasked: 49; CHECK: # %bb.0: 50; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma 
51; CHECK-NEXT: vsrl.vi v9, v8, 8 52; CHECK-NEXT: vsll.vi v8, v8, 8 53; CHECK-NEXT: vor.vv v8, v8, v9 54; CHECK-NEXT: ret 55 %v = call <4 x i16> @llvm.vp.bswap.v4i16(<4 x i16> %va, <4 x i1> splat (i1 true), i32 %evl) 56 ret <4 x i16> %v 57} 58 59declare <8 x i16> @llvm.vp.bswap.v8i16(<8 x i16>, <8 x i1>, i32) 60 61define <8 x i16> @vp_bswap_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { 62; CHECK-LABEL: vp_bswap_v8i16: 63; CHECK: # %bb.0: 64; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma 65; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t 66; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t 67; CHECK-NEXT: vor.vv v8, v8, v9, v0.t 68; CHECK-NEXT: ret 69 %v = call <8 x i16> @llvm.vp.bswap.v8i16(<8 x i16> %va, <8 x i1> %m, i32 %evl) 70 ret <8 x i16> %v 71} 72 73define <8 x i16> @vp_bswap_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { 74; CHECK-LABEL: vp_bswap_v8i16_unmasked: 75; CHECK: # %bb.0: 76; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma 77; CHECK-NEXT: vsrl.vi v9, v8, 8 78; CHECK-NEXT: vsll.vi v8, v8, 8 79; CHECK-NEXT: vor.vv v8, v8, v9 80; CHECK-NEXT: ret 81 %v = call <8 x i16> @llvm.vp.bswap.v8i16(<8 x i16> %va, <8 x i1> splat (i1 true), i32 %evl) 82 ret <8 x i16> %v 83} 84 85declare <16 x i16> @llvm.vp.bswap.v16i16(<16 x i16>, <16 x i1>, i32) 86 87define <16 x i16> @vp_bswap_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) { 88; CHECK-LABEL: vp_bswap_v16i16: 89; CHECK: # %bb.0: 90; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma 91; CHECK-NEXT: vsrl.vi v10, v8, 8, v0.t 92; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t 93; CHECK-NEXT: vor.vv v8, v8, v10, v0.t 94; CHECK-NEXT: ret 95 %v = call <16 x i16> @llvm.vp.bswap.v16i16(<16 x i16> %va, <16 x i1> %m, i32 %evl) 96 ret <16 x i16> %v 97} 98 99define <16 x i16> @vp_bswap_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { 100; CHECK-LABEL: vp_bswap_v16i16_unmasked: 101; CHECK: # %bb.0: 102; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma 103; CHECK-NEXT: vsrl.vi v10, v8, 8 104; CHECK-NEXT: vsll.vi v8, v8, 8 105; CHECK-NEXT: vor.vv 
v8, v8, v10 106; CHECK-NEXT: ret 107 %v = call <16 x i16> @llvm.vp.bswap.v16i16(<16 x i16> %va, <16 x i1> splat (i1 true), i32 %evl) 108 ret <16 x i16> %v 109} 110 111declare <2 x i32> @llvm.vp.bswap.v2i32(<2 x i32>, <2 x i1>, i32) 112 113define <2 x i32> @vp_bswap_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { 114; CHECK-LABEL: vp_bswap_v2i32: 115; CHECK: # %bb.0: 116; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma 117; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t 118; CHECK-NEXT: lui a0, 16 119; CHECK-NEXT: addi a0, a0, -256 120; CHECK-NEXT: vand.vx v9, v9, a0, v0.t 121; CHECK-NEXT: vsrl.vi v10, v8, 24, v0.t 122; CHECK-NEXT: vor.vv v9, v9, v10, v0.t 123; CHECK-NEXT: vand.vx v10, v8, a0, v0.t 124; CHECK-NEXT: vsll.vi v10, v10, 8, v0.t 125; CHECK-NEXT: vsll.vi v8, v8, 24, v0.t 126; CHECK-NEXT: vor.vv v8, v8, v10, v0.t 127; CHECK-NEXT: vor.vv v8, v8, v9, v0.t 128; CHECK-NEXT: ret 129 %v = call <2 x i32> @llvm.vp.bswap.v2i32(<2 x i32> %va, <2 x i1> %m, i32 %evl) 130 ret <2 x i32> %v 131} 132 133define <2 x i32> @vp_bswap_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { 134; CHECK-LABEL: vp_bswap_v2i32_unmasked: 135; CHECK: # %bb.0: 136; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma 137; CHECK-NEXT: vsrl.vi v9, v8, 8 138; CHECK-NEXT: lui a0, 16 139; CHECK-NEXT: vsrl.vi v10, v8, 24 140; CHECK-NEXT: addi a0, a0, -256 141; CHECK-NEXT: vand.vx v9, v9, a0 142; CHECK-NEXT: vor.vv v9, v9, v10 143; CHECK-NEXT: vand.vx v10, v8, a0 144; CHECK-NEXT: vsll.vi v10, v10, 8 145; CHECK-NEXT: vsll.vi v8, v8, 24 146; CHECK-NEXT: vor.vv v8, v8, v10 147; CHECK-NEXT: vor.vv v8, v8, v9 148; CHECK-NEXT: ret 149 %v = call <2 x i32> @llvm.vp.bswap.v2i32(<2 x i32> %va, <2 x i1> splat (i1 true), i32 %evl) 150 ret <2 x i32> %v 151} 152 153declare <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32>, <4 x i1>, i32) 154 155define <4 x i32> @vp_bswap_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { 156; CHECK-LABEL: vp_bswap_v4i32: 157; CHECK: # %bb.0: 158; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma 
159; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t 160; CHECK-NEXT: lui a0, 16 161; CHECK-NEXT: addi a0, a0, -256 162; CHECK-NEXT: vand.vx v9, v9, a0, v0.t 163; CHECK-NEXT: vsrl.vi v10, v8, 24, v0.t 164; CHECK-NEXT: vor.vv v9, v9, v10, v0.t 165; CHECK-NEXT: vand.vx v10, v8, a0, v0.t 166; CHECK-NEXT: vsll.vi v10, v10, 8, v0.t 167; CHECK-NEXT: vsll.vi v8, v8, 24, v0.t 168; CHECK-NEXT: vor.vv v8, v8, v10, v0.t 169; CHECK-NEXT: vor.vv v8, v8, v9, v0.t 170; CHECK-NEXT: ret 171 %v = call <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl) 172 ret <4 x i32> %v 173} 174 175define <4 x i32> @vp_bswap_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { 176; CHECK-LABEL: vp_bswap_v4i32_unmasked: 177; CHECK: # %bb.0: 178; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma 179; CHECK-NEXT: vsrl.vi v9, v8, 8 180; CHECK-NEXT: lui a0, 16 181; CHECK-NEXT: vsrl.vi v10, v8, 24 182; CHECK-NEXT: addi a0, a0, -256 183; CHECK-NEXT: vand.vx v9, v9, a0 184; CHECK-NEXT: vor.vv v9, v9, v10 185; CHECK-NEXT: vand.vx v10, v8, a0 186; CHECK-NEXT: vsll.vi v10, v10, 8 187; CHECK-NEXT: vsll.vi v8, v8, 24 188; CHECK-NEXT: vor.vv v8, v8, v10 189; CHECK-NEXT: vor.vv v8, v8, v9 190; CHECK-NEXT: ret 191 %v = call <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32> %va, <4 x i1> splat (i1 true), i32 %evl) 192 ret <4 x i32> %v 193} 194 195declare <8 x i32> @llvm.vp.bswap.v8i32(<8 x i32>, <8 x i1>, i32) 196 197define <8 x i32> @vp_bswap_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { 198; CHECK-LABEL: vp_bswap_v8i32: 199; CHECK: # %bb.0: 200; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma 201; CHECK-NEXT: vsrl.vi v10, v8, 8, v0.t 202; CHECK-NEXT: lui a0, 16 203; CHECK-NEXT: addi a0, a0, -256 204; CHECK-NEXT: vand.vx v10, v10, a0, v0.t 205; CHECK-NEXT: vsrl.vi v12, v8, 24, v0.t 206; CHECK-NEXT: vor.vv v10, v10, v12, v0.t 207; CHECK-NEXT: vand.vx v12, v8, a0, v0.t 208; CHECK-NEXT: vsll.vi v12, v12, 8, v0.t 209; CHECK-NEXT: vsll.vi v8, v8, 24, v0.t 210; CHECK-NEXT: vor.vv v8, v8, v12, v0.t 211; CHECK-NEXT: vor.vv 
v8, v8, v10, v0.t 212; CHECK-NEXT: ret 213 %v = call <8 x i32> @llvm.vp.bswap.v8i32(<8 x i32> %va, <8 x i1> %m, i32 %evl) 214 ret <8 x i32> %v 215} 216 217define <8 x i32> @vp_bswap_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { 218; CHECK-LABEL: vp_bswap_v8i32_unmasked: 219; CHECK: # %bb.0: 220; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma 221; CHECK-NEXT: vsrl.vi v10, v8, 8 222; CHECK-NEXT: lui a0, 16 223; CHECK-NEXT: vsrl.vi v12, v8, 24 224; CHECK-NEXT: addi a0, a0, -256 225; CHECK-NEXT: vand.vx v10, v10, a0 226; CHECK-NEXT: vor.vv v10, v10, v12 227; CHECK-NEXT: vand.vx v12, v8, a0 228; CHECK-NEXT: vsll.vi v12, v12, 8 229; CHECK-NEXT: vsll.vi v8, v8, 24 230; CHECK-NEXT: vor.vv v8, v8, v12 231; CHECK-NEXT: vor.vv v8, v8, v10 232; CHECK-NEXT: ret 233 %v = call <8 x i32> @llvm.vp.bswap.v8i32(<8 x i32> %va, <8 x i1> splat (i1 true), i32 %evl) 234 ret <8 x i32> %v 235} 236 237declare <16 x i32> @llvm.vp.bswap.v16i32(<16 x i32>, <16 x i1>, i32) 238 239define <16 x i32> @vp_bswap_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) { 240; CHECK-LABEL: vp_bswap_v16i32: 241; CHECK: # %bb.0: 242; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma 243; CHECK-NEXT: vsrl.vi v12, v8, 8, v0.t 244; CHECK-NEXT: lui a0, 16 245; CHECK-NEXT: addi a0, a0, -256 246; CHECK-NEXT: vand.vx v12, v12, a0, v0.t 247; CHECK-NEXT: vsrl.vi v16, v8, 24, v0.t 248; CHECK-NEXT: vor.vv v12, v12, v16, v0.t 249; CHECK-NEXT: vand.vx v16, v8, a0, v0.t 250; CHECK-NEXT: vsll.vi v16, v16, 8, v0.t 251; CHECK-NEXT: vsll.vi v8, v8, 24, v0.t 252; CHECK-NEXT: vor.vv v8, v8, v16, v0.t 253; CHECK-NEXT: vor.vv v8, v8, v12, v0.t 254; CHECK-NEXT: ret 255 %v = call <16 x i32> @llvm.vp.bswap.v16i32(<16 x i32> %va, <16 x i1> %m, i32 %evl) 256 ret <16 x i32> %v 257} 258 259define <16 x i32> @vp_bswap_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { 260; CHECK-LABEL: vp_bswap_v16i32_unmasked: 261; CHECK: # %bb.0: 262; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma 263; CHECK-NEXT: vsrl.vi v12, v8, 8 264; CHECK-NEXT: 
lui a0, 16 265; CHECK-NEXT: vsrl.vi v16, v8, 24 266; CHECK-NEXT: addi a0, a0, -256 267; CHECK-NEXT: vand.vx v12, v12, a0 268; CHECK-NEXT: vor.vv v12, v12, v16 269; CHECK-NEXT: vand.vx v16, v8, a0 270; CHECK-NEXT: vsll.vi v16, v16, 8 271; CHECK-NEXT: vsll.vi v8, v8, 24 272; CHECK-NEXT: vor.vv v8, v8, v16 273; CHECK-NEXT: vor.vv v8, v8, v12 274; CHECK-NEXT: ret 275 %v = call <16 x i32> @llvm.vp.bswap.v16i32(<16 x i32> %va, <16 x i1> splat (i1 true), i32 %evl) 276 ret <16 x i32> %v 277} 278 279declare <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64>, <2 x i1>, i32) 280 281define <2 x i64> @vp_bswap_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { 282; RV32-LABEL: vp_bswap_v2i64: 283; RV32: # %bb.0: 284; RV32-NEXT: addi sp, sp, -16 285; RV32-NEXT: .cfi_def_cfa_offset 16 286; RV32-NEXT: lui a1, 1044480 287; RV32-NEXT: li a2, 56 288; RV32-NEXT: lui a3, 16 289; RV32-NEXT: li a4, 40 290; RV32-NEXT: lui a5, 4080 291; RV32-NEXT: addi a6, sp, 8 292; RV32-NEXT: sw a1, 8(sp) 293; RV32-NEXT: sw zero, 12(sp) 294; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma 295; RV32-NEXT: vsll.vx v9, v8, a2, v0.t 296; RV32-NEXT: addi a1, a3, -256 297; RV32-NEXT: vand.vx v10, v8, a1, v0.t 298; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma 299; RV32-NEXT: vlse64.v v11, (a6), zero 300; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma 301; RV32-NEXT: vsll.vx v10, v10, a4, v0.t 302; RV32-NEXT: vor.vv v9, v9, v10, v0.t 303; RV32-NEXT: vand.vx v10, v8, a5, v0.t 304; RV32-NEXT: vsll.vi v10, v10, 24, v0.t 305; RV32-NEXT: vand.vv v12, v8, v11, v0.t 306; RV32-NEXT: vsll.vi v12, v12, 8, v0.t 307; RV32-NEXT: vor.vv v10, v10, v12, v0.t 308; RV32-NEXT: vor.vv v9, v9, v10, v0.t 309; RV32-NEXT: vsrl.vx v10, v8, a2, v0.t 310; RV32-NEXT: vsrl.vx v12, v8, a4, v0.t 311; RV32-NEXT: vand.vx v12, v12, a1, v0.t 312; RV32-NEXT: vor.vv v10, v12, v10, v0.t 313; RV32-NEXT: vsrl.vi v12, v8, 24, v0.t 314; RV32-NEXT: vand.vx v12, v12, a5, v0.t 315; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t 316; RV32-NEXT: vand.vv v8, v8, v11, v0.t 317; 
RV32-NEXT: vor.vv v8, v8, v12, v0.t 318; RV32-NEXT: vor.vv v8, v8, v10, v0.t 319; RV32-NEXT: vor.vv v8, v9, v8, v0.t 320; RV32-NEXT: addi sp, sp, 16 321; RV32-NEXT: .cfi_def_cfa_offset 0 322; RV32-NEXT: ret 323; 324; RV64-LABEL: vp_bswap_v2i64: 325; RV64: # %bb.0: 326; RV64-NEXT: lui a1, 4080 327; RV64-NEXT: li a2, 255 328; RV64-NEXT: li a3, 56 329; RV64-NEXT: lui a4, 16 330; RV64-NEXT: li a5, 40 331; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma 332; RV64-NEXT: vand.vx v9, v8, a1, v0.t 333; RV64-NEXT: slli a2, a2, 24 334; RV64-NEXT: addiw a0, a4, -256 335; RV64-NEXT: vsll.vi v9, v9, 24, v0.t 336; RV64-NEXT: vand.vx v10, v8, a2, v0.t 337; RV64-NEXT: vsll.vi v10, v10, 8, v0.t 338; RV64-NEXT: vor.vv v9, v9, v10, v0.t 339; RV64-NEXT: vsll.vx v10, v8, a3, v0.t 340; RV64-NEXT: vand.vx v11, v8, a0, v0.t 341; RV64-NEXT: vsll.vx v11, v11, a5, v0.t 342; RV64-NEXT: vor.vv v10, v10, v11, v0.t 343; RV64-NEXT: vor.vv v9, v10, v9, v0.t 344; RV64-NEXT: vsrl.vx v10, v8, a3, v0.t 345; RV64-NEXT: vsrl.vx v11, v8, a5, v0.t 346; RV64-NEXT: vand.vx v11, v11, a0, v0.t 347; RV64-NEXT: vor.vv v10, v11, v10, v0.t 348; RV64-NEXT: vsrl.vi v11, v8, 24, v0.t 349; RV64-NEXT: vand.vx v11, v11, a1, v0.t 350; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t 351; RV64-NEXT: vand.vx v8, v8, a2, v0.t 352; RV64-NEXT: vor.vv v8, v8, v11, v0.t 353; RV64-NEXT: vor.vv v8, v8, v10, v0.t 354; RV64-NEXT: vor.vv v8, v9, v8, v0.t 355; RV64-NEXT: ret 356 %v = call <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64> %va, <2 x i1> %m, i32 %evl) 357 ret <2 x i64> %v 358} 359 360define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { 361; RV32-LABEL: vp_bswap_v2i64_unmasked: 362; RV32: # %bb.0: 363; RV32-NEXT: addi sp, sp, -16 364; RV32-NEXT: .cfi_def_cfa_offset 16 365; RV32-NEXT: lui a1, 1044480 366; RV32-NEXT: li a2, 56 367; RV32-NEXT: lui a3, 16 368; RV32-NEXT: li a4, 40 369; RV32-NEXT: lui a5, 4080 370; RV32-NEXT: addi a6, sp, 8 371; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma 372; RV32-NEXT: vsrl.vi v9, v8, 24 
373; RV32-NEXT: sw a1, 8(sp) 374; RV32-NEXT: sw zero, 12(sp) 375; RV32-NEXT: vsll.vx v10, v8, a2 376; RV32-NEXT: addi a1, a3, -256 377; RV32-NEXT: vsrl.vx v11, v8, a2 378; RV32-NEXT: vsrl.vx v12, v8, a4 379; RV32-NEXT: vand.vx v13, v8, a1 380; RV32-NEXT: vand.vx v12, v12, a1 381; RV32-NEXT: vor.vv v11, v12, v11 382; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma 383; RV32-NEXT: vlse64.v v12, (a6), zero 384; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma 385; RV32-NEXT: vsll.vx v13, v13, a4 386; RV32-NEXT: vor.vv v10, v10, v13 387; RV32-NEXT: vsrl.vi v13, v8, 8 388; RV32-NEXT: vand.vx v9, v9, a5 389; RV32-NEXT: vand.vv v13, v13, v12 390; RV32-NEXT: vor.vv v9, v13, v9 391; RV32-NEXT: vand.vv v12, v8, v12 392; RV32-NEXT: vand.vx v8, v8, a5 393; RV32-NEXT: vsll.vi v8, v8, 24 394; RV32-NEXT: vsll.vi v12, v12, 8 395; RV32-NEXT: vor.vv v8, v8, v12 396; RV32-NEXT: vor.vv v8, v10, v8 397; RV32-NEXT: vor.vv v9, v9, v11 398; RV32-NEXT: vor.vv v8, v8, v9 399; RV32-NEXT: addi sp, sp, 16 400; RV32-NEXT: .cfi_def_cfa_offset 0 401; RV32-NEXT: ret 402; 403; RV64-LABEL: vp_bswap_v2i64_unmasked: 404; RV64: # %bb.0: 405; RV64-NEXT: lui a1, 4080 406; RV64-NEXT: li a2, 255 407; RV64-NEXT: li a3, 56 408; RV64-NEXT: lui a4, 16 409; RV64-NEXT: li a5, 40 410; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma 411; RV64-NEXT: vsrl.vi v9, v8, 24 412; RV64-NEXT: vsrl.vi v10, v8, 8 413; RV64-NEXT: addiw a0, a4, -256 414; RV64-NEXT: vsrl.vx v11, v8, a3 415; RV64-NEXT: vsrl.vx v12, v8, a5 416; RV64-NEXT: vand.vx v12, v12, a0 417; RV64-NEXT: vor.vv v11, v12, v11 418; RV64-NEXT: vand.vx v12, v8, a1 419; RV64-NEXT: slli a2, a2, 24 420; RV64-NEXT: vand.vx v9, v9, a1 421; RV64-NEXT: vsll.vi v12, v12, 24 422; RV64-NEXT: vand.vx v10, v10, a2 423; RV64-NEXT: vor.vv v9, v10, v9 424; RV64-NEXT: vand.vx v10, v8, a2 425; RV64-NEXT: vsll.vi v10, v10, 8 426; RV64-NEXT: vor.vv v10, v12, v10 427; RV64-NEXT: vsll.vx v12, v8, a3 428; RV64-NEXT: vand.vx v8, v8, a0 429; RV64-NEXT: vsll.vx v8, v8, a5 430; RV64-NEXT: vor.vv v8, 
v12, v8 431; RV64-NEXT: vor.vv v8, v8, v10 432; RV64-NEXT: vor.vv v9, v9, v11 433; RV64-NEXT: vor.vv v8, v8, v9 434; RV64-NEXT: ret 435 %v = call <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64> %va, <2 x i1> splat (i1 true), i32 %evl) 436 ret <2 x i64> %v 437} 438 439declare <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64>, <4 x i1>, i32) 440 441define <4 x i64> @vp_bswap_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { 442; RV32-LABEL: vp_bswap_v4i64: 443; RV32: # %bb.0: 444; RV32-NEXT: addi sp, sp, -16 445; RV32-NEXT: .cfi_def_cfa_offset 16 446; RV32-NEXT: lui a1, 1044480 447; RV32-NEXT: li a2, 56 448; RV32-NEXT: lui a3, 16 449; RV32-NEXT: li a4, 40 450; RV32-NEXT: lui a5, 4080 451; RV32-NEXT: addi a6, sp, 8 452; RV32-NEXT: sw a1, 8(sp) 453; RV32-NEXT: sw zero, 12(sp) 454; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma 455; RV32-NEXT: vsll.vx v10, v8, a2, v0.t 456; RV32-NEXT: addi a1, a3, -256 457; RV32-NEXT: vand.vx v12, v8, a1, v0.t 458; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma 459; RV32-NEXT: vlse64.v v14, (a6), zero 460; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma 461; RV32-NEXT: vsll.vx v12, v12, a4, v0.t 462; RV32-NEXT: vor.vv v10, v10, v12, v0.t 463; RV32-NEXT: vand.vx v12, v8, a5, v0.t 464; RV32-NEXT: vsll.vi v12, v12, 24, v0.t 465; RV32-NEXT: vand.vv v16, v8, v14, v0.t 466; RV32-NEXT: vsll.vi v16, v16, 8, v0.t 467; RV32-NEXT: vor.vv v12, v12, v16, v0.t 468; RV32-NEXT: vor.vv v10, v10, v12, v0.t 469; RV32-NEXT: vsrl.vx v12, v8, a2, v0.t 470; RV32-NEXT: vsrl.vx v16, v8, a4, v0.t 471; RV32-NEXT: vand.vx v16, v16, a1, v0.t 472; RV32-NEXT: vor.vv v12, v16, v12, v0.t 473; RV32-NEXT: vsrl.vi v16, v8, 24, v0.t 474; RV32-NEXT: vand.vx v16, v16, a5, v0.t 475; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t 476; RV32-NEXT: vand.vv v8, v8, v14, v0.t 477; RV32-NEXT: vor.vv v8, v8, v16, v0.t 478; RV32-NEXT: vor.vv v8, v8, v12, v0.t 479; RV32-NEXT: vor.vv v8, v10, v8, v0.t 480; RV32-NEXT: addi sp, sp, 16 481; RV32-NEXT: .cfi_def_cfa_offset 0 482; RV32-NEXT: ret 483; 484; RV64-LABEL: 
vp_bswap_v4i64: 485; RV64: # %bb.0: 486; RV64-NEXT: lui a1, 4080 487; RV64-NEXT: li a2, 255 488; RV64-NEXT: li a3, 56 489; RV64-NEXT: lui a4, 16 490; RV64-NEXT: li a5, 40 491; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma 492; RV64-NEXT: vand.vx v10, v8, a1, v0.t 493; RV64-NEXT: slli a2, a2, 24 494; RV64-NEXT: addiw a0, a4, -256 495; RV64-NEXT: vsll.vi v10, v10, 24, v0.t 496; RV64-NEXT: vand.vx v12, v8, a2, v0.t 497; RV64-NEXT: vsll.vi v12, v12, 8, v0.t 498; RV64-NEXT: vor.vv v10, v10, v12, v0.t 499; RV64-NEXT: vsll.vx v12, v8, a3, v0.t 500; RV64-NEXT: vand.vx v14, v8, a0, v0.t 501; RV64-NEXT: vsll.vx v14, v14, a5, v0.t 502; RV64-NEXT: vor.vv v12, v12, v14, v0.t 503; RV64-NEXT: vor.vv v10, v12, v10, v0.t 504; RV64-NEXT: vsrl.vx v12, v8, a3, v0.t 505; RV64-NEXT: vsrl.vx v14, v8, a5, v0.t 506; RV64-NEXT: vand.vx v14, v14, a0, v0.t 507; RV64-NEXT: vor.vv v12, v14, v12, v0.t 508; RV64-NEXT: vsrl.vi v14, v8, 24, v0.t 509; RV64-NEXT: vand.vx v14, v14, a1, v0.t 510; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t 511; RV64-NEXT: vand.vx v8, v8, a2, v0.t 512; RV64-NEXT: vor.vv v8, v8, v14, v0.t 513; RV64-NEXT: vor.vv v8, v8, v12, v0.t 514; RV64-NEXT: vor.vv v8, v10, v8, v0.t 515; RV64-NEXT: ret 516 %v = call <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64> %va, <4 x i1> %m, i32 %evl) 517 ret <4 x i64> %v 518} 519 520define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { 521; RV32-LABEL: vp_bswap_v4i64_unmasked: 522; RV32: # %bb.0: 523; RV32-NEXT: addi sp, sp, -16 524; RV32-NEXT: .cfi_def_cfa_offset 16 525; RV32-NEXT: lui a1, 1044480 526; RV32-NEXT: li a2, 56 527; RV32-NEXT: lui a3, 16 528; RV32-NEXT: li a4, 40 529; RV32-NEXT: lui a5, 4080 530; RV32-NEXT: addi a6, sp, 8 531; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma 532; RV32-NEXT: vsrl.vi v10, v8, 24 533; RV32-NEXT: sw a1, 8(sp) 534; RV32-NEXT: sw zero, 12(sp) 535; RV32-NEXT: vsll.vx v12, v8, a2 536; RV32-NEXT: addi a1, a3, -256 537; RV32-NEXT: vsrl.vx v14, v8, a2 538; RV32-NEXT: vsrl.vx v16, v8, a4 539; RV32-NEXT: 
vand.vx v18, v8, a1 540; RV32-NEXT: vand.vx v16, v16, a1 541; RV32-NEXT: vor.vv v14, v16, v14 542; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma 543; RV32-NEXT: vlse64.v v16, (a6), zero 544; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma 545; RV32-NEXT: vsll.vx v18, v18, a4 546; RV32-NEXT: vor.vv v12, v12, v18 547; RV32-NEXT: vsrl.vi v18, v8, 8 548; RV32-NEXT: vand.vx v10, v10, a5 549; RV32-NEXT: vand.vv v18, v18, v16 550; RV32-NEXT: vor.vv v10, v18, v10 551; RV32-NEXT: vand.vv v16, v8, v16 552; RV32-NEXT: vand.vx v8, v8, a5 553; RV32-NEXT: vsll.vi v8, v8, 24 554; RV32-NEXT: vsll.vi v16, v16, 8 555; RV32-NEXT: vor.vv v8, v8, v16 556; RV32-NEXT: vor.vv v8, v12, v8 557; RV32-NEXT: vor.vv v10, v10, v14 558; RV32-NEXT: vor.vv v8, v8, v10 559; RV32-NEXT: addi sp, sp, 16 560; RV32-NEXT: .cfi_def_cfa_offset 0 561; RV32-NEXT: ret 562; 563; RV64-LABEL: vp_bswap_v4i64_unmasked: 564; RV64: # %bb.0: 565; RV64-NEXT: lui a1, 4080 566; RV64-NEXT: li a2, 255 567; RV64-NEXT: li a3, 56 568; RV64-NEXT: lui a4, 16 569; RV64-NEXT: li a5, 40 570; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma 571; RV64-NEXT: vsrl.vi v10, v8, 24 572; RV64-NEXT: vsrl.vi v12, v8, 8 573; RV64-NEXT: addiw a0, a4, -256 574; RV64-NEXT: vsrl.vx v14, v8, a3 575; RV64-NEXT: vsrl.vx v16, v8, a5 576; RV64-NEXT: vand.vx v16, v16, a0 577; RV64-NEXT: vor.vv v14, v16, v14 578; RV64-NEXT: vand.vx v16, v8, a1 579; RV64-NEXT: slli a2, a2, 24 580; RV64-NEXT: vand.vx v10, v10, a1 581; RV64-NEXT: vsll.vi v16, v16, 24 582; RV64-NEXT: vand.vx v12, v12, a2 583; RV64-NEXT: vor.vv v10, v12, v10 584; RV64-NEXT: vand.vx v12, v8, a2 585; RV64-NEXT: vsll.vi v12, v12, 8 586; RV64-NEXT: vor.vv v12, v16, v12 587; RV64-NEXT: vsll.vx v16, v8, a3 588; RV64-NEXT: vand.vx v8, v8, a0 589; RV64-NEXT: vsll.vx v8, v8, a5 590; RV64-NEXT: vor.vv v8, v16, v8 591; RV64-NEXT: vor.vv v8, v8, v12 592; RV64-NEXT: vor.vv v10, v10, v14 593; RV64-NEXT: vor.vv v8, v8, v10 594; RV64-NEXT: ret 595 %v = call <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64> %va, <4 x i1> 
splat (i1 true), i32 %evl) 596 ret <4 x i64> %v 597} 598 599declare <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64>, <8 x i1>, i32) 600 601define <8 x i64> @vp_bswap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { 602; RV32-LABEL: vp_bswap_v8i64: 603; RV32: # %bb.0: 604; RV32-NEXT: addi sp, sp, -16 605; RV32-NEXT: .cfi_def_cfa_offset 16 606; RV32-NEXT: lui a1, 1044480 607; RV32-NEXT: li a2, 56 608; RV32-NEXT: lui a3, 16 609; RV32-NEXT: li a4, 40 610; RV32-NEXT: lui a5, 4080 611; RV32-NEXT: addi a6, sp, 8 612; RV32-NEXT: sw a1, 8(sp) 613; RV32-NEXT: sw zero, 12(sp) 614; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma 615; RV32-NEXT: vsll.vx v16, v8, a2, v0.t 616; RV32-NEXT: addi a1, a3, -256 617; RV32-NEXT: vand.vx v20, v8, a1, v0.t 618; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma 619; RV32-NEXT: vlse64.v v12, (a6), zero 620; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma 621; RV32-NEXT: vsll.vx v20, v20, a4, v0.t 622; RV32-NEXT: vor.vv v16, v16, v20, v0.t 623; RV32-NEXT: vand.vx v20, v8, a5, v0.t 624; RV32-NEXT: vsll.vi v20, v20, 24, v0.t 625; RV32-NEXT: vand.vv v24, v8, v12, v0.t 626; RV32-NEXT: vsll.vi v24, v24, 8, v0.t 627; RV32-NEXT: vor.vv v20, v20, v24, v0.t 628; RV32-NEXT: vor.vv v16, v16, v20, v0.t 629; RV32-NEXT: vsrl.vx v20, v8, a2, v0.t 630; RV32-NEXT: vsrl.vx v24, v8, a4, v0.t 631; RV32-NEXT: vand.vx v24, v24, a1, v0.t 632; RV32-NEXT: vor.vv v20, v24, v20, v0.t 633; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t 634; RV32-NEXT: vand.vx v24, v24, a5, v0.t 635; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t 636; RV32-NEXT: vand.vv v8, v8, v12, v0.t 637; RV32-NEXT: vor.vv v8, v8, v24, v0.t 638; RV32-NEXT: vor.vv v8, v8, v20, v0.t 639; RV32-NEXT: vor.vv v8, v16, v8, v0.t 640; RV32-NEXT: addi sp, sp, 16 641; RV32-NEXT: .cfi_def_cfa_offset 0 642; RV32-NEXT: ret 643; 644; RV64-LABEL: vp_bswap_v8i64: 645; RV64: # %bb.0: 646; RV64-NEXT: lui a1, 4080 647; RV64-NEXT: li a2, 255 648; RV64-NEXT: li a3, 56 649; RV64-NEXT: lui a4, 16 650; RV64-NEXT: li a5, 40 651; RV64-NEXT: vsetvli zero, a0, 
e64, m4, ta, ma 652; RV64-NEXT: vand.vx v12, v8, a1, v0.t 653; RV64-NEXT: slli a2, a2, 24 654; RV64-NEXT: addiw a0, a4, -256 655; RV64-NEXT: vsll.vi v12, v12, 24, v0.t 656; RV64-NEXT: vand.vx v16, v8, a2, v0.t 657; RV64-NEXT: vsll.vi v16, v16, 8, v0.t 658; RV64-NEXT: vor.vv v12, v12, v16, v0.t 659; RV64-NEXT: vsll.vx v16, v8, a3, v0.t 660; RV64-NEXT: vand.vx v20, v8, a0, v0.t 661; RV64-NEXT: vsll.vx v20, v20, a5, v0.t 662; RV64-NEXT: vor.vv v16, v16, v20, v0.t 663; RV64-NEXT: vor.vv v12, v16, v12, v0.t 664; RV64-NEXT: vsrl.vx v16, v8, a3, v0.t 665; RV64-NEXT: vsrl.vx v20, v8, a5, v0.t 666; RV64-NEXT: vand.vx v20, v20, a0, v0.t 667; RV64-NEXT: vor.vv v16, v20, v16, v0.t 668; RV64-NEXT: vsrl.vi v20, v8, 24, v0.t 669; RV64-NEXT: vand.vx v20, v20, a1, v0.t 670; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t 671; RV64-NEXT: vand.vx v8, v8, a2, v0.t 672; RV64-NEXT: vor.vv v8, v8, v20, v0.t 673; RV64-NEXT: vor.vv v8, v8, v16, v0.t 674; RV64-NEXT: vor.vv v8, v12, v8, v0.t 675; RV64-NEXT: ret 676 %v = call <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64> %va, <8 x i1> %m, i32 %evl) 677 ret <8 x i64> %v 678} 679 680define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { 681; RV32-LABEL: vp_bswap_v8i64_unmasked: 682; RV32: # %bb.0: 683; RV32-NEXT: addi sp, sp, -16 684; RV32-NEXT: .cfi_def_cfa_offset 16 685; RV32-NEXT: lui a1, 1044480 686; RV32-NEXT: li a2, 56 687; RV32-NEXT: lui a3, 16 688; RV32-NEXT: li a4, 40 689; RV32-NEXT: lui a5, 4080 690; RV32-NEXT: addi a6, sp, 8 691; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma 692; RV32-NEXT: vsrl.vi v12, v8, 24 693; RV32-NEXT: sw a1, 8(sp) 694; RV32-NEXT: sw zero, 12(sp) 695; RV32-NEXT: vsll.vx v16, v8, a2 696; RV32-NEXT: addi a1, a3, -256 697; RV32-NEXT: vsrl.vx v20, v8, a2 698; RV32-NEXT: vsrl.vx v24, v8, a4 699; RV32-NEXT: vand.vx v28, v8, a1 700; RV32-NEXT: vand.vx v24, v24, a1 701; RV32-NEXT: vor.vv v20, v24, v20 702; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma 703; RV32-NEXT: vlse64.v v24, (a6), zero 704; RV32-NEXT: vsetvli 
zero, a0, e64, m4, ta, ma 705; RV32-NEXT: vsll.vx v28, v28, a4 706; RV32-NEXT: vor.vv v16, v16, v28 707; RV32-NEXT: vsrl.vi v28, v8, 8 708; RV32-NEXT: vand.vx v12, v12, a5 709; RV32-NEXT: vand.vv v28, v28, v24 710; RV32-NEXT: vor.vv v12, v28, v12 711; RV32-NEXT: vand.vv v24, v8, v24 712; RV32-NEXT: vand.vx v8, v8, a5 713; RV32-NEXT: vsll.vi v8, v8, 24 714; RV32-NEXT: vsll.vi v24, v24, 8 715; RV32-NEXT: vor.vv v8, v8, v24 716; RV32-NEXT: vor.vv v8, v16, v8 717; RV32-NEXT: vor.vv v12, v12, v20 718; RV32-NEXT: vor.vv v8, v8, v12 719; RV32-NEXT: addi sp, sp, 16 720; RV32-NEXT: .cfi_def_cfa_offset 0 721; RV32-NEXT: ret 722; 723; RV64-LABEL: vp_bswap_v8i64_unmasked: 724; RV64: # %bb.0: 725; RV64-NEXT: lui a1, 4080 726; RV64-NEXT: li a2, 255 727; RV64-NEXT: li a3, 56 728; RV64-NEXT: lui a4, 16 729; RV64-NEXT: li a5, 40 730; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma 731; RV64-NEXT: vsrl.vi v12, v8, 24 732; RV64-NEXT: vsrl.vi v16, v8, 8 733; RV64-NEXT: addiw a0, a4, -256 734; RV64-NEXT: vsrl.vx v20, v8, a3 735; RV64-NEXT: vsrl.vx v24, v8, a5 736; RV64-NEXT: vand.vx v24, v24, a0 737; RV64-NEXT: vor.vv v20, v24, v20 738; RV64-NEXT: vand.vx v24, v8, a1 739; RV64-NEXT: slli a2, a2, 24 740; RV64-NEXT: vand.vx v12, v12, a1 741; RV64-NEXT: vsll.vi v24, v24, 24 742; RV64-NEXT: vand.vx v16, v16, a2 743; RV64-NEXT: vor.vv v12, v16, v12 744; RV64-NEXT: vand.vx v16, v8, a2 745; RV64-NEXT: vsll.vi v16, v16, 8 746; RV64-NEXT: vor.vv v16, v24, v16 747; RV64-NEXT: vsll.vx v24, v8, a3 748; RV64-NEXT: vand.vx v8, v8, a0 749; RV64-NEXT: vsll.vx v8, v8, a5 750; RV64-NEXT: vor.vv v8, v24, v8 751; RV64-NEXT: vor.vv v8, v8, v16 752; RV64-NEXT: vor.vv v12, v12, v20 753; RV64-NEXT: vor.vv v8, v8, v12 754; RV64-NEXT: ret 755 %v = call <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64> %va, <8 x i1> splat (i1 true), i32 %evl) 756 ret <8 x i64> %v 757} 758 759declare <15 x i64> @llvm.vp.bswap.v15i64(<15 x i64>, <15 x i1>, i32) 760 761define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 
zeroext %evl) { 762; RV32-LABEL: vp_bswap_v15i64: 763; RV32: # %bb.0: 764; RV32-NEXT: addi sp, sp, -16 765; RV32-NEXT: .cfi_def_cfa_offset 16 766; RV32-NEXT: csrr a1, vlenb 767; RV32-NEXT: li a2, 24 768; RV32-NEXT: mul a1, a1, a2 769; RV32-NEXT: sub sp, sp, a1 770; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb 771; RV32-NEXT: lui a1, 1044480 772; RV32-NEXT: li a2, 56 773; RV32-NEXT: lui a3, 16 774; RV32-NEXT: li a4, 40 775; RV32-NEXT: addi a5, sp, 8 776; RV32-NEXT: sw a1, 8(sp) 777; RV32-NEXT: sw zero, 12(sp) 778; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma 779; RV32-NEXT: vsll.vx v16, v8, a2, v0.t 780; RV32-NEXT: addi a1, a3, -256 781; RV32-NEXT: vand.vx v24, v8, a1, v0.t 782; RV32-NEXT: vsll.vx v24, v24, a4, v0.t 783; RV32-NEXT: vor.vv v16, v16, v24, v0.t 784; RV32-NEXT: csrr a3, vlenb 785; RV32-NEXT: slli a3, a3, 4 786; RV32-NEXT: add a3, sp, a3 787; RV32-NEXT: addi a3, a3, 16 788; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill 789; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 790; RV32-NEXT: vlse64.v v16, (a5), zero 791; RV32-NEXT: csrr a3, vlenb 792; RV32-NEXT: slli a3, a3, 3 793; RV32-NEXT: add a3, sp, a3 794; RV32-NEXT: addi a3, a3, 16 795; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill 796; RV32-NEXT: lui a3, 4080 797; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma 798; RV32-NEXT: vand.vx v24, v8, a3, v0.t 799; RV32-NEXT: vsll.vi v24, v24, 24, v0.t 800; RV32-NEXT: addi a0, sp, 16 801; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill 802; RV32-NEXT: vand.vv v24, v8, v16, v0.t 803; RV32-NEXT: vsll.vi v16, v24, 8, v0.t 804; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload 805; RV32-NEXT: vor.vv v16, v24, v16, v0.t 806; RV32-NEXT: csrr a0, vlenb 807; RV32-NEXT: slli a0, a0, 4 808; RV32-NEXT: add a0, sp, a0 809; RV32-NEXT: addi a0, a0, 16 810; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload 811; RV32-NEXT: vor.vv v16, v24, v16, v0.t 
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 4
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 3
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 4
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    li a1, 24
; RV32-NEXT:    mul a0, a0, a1
; RV32-NEXT:    add sp, sp, a0
; RV32-NEXT:    .cfi_def_cfa sp, 16
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vp_bswap_v15i64:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    .cfi_def_cfa_offset 16
; RV64-NEXT:    csrr a1, vlenb
; RV64-NEXT:    slli a1, a1, 3
; RV64-NEXT:    sub sp, sp, a1
; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV64-NEXT:    lui a1, 4080
; RV64-NEXT:    li a2, 255
; RV64-NEXT:    li a3, 56
; RV64-NEXT:    lui a4, 16
; RV64-NEXT:    li a5, 40
; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
; RV64-NEXT:    slli a2, a2, 24
; RV64-NEXT:    addiw a0, a4, -256
; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
; RV64-NEXT:    addi a4, sp, 16
; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
; RV64-NEXT:    addi a0, sp, 16
; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
; RV64-NEXT:    csrr a0, vlenb
; RV64-NEXT:    slli a0, a0, 3
; RV64-NEXT:    add sp, sp, a0
; RV64-NEXT:    .cfi_def_cfa sp, 16
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %v = call <15 x i64> @llvm.vp.bswap.v15i64(<15 x i64> %va, <15 x i1> %m, i32 %evl)
  ret <15 x i64> %v
}

; NOTE(review): vp.bswap on a non-power-of-two <15 x i64> vector with an
; all-true mask (the "unmasked" lowering path). Assertions below are
; autogenerated; regenerate with utils/update_llc_test_checks.py rather
; than editing by hand.
define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_bswap_v15i64_unmasked:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    csrr a1, vlenb
; RV32-NEXT:    slli a1, a1, 4
; RV32-NEXT:    sub sp, sp, a1
; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT:    lui a1, 1044480
; RV32-NEXT:    li a2, 56
; RV32-NEXT:    lui a3, 16
; RV32-NEXT:    li a4, 40
; RV32-NEXT:    lui a5, 4080
; RV32-NEXT:    addi a6, sp, 8
; RV32-NEXT:    sw a1, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT:    vsll.vx v24, v8, a2
; RV32-NEXT:    addi a1, a3, -256
; RV32-NEXT:    vsrl.vx v16, v8, a2
; RV32-NEXT:    vsrl.vx v0, v8, a4
; RV32-NEXT:    vand.vx v0, v0, a1
; RV32-NEXT:    vor.vv v16, v0, v16
; RV32-NEXT:    csrr a2, vlenb
; RV32-NEXT:    slli a2, a2, 3
; RV32-NEXT:    add a2, sp, a2
; RV32-NEXT:    addi a2, a2, 16
; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT:    vand.vx v0, v8, a1
; RV32-NEXT:    vsll.vx v0, v0, a4
; RV32-NEXT:    vor.vv v16, v24, v0
; RV32-NEXT:    addi a1, sp, 16
; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vlse64.v v0, (a6), zero
; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT:    vsrl.vi v16, v8, 24
; RV32-NEXT:    vand.vx v16, v16, a5
; RV32-NEXT:    vsrl.vi v24, v8, 8
; RV32-NEXT:    vand.vv v24, v24, v0
; RV32-NEXT:    vor.vv v16, v24, v16
; RV32-NEXT:    vand.vv v24, v8, v0
; RV32-NEXT:    vand.vx v8, v8, a5
; RV32-NEXT:    vsll.vi v8, v8, 24
; RV32-NEXT:    vsll.vi v24, v24, 8
; RV32-NEXT:    vor.vv v8, v8, v24
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v8, v24, v8
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 3
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v16, v16, v24
; RV32-NEXT:    vor.vv v8, v8, v16
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 4
; RV32-NEXT:    add sp, sp, a0
; RV32-NEXT:    .cfi_def_cfa sp, 16
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vp_bswap_v15i64_unmasked:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    .cfi_def_cfa_offset 16
; RV64-NEXT:    csrr a1, vlenb
; RV64-NEXT:    slli a1, a1, 3
; RV64-NEXT:    sub sp, sp, a1
; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV64-NEXT:    lui a1, 4080
; RV64-NEXT:    li a2, 255
; RV64-NEXT:    li a3, 56
; RV64-NEXT:    lui a4, 16
; RV64-NEXT:    li a5, 40
; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT:    vsrl.vi v24, v8, 24
; RV64-NEXT:    addiw a0, a4, -256
; RV64-NEXT:    vsrl.vx v16, v8, a3
; RV64-NEXT:    vsrl.vx v0, v8, a5
; RV64-NEXT:    vand.vx v0, v0, a0
; RV64-NEXT:    vor.vv v16, v0, v16
; RV64-NEXT:    addi a4, sp, 16
; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV64-NEXT:    vsrl.vi v0, v8, 8
; RV64-NEXT:    slli a2, a2, 24
; RV64-NEXT:    vand.vx v24, v24, a1
; RV64-NEXT:    vand.vx v0, v0, a2
; RV64-NEXT:    vor.vv v24, v0, v24
; RV64-NEXT:    vand.vx v0, v8, a1
; RV64-NEXT:    vsll.vi v0, v0, 24
; RV64-NEXT:    vand.vx v16, v8, a2
; RV64-NEXT:    vsll.vi v16, v16, 8
; RV64-NEXT:    vor.vv v16, v0, v16
; RV64-NEXT:    vsll.vx v0, v8, a3
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vsll.vx v8, v8, a5
; RV64-NEXT:    vor.vv v8, v0, v8
; RV64-NEXT:    vor.vv v8, v8, v16
; RV64-NEXT:    addi a0, sp, 16
; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT:    vor.vv v16, v24, v16
; RV64-NEXT:    vor.vv v8, v8, v16
; RV64-NEXT:    csrr a0, vlenb
; RV64-NEXT:    slli a0, a0, 3
; RV64-NEXT:    add sp, sp, a0
; RV64-NEXT:    .cfi_def_cfa sp, 16
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %v = call <15 x i64> @llvm.vp.bswap.v15i64(<15 x i64> %va, <15 x i1> splat (i1 true), i32 %evl)
  ret <15 x i64> %v
}

declare <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64>, <16 x i1>, i32)

; NOTE(review): masked vp.bswap on <16 x i64>; i64 element splits RV32/RV64
; expectations (RV32 needs the byte-swap mask built on the stack, see the
; sw/vlse64 sequence). Autogenerated assertions — regenerate, don't hand-edit.
define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_bswap_v16i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    csrr a1, vlenb
; RV32-NEXT:    li a2, 24
; RV32-NEXT:    mul a1, a1, a2
; RV32-NEXT:    sub sp, sp, a1
; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV32-NEXT:    lui a1, 1044480
; RV32-NEXT:    li a2, 56
; RV32-NEXT:    lui a3, 16
; RV32-NEXT:    li a4, 40
; RV32-NEXT:    addi a5, sp, 8
; RV32-NEXT:    sw a1, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
; RV32-NEXT:    addi a1, a3, -256
; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
; RV32-NEXT:    csrr a3, vlenb
; RV32-NEXT:    slli a3, a3, 4
; RV32-NEXT:    add a3, sp, a3
; RV32-NEXT:    addi a3, a3, 16
; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vlse64.v v16, (a5), zero
; RV32-NEXT:    csrr a3, vlenb
; RV32-NEXT:    slli a3, a3, 3
; RV32-NEXT:    add a3, sp, a3
; RV32-NEXT:    addi a3, a3, 16
; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT:    lui a3, 4080
; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 4
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 4
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 3
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 4
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    li a1, 24
; RV32-NEXT:    mul a0, a0, a1
; RV32-NEXT:    add sp, sp, a0
; RV32-NEXT:    .cfi_def_cfa sp, 16
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vp_bswap_v16i64:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    .cfi_def_cfa_offset 16
; RV64-NEXT:    csrr a1, vlenb
; RV64-NEXT:    slli a1, a1, 3
; RV64-NEXT:    sub sp, sp, a1
; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV64-NEXT:    lui a1, 4080
; RV64-NEXT:    li a2, 255
; RV64-NEXT:    li a3, 56
; RV64-NEXT:    lui a4, 16
; RV64-NEXT:    li a5, 40
; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
; RV64-NEXT:    slli a2, a2, 24
; RV64-NEXT:    addiw a0, a4, -256
; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
; RV64-NEXT:    addi a4, sp, 16
; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
; RV64-NEXT:    addi a0, sp, 16
; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
; RV64-NEXT:    csrr a0, vlenb
; RV64-NEXT:    slli a0, a0, 3
; RV64-NEXT:    add sp, sp, a0
; RV64-NEXT:    .cfi_def_cfa sp, 16
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %v = call <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64> %va, <16 x i1> %m, i32 %evl)
  ret <16 x i64> %v
}

; NOTE(review): all-true-mask variant of the <16 x i64> case above.
define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_bswap_v16i64_unmasked:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    csrr a1, vlenb
; RV32-NEXT:    slli a1, a1, 4
; RV32-NEXT:    sub sp, sp, a1
; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT:    lui a1, 1044480
; RV32-NEXT:    li a2, 56
; RV32-NEXT:    lui a3, 16
; RV32-NEXT:    li a4, 40
; RV32-NEXT:    lui a5, 4080
; RV32-NEXT:    addi a6, sp, 8
; RV32-NEXT:    sw a1, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT:    vsll.vx v24, v8, a2
; RV32-NEXT:    addi a1, a3, -256
; RV32-NEXT:    vsrl.vx v16, v8, a2
; RV32-NEXT:    vsrl.vx v0, v8, a4
; RV32-NEXT:    vand.vx v0, v0, a1
; RV32-NEXT:    vor.vv v16, v0, v16
; RV32-NEXT:    csrr a2, vlenb
; RV32-NEXT:    slli a2, a2, 3
; RV32-NEXT:    add a2, sp, a2
; RV32-NEXT:    addi a2, a2, 16
; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT:    vand.vx v0, v8, a1
; RV32-NEXT:    vsll.vx v0, v0, a4
; RV32-NEXT:    vor.vv v16, v24, v0
; RV32-NEXT:    addi a1, sp, 16
; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vlse64.v v0, (a6), zero
; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT:    vsrl.vi v16, v8, 24
; RV32-NEXT:    vand.vx v16, v16, a5
; RV32-NEXT:    vsrl.vi v24, v8, 8
; RV32-NEXT:    vand.vv v24, v24, v0
; RV32-NEXT:    vor.vv v16, v24, v16
; RV32-NEXT:    vand.vv v24, v8, v0
; RV32-NEXT:    vand.vx v8, v8, a5
; RV32-NEXT:    vsll.vi v8, v8, 24
; RV32-NEXT:    vsll.vi v24, v24, 8
; RV32-NEXT:    vor.vv v8, v8, v24
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v8, v24, v8
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 3
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v16, v16, v24
; RV32-NEXT:    vor.vv v8, v8, v16
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 4
; RV32-NEXT:    add sp, sp, a0
; RV32-NEXT:    .cfi_def_cfa sp, 16
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vp_bswap_v16i64_unmasked:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    .cfi_def_cfa_offset 16
; RV64-NEXT:    csrr a1, vlenb
; RV64-NEXT:    slli a1, a1, 3
; RV64-NEXT:    sub sp, sp, a1
; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV64-NEXT:    lui a1, 4080
; RV64-NEXT:    li a2, 255
; RV64-NEXT:    li a3, 56
; RV64-NEXT:    lui a4, 16
; RV64-NEXT:    li a5, 40
; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT:    vsrl.vi v24, v8, 24
; RV64-NEXT:    addiw a0, a4, -256
; RV64-NEXT:    vsrl.vx v16, v8, a3
; RV64-NEXT:    vsrl.vx v0, v8, a5
; RV64-NEXT:    vand.vx v0, v0, a0
; RV64-NEXT:    vor.vv v16, v0, v16
; RV64-NEXT:    addi a4, sp, 16
; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV64-NEXT:    vsrl.vi v0, v8, 8
; RV64-NEXT:    slli a2, a2, 24
; RV64-NEXT:    vand.vx v24, v24, a1
; RV64-NEXT:    vand.vx v0, v0, a2
; RV64-NEXT:    vor.vv v24, v0, v24
; RV64-NEXT:    vand.vx v0, v8, a1
; RV64-NEXT:    vsll.vi v0, v0, 24
; RV64-NEXT:    vand.vx v16, v8, a2
; RV64-NEXT:    vsll.vi v16, v16, 8
; RV64-NEXT:    vor.vv v16, v0, v16
; RV64-NEXT:    vsll.vx v0, v8, a3
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vsll.vx v8, v8, a5
; RV64-NEXT:    vor.vv v8, v0, v8
; RV64-NEXT:    vor.vv v8, v8, v16
; RV64-NEXT:    addi a0, sp, 16
; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT:    vor.vv v16, v24, v16
; RV64-NEXT:    vor.vv v8, v8, v16
; RV64-NEXT:    csrr a0, vlenb
; RV64-NEXT:    slli a0, a0, 3
; RV64-NEXT:    add sp, sp, a0
; RV64-NEXT:    .cfi_def_cfa sp, 16
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %v = call <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64> %va, <16 x i1> splat (i1 true), i32 %evl)
  ret <16 x i64> %v
}

declare <128 x i16> @llvm.vp.bswap.v128i16(<128 x i16>, <128 x i1>, i32)

; NOTE(review): 128 x i16 exceeds one m8 register group, so the lowering
; splits the operation at 64 elements (the bltu/.LBB26_2 EVL clamp) and
; processes the two halves separately. Same expectations for RV32 and RV64
; (shared CHECK prefix).
define <128 x i16> @vp_bswap_v128i16(<128 x i16> %va, <128 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v128i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi sp, sp, -16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    csrr a1, vlenb
; CHECK-NEXT:    slli a1, a1, 4
; CHECK-NEXT:    sub sp, sp, a1
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT:    csrr a1, vlenb
; CHECK-NEXT:    slli a1, a1, 3
; CHECK-NEXT:    add a1, sp, a1
; CHECK-NEXT:    addi a1, a1, 16
; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT:    li a2, 64
; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
; CHECK-NEXT:    vslidedown.vi v24, v0, 8
; CHECK-NEXT:    mv a1, a0
; CHECK-NEXT:    bltu a0, a2, .LBB26_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    li a1, 64
; CHECK-NEXT:  .LBB26_2:
; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
; CHECK-NEXT:    addi a1, sp, 16
; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT:    addi a1, a0, -64
; CHECK-NEXT:    sltu a0, a0, a1
; CHECK-NEXT:    addi a0, a0, -1
; CHECK-NEXT:    and a0, a0, a1
; CHECK-NEXT:    vmv1r.v v0, v24
; CHECK-NEXT:    csrr a1, vlenb
; CHECK-NEXT:    slli a1, a1, 3
; CHECK-NEXT:    add a1, sp, a1
; CHECK-NEXT:    addi a1, a1, 16
; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
; CHECK-NEXT:    addi a0, sp, 16
; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT:    csrr a0, vlenb
; CHECK-NEXT:    slli a0, a0, 4
; CHECK-NEXT:    add sp, sp, a0
; CHECK-NEXT:    .cfi_def_cfa sp, 16
; CHECK-NEXT:    addi sp, sp, 16
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    ret
  %v = call <128 x i16> @llvm.vp.bswap.v128i16(<128 x i16> %va, <128 x i1> %m, i32 %evl)
  ret <128 x i16> %v
}

; NOTE(review): unmasked split case — no mask spill/reload needed, so no
; stack frame is expected here, unlike the masked variant above.
define <128 x i16> @vp_bswap_v128i16_unmasked(<128 x i16> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v128i16_unmasked:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, 64
; CHECK-NEXT:    mv a1, a0
; CHECK-NEXT:    bltu a0, a2, .LBB27_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    li a1, 64
; CHECK-NEXT:  .LBB27_2:
; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT:    vsrl.vi v24, v8, 8
; CHECK-NEXT:    vsll.vi v8, v8, 8
; CHECK-NEXT:    vor.vv v8, v8, v24
; CHECK-NEXT:    addi a1, a0, -64
; CHECK-NEXT:    sltu a0, a0, a1
; CHECK-NEXT:    addi a0, a0, -1
; CHECK-NEXT:    and a0, a0, a1
; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT:    vsrl.vi v24, v16, 8
; CHECK-NEXT:    vsll.vi v16, v16, 8
; CHECK-NEXT:    vor.vv v16, v16, v24
; CHECK-NEXT:    ret
  %v = call <128 x i16> @llvm.vp.bswap.v128i16(<128 x i16> %va, <128 x i1> splat (i1 true), i32 %evl)
  ret <128 x i16> %v
}