; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; NOTE(review): the min=2048 run deliberately reuses the VBITS_GE_512 prefixes,
; so no separate 2048-bit check lines exist for the un-annotated functions.

target triple = "aarch64-unknown-linux-gnu"

;
; RBIT
;

; 64-bit vector: the input arrives in d0, which aliases the low half of z0
; (see the "kill" comments), so a vl8-predicated SVE RBIT operates in place.
define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: rbit z0.b, p0/m, z0.b
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op)
  ret <8 x i8> %res
}

; 128-bit vector: same pattern as above but through q0 and a vl16 predicate.
define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: rbit z0.b, p0/m, z0.b
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op)
  ret <16 x i8> %res
}

; Wider-than-NEON vectors are passed indirectly and go through SVE
; ld1b / rbit / st1b.
define void @bitreverse_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: rbit z0.b, p0/m, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <32 x i8>, ptr %a
  %res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op)
  store <32 x i8> %res, ptr %a
  ret void
}

; No vscale_range here: at VBITS_GE_256 the 512-bit operation is split into
; two vl32 halves (byte offset x8 = #32); at VBITS_GE_512 one vl64 op suffices.
define void @bitreverse_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: bitreverse_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: rbit z0.b, p0/m, z0.b
; VBITS_GE_256-NEXT: rbit z1.b, p0/m, z1.b
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: bitreverse_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: rbit z0.b, p0/m, z0.b
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op = load <64 x i8>, ptr %a
  %res = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %op)
  store <64 x i8> %res, ptr %a
  ret void
}

; vscale_range(8,0) guarantees >= 1024-bit registers, so 128 bytes fit in one
; vl128 operation under every RUN configuration.
define void @bitreverse_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: rbit z0.b, p0/m, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <128 x i8>, ptr %a
  %res = call <128 x i8> @llvm.bitreverse.v128i8(<128 x i8> %op)
  store <128 x i8> %res, ptr %a
  ret void
}

define void @bitreverse_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: rbit z0.b, p0/m, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <256 x i8>, ptr %a
  %res = call <256 x i8> @llvm.bitreverse.v256i8(<256 x i8> %op)
  store <256 x i8> %res, ptr %a
  ret void
}

; Same matrix of widths repeated for the i16 element type.
define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: rbit z0.h, p0/m, z0.h
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %res = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %op)
  ret <4 x i16> %res
}

define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: rbit z0.h, p0/m, z0.h
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %res = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %op)
  ret <8 x i16> %res
}

define void @bitreverse_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: rbit z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <16 x i16>, ptr %a
  %res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op)
  store <16 x i16> %res, ptr %a
  ret void
}

; Split case for i16: at VBITS_GE_256 two vl16 halves are used, with the
; second half addressed at element offset x8 = #16 (scaled by lsl #1).
define void @bitreverse_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: bitreverse_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: rbit z0.h, p0/m, z0.h
; VBITS_GE_256-NEXT: rbit z1.h, p0/m, z1.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: bitreverse_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: rbit z0.h, p0/m, z0.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op = load <32 x i16>, ptr %a
  %res = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %op)
  store <32 x i16> %res, ptr %a
  ret void
}

define void @bitreverse_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: rbit z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <64 x i16>, ptr %a
  %res = call <64 x i16> @llvm.bitreverse.v64i16(<64 x i16> %op)
  store <64 x i16> %res, ptr %a
  ret void
}

define void @bitreverse_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: rbit z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <128 x i16>, ptr %a
  %res = call <128 x i16> @llvm.bitreverse.v128i16(<128 x i16> %op)
  store <128 x i16> %res, ptr %a
  ret void
}

; Same matrix of widths repeated for the i32 element type.
define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: rbit z0.s, p0/m, z0.s
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %res = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %op)
  ret <2 x i32> %res
}

define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: rbit z0.s, p0/m, z0.s
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %res = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %op)
  ret <4 x i32> %res
}

define void @bitreverse_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: rbit z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <8 x i32>, ptr %a
  %res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op)
  store <8 x i32> %res, ptr %a
  ret void
}

; Split case for i32: element offset x8 = #8, scaled by lsl #2.
define void @bitreverse_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: bitreverse_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: rbit z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT: rbit z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: bitreverse_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: rbit z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op = load <16 x i32>, ptr %a
  %res = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %op)
  store <16 x i32> %res, ptr %a
  ret void
}

define void @bitreverse_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: rbit z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <32 x i32>, ptr %a
  %res = call <32 x i32> @llvm.bitreverse.v32i32(<32 x i32> %op)
  store <32 x i32> %res, ptr %a
  ret void
}

define void @bitreverse_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: rbit z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <64 x i32>, ptr %a
  %res = call <64 x i32> @llvm.bitreverse.v64i32(<64 x i32> %op)
  store <64 x i32> %res, ptr %a
  ret void
}

; Same matrix of widths repeated for the i64 element type.
define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: rbit z0.d, p0/m, z0.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %res = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %op)
  ret <1 x i64> %res
}

define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: rbit z0.d, p0/m, z0.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %res = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %op)
  ret <2 x i64> %res
}

define void @bitreverse_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bitreverse_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: rbit z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <4 x i64>, ptr %a
  %res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op)
  store <4 x i64> %res, ptr %a
  ret void
}

; Split case for i64: element offset x8 = #4, scaled by lsl #3.
define void @bitreverse_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: bitreverse_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: rbit z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT: rbit z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: bitreverse_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: rbit z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op = load <8 x i64>, ptr %a
  %res = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %op)
  store <8 x i64> %res, ptr %a
  ret void
}

define void @bitreverse_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bitreverse_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: rbit z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <16 x i64>, ptr %a
  %res = call <16 x i64> @llvm.bitreverse.v16i64(<16 x i64> %op)
  store <16 x i64> %res, ptr %a
  ret void
}

define void @bitreverse_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bitreverse_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: rbit z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <32 x i64>, ptr %a
  %res = call <32 x i64> @llvm.bitreverse.v32i64(<32 x i64> %op)
  store <32 x i64> %res, ptr %a
  ret void
}

;
; REVB
;

; Don't use SVE for 64-bit vectors.
define <4 x i16> @bswap_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: rev16 v0.8b, v0.8b
; CHECK-NEXT: ret
  %res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op)
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
; NEON-sized bswap stays on NEON: plain rev16 on the byte lanes, no SVE.
define <8 x i16> @bswap_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: rev16 v0.16b, v0.16b
; CHECK-NEXT: ret
  %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op)
  ret <8 x i16> %res
}

; Wider-than-NEON bswap uses the predicated SVE REVB with the element size
; selecting the byte-swap width (here .h = 16-bit lanes).
define void @bswap_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: revb z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <16 x i16>, ptr %a
  %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op)
  store <16 x i16> %res, ptr %a
  ret void
}

; Split case (no vscale_range): two vl16 halves at VBITS_GE_256, one vl32 op
; at VBITS_GE_512.
define void @bswap_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: bswap_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: revb z0.h, p0/m, z0.h
; VBITS_GE_256-NEXT: revb z1.h, p0/m, z1.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: bswap_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: revb z0.h, p0/m, z0.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op = load <32 x i16>, ptr %a
  %res = call <32 x i16> @llvm.bswap.v32i16(<32 x i16> %op)
  store <32 x i16> %res, ptr %a
  ret void
}

define void @bswap_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bswap_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: revb z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <64 x i16>, ptr %a
  %res = call <64 x i16> @llvm.bswap.v64i16(<64 x i16> %op)
  store <64 x i16> %res, ptr %a
  ret void
}

define void @bswap_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bswap_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: revb z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <128 x i16>, ptr %a
  %res = call <128 x i16> @llvm.bswap.v128i16(<128 x i16> %op)
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @bswap_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: rev32 v0.8b, v0.8b
; CHECK-NEXT: ret
  %res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op)
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @bswap_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: rev32 v0.16b, v0.16b
; CHECK-NEXT: ret
  %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op)
  ret <4 x i32> %res
}

define void @bswap_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: revb z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <8 x i32>, ptr %a
  %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op)
  store <8 x i32> %res, ptr %a
  ret void
}

define void @bswap_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: bswap_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: revb z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT: revb z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: bswap_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: revb z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op = load <16 x i32>, ptr %a
  %res = call <16 x i32> @llvm.bswap.v16i32(<16 x i32> %op)
  store <16 x i32> %res, ptr %a
  ret void
}

define void @bswap_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bswap_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: revb z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <32 x i32>, ptr %a
  %res = call <32 x i32> @llvm.bswap.v32i32(<32 x i32> %op)
  store <32 x i32> %res, ptr %a
  ret void
}

define void @bswap_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bswap_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: revb z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <64 x i32>, ptr %a
  %res = call <64 x i32> @llvm.bswap.v64i32(<64 x i32> %op)
  store <64 x i32> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @bswap_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: rev64 v0.8b, v0.8b
; CHECK-NEXT: ret
  %res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op)
  ret <1 x i64> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x i64> @bswap_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: rev64 v0.16b, v0.16b
; CHECK-NEXT: ret
  %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op)
  ret <2 x i64> %res
}

define void @bswap_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: bswap_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: revb z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <4 x i64>, ptr %a
  %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op)
  store <4 x i64> %res, ptr %a
  ret void
}

; Split case (no vscale_range): two vl4 halves at VBITS_GE_256 (element
; offset x8 = #4, scaled by lsl #3), one vl8 op at VBITS_GE_512.
define void @bswap_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: bswap_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: revb z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT: revb z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: bswap_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: revb z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op = load <8 x i64>, ptr %a
  %res = call <8 x i64> @llvm.bswap.v8i64(<8 x i64> %op)
  store <8 x i64> %res, ptr %a
  ret void
}

define void @bswap_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: bswap_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: revb z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <16 x i64>, ptr %a
  %res = call <16 x i64> @llvm.bswap.v16i64(<16 x i64> %op)
  store <16 x i64> %res, ptr %a
  ret void
}

define void @bswap_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: bswap_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: revb z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op = load <32 x i64>, ptr %a
  %res = call <32 x i64> @llvm.bswap.v32i64(<32 x i64> %op)
  store <32 x i64> %res, ptr %a
  ret void
}

; All test functions require SVE (#0); vscale_range attributes on individual
; functions further pin the register width each RUN line can assume.
attributes #0 = { "target-features"="+sve" }

declare <8 x i8> @llvm.bitreverse.v8i8(<8 x i8>)
declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>)
declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>)
declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>)
declare <128 x i8> @llvm.bitreverse.v128i8(<128 x i8>)
declare <256 x i8> @llvm.bitreverse.v256i8(<256 x i8>)
declare <4 x i16> @llvm.bitreverse.v4i16(<4 x i16>)
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>)
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>)
declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>)
declare <64 x i16> @llvm.bitreverse.v64i16(<64 x i16>)
declare <128 x i16> @llvm.bitreverse.v128i16(<128 x i16>)
declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>)
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>)
declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>)
declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>)
declare <32 x i32> @llvm.bitreverse.v32i32(<32 x i32>)
declare <64 x i32> @llvm.bitreverse.v64i32(<64 x i32>)
declare <1 x i64> @llvm.bitreverse.v1i64(<1 x i64>)
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>)
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>)
declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>)
declare <16 x i64> @llvm.bitreverse.v16i64(<16 x i64>)
declare <32 x i64> @llvm.bitreverse.v32i64(<32 x i64>)

declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>)
declare <32 x i16> @llvm.bswap.v32i16(<32 x i16>)
declare <64 x i16> @llvm.bswap.v64i16(<64 x i16>)
declare <128 x i16> @llvm.bswap.v128i16(<128 x i16>)
declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)
declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>)
declare <16 x i32> @llvm.bswap.v16i32(<16 x i32>)
declare <32 x i32> @llvm.bswap.v32i32(<32 x i32>)
declare <64 x i32> @llvm.bswap.v64i32(<64 x i32>)
declare <1 x i64> @llvm.bswap.v1i64(<1 x i64>)
declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
declare <8 x i64> @llvm.bswap.v8i64(<8 x i64>)
declare <16 x i64> @llvm.bswap.v16i64(<16 x i64>)
declare <32 x i64> @llvm.bswap.v32i64(<32 x i64>)