1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 3; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 4; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 5; The 512-bit and 2048-bit runs above share the VBITS_GE_512 check prefix. 6target triple = "aarch64-unknown-linux-gnu" 7 8; 9; CLZ (count leading zeros, llvm.ctlz) 10; 11 12; Don't use SVE for 64-bit vectors. 13define <8 x i8> @ctlz_v8i8(<8 x i8> %op) vscale_range(2,0) #0 { 14; CHECK-LABEL: ctlz_v8i8: 15; CHECK: // %bb.0: 16; CHECK-NEXT: clz v0.8b, v0.8b 17; CHECK-NEXT: ret 18 %res = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %op) 19 ret <8 x i8> %res 20} 21 22; Don't use SVE for 128-bit vectors. 23define <16 x i8> @ctlz_v16i8(<16 x i8> %op) vscale_range(2,0) #0 { 24; CHECK-LABEL: ctlz_v16i8: 25; CHECK: // %bb.0: 26; CHECK-NEXT: clz v0.16b, v0.16b 27; CHECK-NEXT: ret 28 %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %op) 29 ret <16 x i8> %res 30} 31 32define void @ctlz_v32i8(ptr %a) vscale_range(2,0) #0 { 33; CHECK-LABEL: ctlz_v32i8: 34; CHECK: // %bb.0: 35; CHECK-NEXT: ptrue p0.b, vl32 36; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] 37; CHECK-NEXT: clz z0.b, p0/m, z0.b 38; CHECK-NEXT: st1b { z0.b }, p0, [x0] 39; CHECK-NEXT: ret 40 %op = load <32 x i8>, ptr %a 41 %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op) 42 store <32 x i8> %res, ptr %a 43 ret void 44} 45 46define void @ctlz_v64i8(ptr %a) #0 { 47; VBITS_GE_256-LABEL: ctlz_v64i8: 48; VBITS_GE_256: // %bb.0: 49; VBITS_GE_256-NEXT: ptrue p0.b, vl32 50; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 51; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] 52; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] 53; VBITS_GE_256-NEXT: clz z0.b, p0/m, z0.b 54; VBITS_GE_256-NEXT: clz z1.b, p0/m, z1.b 55; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] 56; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] 57; VBITS_GE_256-NEXT: ret 58; 59; VBITS_GE_512-LABEL: ctlz_v64i8: 60; 
VBITS_GE_512: // %bb.0: 61; VBITS_GE_512-NEXT: ptrue p0.b, vl64 62; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] 63; VBITS_GE_512-NEXT: clz z0.b, p0/m, z0.b 64; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] 65; VBITS_GE_512-NEXT: ret 66 %op = load <64 x i8>, ptr %a 67 %res = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %op) 68 store <64 x i8> %res, ptr %a 69 ret void 70} 71 72define void @ctlz_v128i8(ptr %a) vscale_range(8,0) #0 { 73; CHECK-LABEL: ctlz_v128i8: 74; CHECK: // %bb.0: 75; CHECK-NEXT: ptrue p0.b, vl128 76; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] 77; CHECK-NEXT: clz z0.b, p0/m, z0.b 78; CHECK-NEXT: st1b { z0.b }, p0, [x0] 79; CHECK-NEXT: ret 80 %op = load <128 x i8>, ptr %a 81 %res = call <128 x i8> @llvm.ctlz.v128i8(<128 x i8> %op) 82 store <128 x i8> %res, ptr %a 83 ret void 84} 85 86define void @ctlz_v256i8(ptr %a) vscale_range(16,0) #0 { 87; CHECK-LABEL: ctlz_v256i8: 88; CHECK: // %bb.0: 89; CHECK-NEXT: ptrue p0.b, vl256 90; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] 91; CHECK-NEXT: clz z0.b, p0/m, z0.b 92; CHECK-NEXT: st1b { z0.b }, p0, [x0] 93; CHECK-NEXT: ret 94 %op = load <256 x i8>, ptr %a 95 %res = call <256 x i8> @llvm.ctlz.v256i8(<256 x i8> %op) 96 store <256 x i8> %res, ptr %a 97 ret void 98} 99 100; Don't use SVE for 64-bit vectors. 101define <4 x i16> @ctlz_v4i16(<4 x i16> %op) vscale_range(2,0) #0 { 102; CHECK-LABEL: ctlz_v4i16: 103; CHECK: // %bb.0: 104; CHECK-NEXT: clz v0.4h, v0.4h 105; CHECK-NEXT: ret 106 %res = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %op) 107 ret <4 x i16> %res 108} 109 110; Don't use SVE for 128-bit vectors. 
111define <8 x i16> @ctlz_v8i16(<8 x i16> %op) vscale_range(2,0) #0 { 112; CHECK-LABEL: ctlz_v8i16: 113; CHECK: // %bb.0: 114; CHECK-NEXT: clz v0.8h, v0.8h 115; CHECK-NEXT: ret 116 %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %op) 117 ret <8 x i16> %res 118} 119 120define void @ctlz_v16i16(ptr %a) vscale_range(2,0) #0 { 121; CHECK-LABEL: ctlz_v16i16: 122; CHECK: // %bb.0: 123; CHECK-NEXT: ptrue p0.h, vl16 124; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 125; CHECK-NEXT: clz z0.h, p0/m, z0.h 126; CHECK-NEXT: st1h { z0.h }, p0, [x0] 127; CHECK-NEXT: ret 128 %op = load <16 x i16>, ptr %a 129 %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %op) 130 store <16 x i16> %res, ptr %a 131 ret void 132} 133 134define void @ctlz_v32i16(ptr %a) #0 { 135; VBITS_GE_256-LABEL: ctlz_v32i16: 136; VBITS_GE_256: // %bb.0: 137; VBITS_GE_256-NEXT: ptrue p0.h, vl16 138; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 139; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] 140; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] 141; VBITS_GE_256-NEXT: clz z0.h, p0/m, z0.h 142; VBITS_GE_256-NEXT: clz z1.h, p0/m, z1.h 143; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] 144; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] 145; VBITS_GE_256-NEXT: ret 146; 147; VBITS_GE_512-LABEL: ctlz_v32i16: 148; VBITS_GE_512: // %bb.0: 149; VBITS_GE_512-NEXT: ptrue p0.h, vl32 150; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] 151; VBITS_GE_512-NEXT: clz z0.h, p0/m, z0.h 152; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] 153; VBITS_GE_512-NEXT: ret 154 %op = load <32 x i16>, ptr %a 155 %res = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %op) 156 store <32 x i16> %res, ptr %a 157 ret void 158} 159 160define void @ctlz_v64i16(ptr %a) vscale_range(8,0) #0 { 161; CHECK-LABEL: ctlz_v64i16: 162; CHECK: // %bb.0: 163; CHECK-NEXT: ptrue p0.h, vl64 164; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 165; CHECK-NEXT: clz z0.h, p0/m, z0.h 166; CHECK-NEXT: st1h { z0.h }, p0, [x0] 167; CHECK-NEXT: ret 168 %op = load <64 x i16>, ptr %a 
169 %res = call <64 x i16> @llvm.ctlz.v64i16(<64 x i16> %op) 170 store <64 x i16> %res, ptr %a 171 ret void 172} 173 174define void @ctlz_v128i16(ptr %a) vscale_range(16,0) #0 { 175; CHECK-LABEL: ctlz_v128i16: 176; CHECK: // %bb.0: 177; CHECK-NEXT: ptrue p0.h, vl128 178; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 179; CHECK-NEXT: clz z0.h, p0/m, z0.h 180; CHECK-NEXT: st1h { z0.h }, p0, [x0] 181; CHECK-NEXT: ret 182 %op = load <128 x i16>, ptr %a 183 %res = call <128 x i16> @llvm.ctlz.v128i16(<128 x i16> %op) 184 store <128 x i16> %res, ptr %a 185 ret void 186} 187 188; Don't use SVE for 64-bit vectors. 189define <2 x i32> @ctlz_v2i32(<2 x i32> %op) vscale_range(2,0) #0 { 190; CHECK-LABEL: ctlz_v2i32: 191; CHECK: // %bb.0: 192; CHECK-NEXT: clz v0.2s, v0.2s 193; CHECK-NEXT: ret 194 %res = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %op) 195 ret <2 x i32> %res 196} 197 198; Don't use SVE for 128-bit vectors. 199define <4 x i32> @ctlz_v4i32(<4 x i32> %op) vscale_range(2,0) #0 { 200; CHECK-LABEL: ctlz_v4i32: 201; CHECK: // %bb.0: 202; CHECK-NEXT: clz v0.4s, v0.4s 203; CHECK-NEXT: ret 204 %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %op) 205 ret <4 x i32> %res 206} 207 208define void @ctlz_v8i32(ptr %a) vscale_range(2,0) #0 { 209; CHECK-LABEL: ctlz_v8i32: 210; CHECK: // %bb.0: 211; CHECK-NEXT: ptrue p0.s, vl8 212; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 213; CHECK-NEXT: clz z0.s, p0/m, z0.s 214; CHECK-NEXT: st1w { z0.s }, p0, [x0] 215; CHECK-NEXT: ret 216 %op = load <8 x i32>, ptr %a 217 %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %op) 218 store <8 x i32> %res, ptr %a 219 ret void 220} 221 222define void @ctlz_v16i32(ptr %a) #0 { 223; VBITS_GE_256-LABEL: ctlz_v16i32: 224; VBITS_GE_256: // %bb.0: 225; VBITS_GE_256-NEXT: ptrue p0.s, vl8 226; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 227; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] 228; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] 229; VBITS_GE_256-NEXT: clz z0.s, p0/m, z0.s 230; VBITS_GE_256-NEXT: clz z1.s, p0/m, 
z1.s 231; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] 232; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] 233; VBITS_GE_256-NEXT: ret 234; 235; VBITS_GE_512-LABEL: ctlz_v16i32: 236; VBITS_GE_512: // %bb.0: 237; VBITS_GE_512-NEXT: ptrue p0.s, vl16 238; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] 239; VBITS_GE_512-NEXT: clz z0.s, p0/m, z0.s 240; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] 241; VBITS_GE_512-NEXT: ret 242 %op = load <16 x i32>, ptr %a 243 %res = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %op) 244 store <16 x i32> %res, ptr %a 245 ret void 246} 247 248define void @ctlz_v32i32(ptr %a) vscale_range(8,0) #0 { 249; CHECK-LABEL: ctlz_v32i32: 250; CHECK: // %bb.0: 251; CHECK-NEXT: ptrue p0.s, vl32 252; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 253; CHECK-NEXT: clz z0.s, p0/m, z0.s 254; CHECK-NEXT: st1w { z0.s }, p0, [x0] 255; CHECK-NEXT: ret 256 %op = load <32 x i32>, ptr %a 257 %res = call <32 x i32> @llvm.ctlz.v32i32(<32 x i32> %op) 258 store <32 x i32> %res, ptr %a 259 ret void 260} 261 262define void @ctlz_v64i32(ptr %a) vscale_range(16,0) #0 { 263; CHECK-LABEL: ctlz_v64i32: 264; CHECK: // %bb.0: 265; CHECK-NEXT: ptrue p0.s, vl64 266; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 267; CHECK-NEXT: clz z0.s, p0/m, z0.s 268; CHECK-NEXT: st1w { z0.s }, p0, [x0] 269; CHECK-NEXT: ret 270 %op = load <64 x i32>, ptr %a 271 %res = call <64 x i32> @llvm.ctlz.v64i32(<64 x i32> %op) 272 store <64 x i32> %res, ptr %a 273 ret void 274} 275; SVE CLZ is expected for 64-bit elements, even in 64-bit and 128-bit vectors. 276define <1 x i64> @ctlz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 { 277; CHECK-LABEL: ctlz_v1i64: 278; CHECK: // %bb.0: 279; CHECK-NEXT: ptrue p0.d, vl1 280; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 281; CHECK-NEXT: clz z0.d, p0/m, z0.d 282; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 283; CHECK-NEXT: ret 284 %res = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %op) 285 ret <1 x i64> %res 286} 287 288define <2 x i64> @ctlz_v2i64(<2 x i64> %op) vscale_range(2,0) #0 { 289; CHECK-LABEL: ctlz_v2i64: 290; CHECK: // %bb.0: 291; 
CHECK-NEXT: ptrue p0.d, vl2 292; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 293; CHECK-NEXT: clz z0.d, p0/m, z0.d 294; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 295; CHECK-NEXT: ret 296 %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %op) 297 ret <2 x i64> %res 298} 299 300define void @ctlz_v4i64(ptr %a) vscale_range(2,0) #0 { 301; CHECK-LABEL: ctlz_v4i64: 302; CHECK: // %bb.0: 303; CHECK-NEXT: ptrue p0.d, vl4 304; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 305; CHECK-NEXT: clz z0.d, p0/m, z0.d 306; CHECK-NEXT: st1d { z0.d }, p0, [x0] 307; CHECK-NEXT: ret 308 %op = load <4 x i64>, ptr %a 309 %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %op) 310 store <4 x i64> %res, ptr %a 311 ret void 312} 313 314define void @ctlz_v8i64(ptr %a) #0 { 315; VBITS_GE_256-LABEL: ctlz_v8i64: 316; VBITS_GE_256: // %bb.0: 317; VBITS_GE_256-NEXT: ptrue p0.d, vl4 318; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 319; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] 320; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] 321; VBITS_GE_256-NEXT: clz z0.d, p0/m, z0.d 322; VBITS_GE_256-NEXT: clz z1.d, p0/m, z1.d 323; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] 324; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] 325; VBITS_GE_256-NEXT: ret 326; 327; VBITS_GE_512-LABEL: ctlz_v8i64: 328; VBITS_GE_512: // %bb.0: 329; VBITS_GE_512-NEXT: ptrue p0.d, vl8 330; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] 331; VBITS_GE_512-NEXT: clz z0.d, p0/m, z0.d 332; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] 333; VBITS_GE_512-NEXT: ret 334 %op = load <8 x i64>, ptr %a 335 %res = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %op) 336 store <8 x i64> %res, ptr %a 337 ret void 338} 339 340define void @ctlz_v16i64(ptr %a) vscale_range(8,0) #0 { 341; CHECK-LABEL: ctlz_v16i64: 342; CHECK: // %bb.0: 343; CHECK-NEXT: ptrue p0.d, vl16 344; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 345; CHECK-NEXT: clz z0.d, p0/m, z0.d 346; CHECK-NEXT: st1d { z0.d }, p0, [x0] 347; CHECK-NEXT: ret 348 %op = load <16 x i64>, ptr %a 349 
%res = call <16 x i64> @llvm.ctlz.v16i64(<16 x i64> %op) 350 store <16 x i64> %res, ptr %a 351 ret void 352} 353 354define void @ctlz_v32i64(ptr %a) vscale_range(16,0) #0 { 355; CHECK-LABEL: ctlz_v32i64: 356; CHECK: // %bb.0: 357; CHECK-NEXT: ptrue p0.d, vl32 358; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 359; CHECK-NEXT: clz z0.d, p0/m, z0.d 360; CHECK-NEXT: st1d { z0.d }, p0, [x0] 361; CHECK-NEXT: ret 362 %op = load <32 x i64>, ptr %a 363 %res = call <32 x i64> @llvm.ctlz.v32i64(<32 x i64> %op) 364 store <32 x i64> %res, ptr %a 365 ret void 366} 367 368; 369; CNT (population count, llvm.ctpop) 370; 371 372; Don't use SVE for 64-bit vectors. 373define <8 x i8> @ctpop_v8i8(<8 x i8> %op) vscale_range(2,0) #0 { 374; CHECK-LABEL: ctpop_v8i8: 375; CHECK: // %bb.0: 376; CHECK-NEXT: cnt v0.8b, v0.8b 377; CHECK-NEXT: ret 378 %res = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %op) 379 ret <8 x i8> %res 380} 381 382; Don't use SVE for 128-bit vectors. 383define <16 x i8> @ctpop_v16i8(<16 x i8> %op) vscale_range(2,0) #0 { 384; CHECK-LABEL: ctpop_v16i8: 385; CHECK: // %bb.0: 386; CHECK-NEXT: cnt v0.16b, v0.16b 387; CHECK-NEXT: ret 388 %res = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %op) 389 ret <16 x i8> %res 390} 391 392define void @ctpop_v32i8(ptr %a) vscale_range(2,0) #0 { 393; CHECK-LABEL: ctpop_v32i8: 394; CHECK: // %bb.0: 395; CHECK-NEXT: ptrue p0.b, vl32 396; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] 397; CHECK-NEXT: cnt z0.b, p0/m, z0.b 398; CHECK-NEXT: st1b { z0.b }, p0, [x0] 399; CHECK-NEXT: ret 400 %op = load <32 x i8>, ptr %a 401 %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %op) 402 store <32 x i8> %res, ptr %a 403 ret void 404} 405 406define void @ctpop_v64i8(ptr %a) #0 { 407; VBITS_GE_256-LABEL: ctpop_v64i8: 408; VBITS_GE_256: // %bb.0: 409; VBITS_GE_256-NEXT: ptrue p0.b, vl32 410; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 411; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] 412; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] 413; VBITS_GE_256-NEXT: cnt z0.b, p0/m, z0.b 414; VBITS_GE_256-NEXT: cnt 
z1.b, p0/m, z1.b 415; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] 416; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] 417; VBITS_GE_256-NEXT: ret 418; 419; VBITS_GE_512-LABEL: ctpop_v64i8: 420; VBITS_GE_512: // %bb.0: 421; VBITS_GE_512-NEXT: ptrue p0.b, vl64 422; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] 423; VBITS_GE_512-NEXT: cnt z0.b, p0/m, z0.b 424; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] 425; VBITS_GE_512-NEXT: ret 426 %op = load <64 x i8>, ptr %a 427 %res = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %op) 428 store <64 x i8> %res, ptr %a 429 ret void 430} 431 432define void @ctpop_v128i8(ptr %a) vscale_range(8,0) #0 { 433; CHECK-LABEL: ctpop_v128i8: 434; CHECK: // %bb.0: 435; CHECK-NEXT: ptrue p0.b, vl128 436; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] 437; CHECK-NEXT: cnt z0.b, p0/m, z0.b 438; CHECK-NEXT: st1b { z0.b }, p0, [x0] 439; CHECK-NEXT: ret 440 %op = load <128 x i8>, ptr %a 441 %res = call <128 x i8> @llvm.ctpop.v128i8(<128 x i8> %op) 442 store <128 x i8> %res, ptr %a 443 ret void 444} 445 446define void @ctpop_v256i8(ptr %a) vscale_range(16,0) #0 { 447; CHECK-LABEL: ctpop_v256i8: 448; CHECK: // %bb.0: 449; CHECK-NEXT: ptrue p0.b, vl256 450; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] 451; CHECK-NEXT: cnt z0.b, p0/m, z0.b 452; CHECK-NEXT: st1b { z0.b }, p0, [x0] 453; CHECK-NEXT: ret 454 %op = load <256 x i8>, ptr %a 455 %res = call <256 x i8> @llvm.ctpop.v256i8(<256 x i8> %op) 456 store <256 x i8> %res, ptr %a 457 ret void 458} 459 460; Don't use SVE for 64-bit vectors. 461define <4 x i16> @ctpop_v4i16(<4 x i16> %op) vscale_range(2,0) #0 { 462; CHECK-LABEL: ctpop_v4i16: 463; CHECK: // %bb.0: 464; CHECK-NEXT: cnt v0.8b, v0.8b 465; CHECK-NEXT: uaddlp v0.4h, v0.8b 466; CHECK-NEXT: ret 467 %res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op) 468 ret <4 x i16> %res 469} 470 471; Don't use SVE for 128-bit vectors. 
472define <8 x i16> @ctpop_v8i16(<8 x i16> %op) vscale_range(2,0) #0 { 473; CHECK-LABEL: ctpop_v8i16: 474; CHECK: // %bb.0: 475; CHECK-NEXT: cnt v0.16b, v0.16b 476; CHECK-NEXT: uaddlp v0.8h, v0.16b 477; CHECK-NEXT: ret 478 %res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op) 479 ret <8 x i16> %res 480} 481 482define void @ctpop_v16i16(ptr %a) vscale_range(2,0) #0 { 483; CHECK-LABEL: ctpop_v16i16: 484; CHECK: // %bb.0: 485; CHECK-NEXT: ptrue p0.h, vl16 486; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 487; CHECK-NEXT: cnt z0.h, p0/m, z0.h 488; CHECK-NEXT: st1h { z0.h }, p0, [x0] 489; CHECK-NEXT: ret 490 %op = load <16 x i16>, ptr %a 491 %res = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %op) 492 store <16 x i16> %res, ptr %a 493 ret void 494} 495 496define void @ctpop_v32i16(ptr %a) #0 { 497; VBITS_GE_256-LABEL: ctpop_v32i16: 498; VBITS_GE_256: // %bb.0: 499; VBITS_GE_256-NEXT: ptrue p0.h, vl16 500; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 501; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] 502; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] 503; VBITS_GE_256-NEXT: cnt z0.h, p0/m, z0.h 504; VBITS_GE_256-NEXT: cnt z1.h, p0/m, z1.h 505; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] 506; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] 507; VBITS_GE_256-NEXT: ret 508; 509; VBITS_GE_512-LABEL: ctpop_v32i16: 510; VBITS_GE_512: // %bb.0: 511; VBITS_GE_512-NEXT: ptrue p0.h, vl32 512; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] 513; VBITS_GE_512-NEXT: cnt z0.h, p0/m, z0.h 514; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] 515; VBITS_GE_512-NEXT: ret 516 %op = load <32 x i16>, ptr %a 517 %res = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %op) 518 store <32 x i16> %res, ptr %a 519 ret void 520} 521 522define void @ctpop_v64i16(ptr %a) vscale_range(8,0) #0 { 523; CHECK-LABEL: ctpop_v64i16: 524; CHECK: // %bb.0: 525; CHECK-NEXT: ptrue p0.h, vl64 526; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 527; CHECK-NEXT: cnt z0.h, p0/m, z0.h 528; CHECK-NEXT: st1h { z0.h }, p0, [x0] 529; 
CHECK-NEXT: ret 530 %op = load <64 x i16>, ptr %a 531 %res = call <64 x i16> @llvm.ctpop.v64i16(<64 x i16> %op) 532 store <64 x i16> %res, ptr %a 533 ret void 534} 535 536define void @ctpop_v128i16(ptr %a) vscale_range(16,0) #0 { 537; CHECK-LABEL: ctpop_v128i16: 538; CHECK: // %bb.0: 539; CHECK-NEXT: ptrue p0.h, vl128 540; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 541; CHECK-NEXT: cnt z0.h, p0/m, z0.h 542; CHECK-NEXT: st1h { z0.h }, p0, [x0] 543; CHECK-NEXT: ret 544 %op = load <128 x i16>, ptr %a 545 %res = call <128 x i16> @llvm.ctpop.v128i16(<128 x i16> %op) 546 store <128 x i16> %res, ptr %a 547 ret void 548} 549 550; Don't use SVE for 64-bit vectors; NEON CNT followed by UADDLP steps is expected. 551define <2 x i32> @ctpop_v2i32(<2 x i32> %op) vscale_range(2,0) #0 { 552; CHECK-LABEL: ctpop_v2i32: 553; CHECK: // %bb.0: 554; CHECK-NEXT: cnt v0.8b, v0.8b 555; CHECK-NEXT: uaddlp v0.4h, v0.8b 556; CHECK-NEXT: uaddlp v0.2s, v0.4h 557; CHECK-NEXT: ret 558 %res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op) 559 ret <2 x i32> %res 560} 561 562; Don't use SVE for 128-bit vectors. 
563define <4 x i32> @ctpop_v4i32(<4 x i32> %op) vscale_range(2,0) #0 { 564; CHECK-LABEL: ctpop_v4i32: 565; CHECK: // %bb.0: 566; CHECK-NEXT: cnt v0.16b, v0.16b 567; CHECK-NEXT: uaddlp v0.8h, v0.16b 568; CHECK-NEXT: uaddlp v0.4s, v0.8h 569; CHECK-NEXT: ret 570 %res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op) 571 ret <4 x i32> %res 572} 573 574define void @ctpop_v8i32(ptr %a) vscale_range(2,0) #0 { 575; CHECK-LABEL: ctpop_v8i32: 576; CHECK: // %bb.0: 577; CHECK-NEXT: ptrue p0.s, vl8 578; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 579; CHECK-NEXT: cnt z0.s, p0/m, z0.s 580; CHECK-NEXT: st1w { z0.s }, p0, [x0] 581; CHECK-NEXT: ret 582 %op = load <8 x i32>, ptr %a 583 %res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %op) 584 store <8 x i32> %res, ptr %a 585 ret void 586} 587 588define void @ctpop_v16i32(ptr %a) #0 { 589; VBITS_GE_256-LABEL: ctpop_v16i32: 590; VBITS_GE_256: // %bb.0: 591; VBITS_GE_256-NEXT: ptrue p0.s, vl8 592; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 593; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] 594; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] 595; VBITS_GE_256-NEXT: cnt z0.s, p0/m, z0.s 596; VBITS_GE_256-NEXT: cnt z1.s, p0/m, z1.s 597; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] 598; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] 599; VBITS_GE_256-NEXT: ret 600; 601; VBITS_GE_512-LABEL: ctpop_v16i32: 602; VBITS_GE_512: // %bb.0: 603; VBITS_GE_512-NEXT: ptrue p0.s, vl16 604; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] 605; VBITS_GE_512-NEXT: cnt z0.s, p0/m, z0.s 606; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] 607; VBITS_GE_512-NEXT: ret 608 %op = load <16 x i32>, ptr %a 609 %res = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %op) 610 store <16 x i32> %res, ptr %a 611 ret void 612} 613 614define void @ctpop_v32i32(ptr %a) vscale_range(8,0) #0 { 615; CHECK-LABEL: ctpop_v32i32: 616; CHECK: // %bb.0: 617; CHECK-NEXT: ptrue p0.s, vl32 618; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 619; CHECK-NEXT: cnt z0.s, p0/m, z0.s 620; CHECK-NEXT: 
st1w { z0.s }, p0, [x0] 621; CHECK-NEXT: ret 622 %op = load <32 x i32>, ptr %a 623 %res = call <32 x i32> @llvm.ctpop.v32i32(<32 x i32> %op) 624 store <32 x i32> %res, ptr %a 625 ret void 626} 627 628define void @ctpop_v64i32(ptr %a) vscale_range(16,0) #0 { 629; CHECK-LABEL: ctpop_v64i32: 630; CHECK: // %bb.0: 631; CHECK-NEXT: ptrue p0.s, vl64 632; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 633; CHECK-NEXT: cnt z0.s, p0/m, z0.s 634; CHECK-NEXT: st1w { z0.s }, p0, [x0] 635; CHECK-NEXT: ret 636 %op = load <64 x i32>, ptr %a 637 %res = call <64 x i32> @llvm.ctpop.v64i32(<64 x i32> %op) 638 store <64 x i32> %res, ptr %a 639 ret void 640} 641 642; Don't use SVE for 64-bit vectors; NEON CNT followed by UADDLP steps is expected. 643define <1 x i64> @ctpop_v1i64(<1 x i64> %op) vscale_range(2,0) #0 { 644; CHECK-LABEL: ctpop_v1i64: 645; CHECK: // %bb.0: 646; CHECK-NEXT: cnt v0.8b, v0.8b 647; CHECK-NEXT: uaddlp v0.4h, v0.8b 648; CHECK-NEXT: uaddlp v0.2s, v0.4h 649; CHECK-NEXT: uaddlp v0.1d, v0.2s 650; CHECK-NEXT: ret 651 %res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op) 652 ret <1 x i64> %res 653} 654 655; Don't use SVE for 128-bit vectors. 
656define <2 x i64> @ctpop_v2i64(<2 x i64> %op) vscale_range(2,0) #0 { 657; CHECK-LABEL: ctpop_v2i64: 658; CHECK: // %bb.0: 659; CHECK-NEXT: cnt v0.16b, v0.16b 660; CHECK-NEXT: uaddlp v0.8h, v0.16b 661; CHECK-NEXT: uaddlp v0.4s, v0.8h 662; CHECK-NEXT: uaddlp v0.2d, v0.4s 663; CHECK-NEXT: ret 664 %res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op) 665 ret <2 x i64> %res 666} 667 668define void @ctpop_v4i64(ptr %a) vscale_range(2,0) #0 { 669; CHECK-LABEL: ctpop_v4i64: 670; CHECK: // %bb.0: 671; CHECK-NEXT: ptrue p0.d, vl4 672; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 673; CHECK-NEXT: cnt z0.d, p0/m, z0.d 674; CHECK-NEXT: st1d { z0.d }, p0, [x0] 675; CHECK-NEXT: ret 676 %op = load <4 x i64>, ptr %a 677 %res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %op) 678 store <4 x i64> %res, ptr %a 679 ret void 680} 681 682define void @ctpop_v8i64(ptr %a) #0 { 683; VBITS_GE_256-LABEL: ctpop_v8i64: 684; VBITS_GE_256: // %bb.0: 685; VBITS_GE_256-NEXT: ptrue p0.d, vl4 686; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 687; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] 688; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] 689; VBITS_GE_256-NEXT: cnt z0.d, p0/m, z0.d 690; VBITS_GE_256-NEXT: cnt z1.d, p0/m, z1.d 691; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] 692; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] 693; VBITS_GE_256-NEXT: ret 694; 695; VBITS_GE_512-LABEL: ctpop_v8i64: 696; VBITS_GE_512: // %bb.0: 697; VBITS_GE_512-NEXT: ptrue p0.d, vl8 698; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] 699; VBITS_GE_512-NEXT: cnt z0.d, p0/m, z0.d 700; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] 701; VBITS_GE_512-NEXT: ret 702 %op = load <8 x i64>, ptr %a 703 %res = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %op) 704 store <8 x i64> %res, ptr %a 705 ret void 706} 707 708define void @ctpop_v16i64(ptr %a) vscale_range(8,0) #0 { 709; CHECK-LABEL: ctpop_v16i64: 710; CHECK: // %bb.0: 711; CHECK-NEXT: ptrue p0.d, vl16 712; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 713; CHECK-NEXT: cnt z0.d, 
p0/m, z0.d 714; CHECK-NEXT: st1d { z0.d }, p0, [x0] 715; CHECK-NEXT: ret 716 %op = load <16 x i64>, ptr %a 717 %res = call <16 x i64> @llvm.ctpop.v16i64(<16 x i64> %op) 718 store <16 x i64> %res, ptr %a 719 ret void 720} 721 722define void @ctpop_v32i64(ptr %a) vscale_range(16,0) #0 { 723; CHECK-LABEL: ctpop_v32i64: 724; CHECK: // %bb.0: 725; CHECK-NEXT: ptrue p0.d, vl32 726; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 727; CHECK-NEXT: cnt z0.d, p0/m, z0.d 728; CHECK-NEXT: st1d { z0.d }, p0, [x0] 729; CHECK-NEXT: ret 730 %op = load <32 x i64>, ptr %a 731 %res = call <32 x i64> @llvm.ctpop.v32i64(<32 x i64> %op) 732 store <32 x i64> %res, ptr %a 733 ret void 734} 735 736; 737; CTTZ (count trailing zeros, llvm.cttz), expected as RBIT followed by CLZ 738; 739; SVE RBIT followed by NEON CLZ is expected for 64-bit and 128-bit i8 vectors. 740define <8 x i8> @cttz_v8i8(<8 x i8> %op) vscale_range(2,0) #0 { 741; CHECK-LABEL: cttz_v8i8: 742; CHECK: // %bb.0: 743; CHECK-NEXT: ptrue p0.b, vl8 744; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 745; CHECK-NEXT: rbit z0.b, p0/m, z0.b 746; CHECK-NEXT: clz v0.8b, v0.8b 747; CHECK-NEXT: ret 748 %res = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %op) 749 ret <8 x i8> %res 750} 751 752define <16 x i8> @cttz_v16i8(<16 x i8> %op) vscale_range(2,0) #0 { 753; CHECK-LABEL: cttz_v16i8: 754; CHECK: // %bb.0: 755; CHECK-NEXT: ptrue p0.b, vl16 756; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 757; CHECK-NEXT: rbit z0.b, p0/m, z0.b 758; CHECK-NEXT: clz v0.16b, v0.16b 759; CHECK-NEXT: ret 760 %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %op) 761 ret <16 x i8> %res 762} 763 764define void @cttz_v32i8(ptr %a) vscale_range(2,0) #0 { 765; CHECK-LABEL: cttz_v32i8: 766; CHECK: // %bb.0: 767; CHECK-NEXT: ptrue p0.b, vl32 768; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] 769; CHECK-NEXT: rbit z0.b, p0/m, z0.b 770; CHECK-NEXT: clz z0.b, p0/m, z0.b 771; CHECK-NEXT: st1b { z0.b }, p0, [x0] 772; CHECK-NEXT: ret 773 %op = load <32 x i8>, ptr %a 774 %res = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %op) 775 store <32 x i8> %res, ptr %a 776 ret void 777} 778 779define void @cttz_v64i8(ptr %a) #0 { 
780; VBITS_GE_256-LABEL: cttz_v64i8: 781; VBITS_GE_256: // %bb.0: 782; VBITS_GE_256-NEXT: ptrue p0.b, vl32 783; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 784; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] 785; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] 786; VBITS_GE_256-NEXT: rbit z0.b, p0/m, z0.b 787; VBITS_GE_256-NEXT: rbit z1.b, p0/m, z1.b 788; VBITS_GE_256-NEXT: clz z0.b, p0/m, z0.b 789; VBITS_GE_256-NEXT: clz z1.b, p0/m, z1.b 790; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] 791; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] 792; VBITS_GE_256-NEXT: ret 793; 794; VBITS_GE_512-LABEL: cttz_v64i8: 795; VBITS_GE_512: // %bb.0: 796; VBITS_GE_512-NEXT: ptrue p0.b, vl64 797; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] 798; VBITS_GE_512-NEXT: rbit z0.b, p0/m, z0.b 799; VBITS_GE_512-NEXT: clz z0.b, p0/m, z0.b 800; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] 801; VBITS_GE_512-NEXT: ret 802 %op = load <64 x i8>, ptr %a 803 %res = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %op) 804 store <64 x i8> %res, ptr %a 805 ret void 806} 807 808define void @cttz_v128i8(ptr %a) vscale_range(8,0) #0 { 809; CHECK-LABEL: cttz_v128i8: 810; CHECK: // %bb.0: 811; CHECK-NEXT: ptrue p0.b, vl128 812; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] 813; CHECK-NEXT: rbit z0.b, p0/m, z0.b 814; CHECK-NEXT: clz z0.b, p0/m, z0.b 815; CHECK-NEXT: st1b { z0.b }, p0, [x0] 816; CHECK-NEXT: ret 817 %op = load <128 x i8>, ptr %a 818 %res = call <128 x i8> @llvm.cttz.v128i8(<128 x i8> %op) 819 store <128 x i8> %res, ptr %a 820 ret void 821} 822 823define void @cttz_v256i8(ptr %a) vscale_range(16,0) #0 { 824; CHECK-LABEL: cttz_v256i8: 825; CHECK: // %bb.0: 826; CHECK-NEXT: ptrue p0.b, vl256 827; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] 828; CHECK-NEXT: rbit z0.b, p0/m, z0.b 829; CHECK-NEXT: clz z0.b, p0/m, z0.b 830; CHECK-NEXT: st1b { z0.b }, p0, [x0] 831; CHECK-NEXT: ret 832 %op = load <256 x i8>, ptr %a 833 %res = call <256 x i8> @llvm.cttz.v256i8(<256 x i8> %op) 834 store <256 x i8> %res, ptr %a 835 ret void 836} 
837; SVE RBIT followed by NEON CLZ is expected for 64-bit and 128-bit i16 vectors. 838define <4 x i16> @cttz_v4i16(<4 x i16> %op) vscale_range(2,0) #0 { 839; CHECK-LABEL: cttz_v4i16: 840; CHECK: // %bb.0: 841; CHECK-NEXT: ptrue p0.h, vl4 842; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 843; CHECK-NEXT: rbit z0.h, p0/m, z0.h 844; CHECK-NEXT: clz v0.4h, v0.4h 845; CHECK-NEXT: ret 846 %res = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %op) 847 ret <4 x i16> %res 848} 849 850define <8 x i16> @cttz_v8i16(<8 x i16> %op) vscale_range(2,0) #0 { 851; CHECK-LABEL: cttz_v8i16: 852; CHECK: // %bb.0: 853; CHECK-NEXT: ptrue p0.h, vl8 854; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 855; CHECK-NEXT: rbit z0.h, p0/m, z0.h 856; CHECK-NEXT: clz v0.8h, v0.8h 857; CHECK-NEXT: ret 858 %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %op) 859 ret <8 x i16> %res 860} 861 862define void @cttz_v16i16(ptr %a) vscale_range(2,0) #0 { 863; CHECK-LABEL: cttz_v16i16: 864; CHECK: // %bb.0: 865; CHECK-NEXT: ptrue p0.h, vl16 866; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 867; CHECK-NEXT: rbit z0.h, p0/m, z0.h 868; CHECK-NEXT: clz z0.h, p0/m, z0.h 869; CHECK-NEXT: st1h { z0.h }, p0, [x0] 870; CHECK-NEXT: ret 871 %op = load <16 x i16>, ptr %a 872 %res = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %op) 873 store <16 x i16> %res, ptr %a 874 ret void 875} 876 877define void @cttz_v32i16(ptr %a) #0 { 878; VBITS_GE_256-LABEL: cttz_v32i16: 879; VBITS_GE_256: // %bb.0: 880; VBITS_GE_256-NEXT: ptrue p0.h, vl16 881; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 882; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] 883; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] 884; VBITS_GE_256-NEXT: rbit z0.h, p0/m, z0.h 885; VBITS_GE_256-NEXT: rbit z1.h, p0/m, z1.h 886; VBITS_GE_256-NEXT: clz z0.h, p0/m, z0.h 887; VBITS_GE_256-NEXT: clz z1.h, p0/m, z1.h 888; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] 889; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] 890; VBITS_GE_256-NEXT: ret 891; 892; VBITS_GE_512-LABEL: cttz_v32i16: 893; VBITS_GE_512: // %bb.0: 894; VBITS_GE_512-NEXT: ptrue p0.h, 
vl32 895; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] 896; VBITS_GE_512-NEXT: rbit z0.h, p0/m, z0.h 897; VBITS_GE_512-NEXT: clz z0.h, p0/m, z0.h 898; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] 899; VBITS_GE_512-NEXT: ret 900 %op = load <32 x i16>, ptr %a 901 %res = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %op) 902 store <32 x i16> %res, ptr %a 903 ret void 904} 905 906define void @cttz_v64i16(ptr %a) vscale_range(8,0) #0 { 907; CHECK-LABEL: cttz_v64i16: 908; CHECK: // %bb.0: 909; CHECK-NEXT: ptrue p0.h, vl64 910; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 911; CHECK-NEXT: rbit z0.h, p0/m, z0.h 912; CHECK-NEXT: clz z0.h, p0/m, z0.h 913; CHECK-NEXT: st1h { z0.h }, p0, [x0] 914; CHECK-NEXT: ret 915 %op = load <64 x i16>, ptr %a 916 %res = call <64 x i16> @llvm.cttz.v64i16(<64 x i16> %op) 917 store <64 x i16> %res, ptr %a 918 ret void 919} 920 921define void @cttz_v128i16(ptr %a) vscale_range(16,0) #0 { 922; CHECK-LABEL: cttz_v128i16: 923; CHECK: // %bb.0: 924; CHECK-NEXT: ptrue p0.h, vl128 925; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 926; CHECK-NEXT: rbit z0.h, p0/m, z0.h 927; CHECK-NEXT: clz z0.h, p0/m, z0.h 928; CHECK-NEXT: st1h { z0.h }, p0, [x0] 929; CHECK-NEXT: ret 930 %op = load <128 x i16>, ptr %a 931 %res = call <128 x i16> @llvm.cttz.v128i16(<128 x i16> %op) 932 store <128 x i16> %res, ptr %a 933 ret void 934} 935 936; SVE is only used for the RBIT of 64-bit vectors; the CLZ is expected as NEON. 937define <2 x i32> @cttz_v2i32(<2 x i32> %op) vscale_range(2,0) #0 { 938; CHECK-LABEL: cttz_v2i32: 939; CHECK: // %bb.0: 940; CHECK-NEXT: ptrue p0.s, vl2 941; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 942; CHECK-NEXT: rbit z0.s, p0/m, z0.s 943; CHECK-NEXT: clz v0.2s, v0.2s 944; CHECK-NEXT: ret 945 %res = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %op) 946 ret <2 x i32> %res 947} 948 949; SVE is only used for the RBIT of 128-bit vectors; the CLZ is expected as NEON. 
; cttz on <4 x i32>, passed and returned by value. The expected code reverses
; the bits with a predicated SVE RBIT (vl4) and then counts leading zeros with
; the NEON form of CLZ.
define <4 x i32> @cttz_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: rbit z0.s, p0/m, z0.s
; CHECK-NEXT: clz v0.4s, v0.4s
; CHECK-NEXT: ret
  %cnt = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %op)
  ret <4 x i32> %cnt
}

; cttz on <8 x i32>, loaded from and stored back to %a. With a minimum vscale
; of 2 the whole vector is handled by one vl8-predicated load/RBIT/CLZ/store.
define void @cttz_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: rbit z0.s, p0/m, z0.s
; CHECK-NEXT: clz z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %in = load <8 x i32>, ptr %a
  %cnt = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in)
  store <8 x i32> %cnt, ptr %a
  ret void
}

; cttz on <16 x i32>. This function carries no vscale_range attribute, so the
; expected code depends on the run configuration: the 256-bit-minimum run
; processes two vl8 halves, the other runs use a single vl16 operation.
define void @cttz_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: cttz_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: rbit z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT: rbit z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT: clz z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT: clz z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: cttz_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: rbit z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT: clz z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %in = load <16 x i32>, ptr %a
  %cnt = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in)
  store <16 x i32> %cnt, ptr %a
  ret void
}

; cttz on <32 x i32> with a minimum vscale of 8: a single vl32-predicated
; load/RBIT/CLZ/store sequence is expected.
define void @cttz_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: cttz_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: rbit z0.s, p0/m, z0.s
; CHECK-NEXT: clz z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %in = load <32 x i32>, ptr %a
  %cnt = call <32 x i32> @llvm.cttz.v32i32(<32 x i32> %in)
  store <32 x i32> %cnt, ptr %a
  ret void
}

; cttz on <64 x i32> with a minimum vscale of 16: a single vl64-predicated
; load/RBIT/CLZ/store sequence is expected.
define void @cttz_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: cttz_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: rbit z0.s, p0/m, z0.s
; CHECK-NEXT: clz z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %in = load <64 x i32>, ptr %a
  %cnt = call <64 x i32> @llvm.cttz.v64i32(<64 x i32> %in)
  store <64 x i32> %cnt, ptr %a
  ret void
}

; cttz on <1 x i64>, passed and returned by value. Both RBIT and CLZ are
; expected in their predicated SVE forms (vl1), operating on z0.
define <1 x i64> @cttz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: rbit z0.d, p0/m, z0.d
; CHECK-NEXT: clz z0.d, p0/m, z0.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
  %cnt = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %op)
  ret <1 x i64> %cnt
}

; cttz on <2 x i64>, passed and returned by value. Both RBIT and CLZ are
; expected in their predicated SVE forms (vl2), operating on z0.
define <2 x i64> @cttz_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: rbit z0.d, p0/m, z0.d
; CHECK-NEXT: clz z0.d, p0/m, z0.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %cnt = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %op)
  ret <2 x i64> %cnt
}

; cttz on <4 x i64>, loaded from and stored back to %a: one vl4-predicated
; load/RBIT/CLZ/store sequence is expected.
define void @cttz_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: cttz_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: rbit z0.d, p0/m, z0.d
; CHECK-NEXT: clz z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %in = load <4 x i64>, ptr %a
  %cnt = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in)
  store <4 x i64> %cnt, ptr %a
  ret void
}

; cttz on <8 x i64>. This function carries no vscale_range attribute, so the
; expected code depends on the run configuration: the 256-bit-minimum run
; processes two vl4 halves, the other runs use a single vl8 operation.
define void @cttz_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: cttz_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: rbit z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT: rbit z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT: clz z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT: clz z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: cttz_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: rbit z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT: clz z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %in = load <8 x i64>, ptr %a
  %cnt = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in)
  store <8 x i64> %cnt, ptr %a
  ret void
}

; cttz on <16 x i64> with a minimum vscale of 8: a single vl16-predicated
; load/RBIT/CLZ/store sequence is expected.
define void @cttz_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: cttz_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: rbit z0.d, p0/m, z0.d
; CHECK-NEXT: clz z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %in = load <16 x i64>, ptr %a
  %cnt = call <16 x i64> @llvm.cttz.v16i64(<16 x i64> %in)
  store <16 x i64> %cnt, ptr %a
  ret void
}

; cttz on <32 x i64> with a minimum vscale of 16: a single vl32-predicated
; load/RBIT/CLZ/store sequence is expected.
define void @cttz_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: cttz_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: rbit z0.d, p0/m, z0.d
; CHECK-NEXT: clz z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %in = load <32 x i64>, ptr %a
  %cnt = call <32 x i64> @llvm.cttz.v32i64(<32 x i64> %in)
  store <32 x i64> %cnt, ptr %a
  ret void
}

; Attribute group referenced as #0 by the functions above: enables the SVE
; target feature.
attributes #0 = { "target-features"="+sve" }

; Count-leading-zeros intrinsic declarations, grouped by element width.
; NOTE(review): these use a one-operand signature, whereas the LangRef documents
; a second i1 operand for ctlz - presumably handled by IR auto-upgrade; confirm
; before changing.

; i8 elements
declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>)
declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>)
declare <32 x i8> @llvm.ctlz.v32i8(<32 x i8>)
declare <64 x i8> @llvm.ctlz.v64i8(<64 x i8>)
declare <128 x i8> @llvm.ctlz.v128i8(<128 x i8>)
declare <256 x i8> @llvm.ctlz.v256i8(<256 x i8>)
; i16 elements
declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>)
declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>)
declare <16 x i16> @llvm.ctlz.v16i16(<16 x i16>)
declare <32 x i16> @llvm.ctlz.v32i16(<32 x i16>)
declare <64 x i16> @llvm.ctlz.v64i16(<64 x i16>)
declare <128 x i16> @llvm.ctlz.v128i16(<128 x i16>)
; i32 elements
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>)
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>)
declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>)
declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>)
declare <32 x i32> @llvm.ctlz.v32i32(<32 x i32>)
declare <64 x i32> @llvm.ctlz.v64i32(<64 x i32>)
; i64 elements
declare <1 x i64> @llvm.ctlz.v1i64(<1 x i64>)
declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>)
declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>)
declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>)
declare <16 x i64> @llvm.ctlz.v16i64(<16 x i64>)
declare <32 x i64> @llvm.ctlz.v32i64(<32 x i64>)

; Population-count intrinsic declarations (first i8 entries).
declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>)
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>)
declare <64 x i8> @llvm.ctpop.v64i8(<64 x i8>)
; Population-count intrinsic declarations: remaining i8 entries.
declare <128 x i8> @llvm.ctpop.v128i8(<128 x i8>)
declare <256 x i8> @llvm.ctpop.v256i8(<256 x i8>)
; i16 elements
declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>)
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>)
declare <32 x i16> @llvm.ctpop.v32i16(<32 x i16>)
declare <64 x i16> @llvm.ctpop.v64i16(<64 x i16>)
declare <128 x i16> @llvm.ctpop.v128i16(<128 x i16>)
; i32 elements
declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>)
declare <32 x i32> @llvm.ctpop.v32i32(<32 x i32>)
declare <64 x i32> @llvm.ctpop.v64i32(<64 x i32>)
; i64 elements
declare <1 x i64> @llvm.ctpop.v1i64(<1 x i64>)
declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>)
declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>)
declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>)
declare <32 x i64> @llvm.ctpop.v32i64(<32 x i64>)

; Count-trailing-zeros intrinsic declarations, grouped by element width.
; NOTE(review): these use a one-operand signature, whereas the LangRef documents
; a second i1 operand for cttz - presumably handled by IR auto-upgrade; confirm
; before changing.

; i8 elements
declare <8 x i8> @llvm.cttz.v8i8(<8 x i8>)
declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>)
declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>)
declare <64 x i8> @llvm.cttz.v64i8(<64 x i8>)
declare <128 x i8> @llvm.cttz.v128i8(<128 x i8>)
declare <256 x i8> @llvm.cttz.v256i8(<256 x i8>)
; i16 elements
declare <4 x i16> @llvm.cttz.v4i16(<4 x i16>)
declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>)
declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>)
declare <32 x i16> @llvm.cttz.v32i16(<32 x i16>)
declare <64 x i16> @llvm.cttz.v64i16(<64 x i16>)
declare <128 x i16> @llvm.cttz.v128i16(<128 x i16>)
; i32 elements
declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>)
declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>)
declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>)
declare <16 x i32> @llvm.cttz.v16i32(<16 x i32>)
declare <32 x i32> @llvm.cttz.v32i32(<32 x i32>)
declare <64 x i32> @llvm.cttz.v64i32(<64 x i32>)
; i64 elements
declare <1 x i64> @llvm.cttz.v1i64(<1 x i64>)
declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>)
declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>)
declare <8 x i64> @llvm.cttz.v8i64(<8 x i64>)
declare <16 x i64> @llvm.cttz.v16i64(<16 x i64>)
declare <32 x i64> @llvm.cttz.v32i64(<32 x i64>)