; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; ASHR
;
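; For the NEON-sized cases below, ashr is expected to stay out of SVE: NEON
; has no variable-amount right shift, so the shift amounts are negated and
; SSHL (a left shift by a signed, per-lane amount) is used instead. Wider
; fixed-length vectors are expected to map onto SVE, where a PTRUE with an
; exact vector-length count (e.g. vl32) predicates the lanes and the shift
; becomes a predicated ASR. Where the type exceeds the guaranteed register
; size (e.g. v64i8 when only 256-bit registers are guaranteed), the operation
; is split in half, with MOVPRFX giving the destructive ASR a non-destructive
; three-operand form.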

; Don't use SVE for 64-bit vectors.
define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.8b, v1.8b
; CHECK-NEXT:    sshl v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = ashr <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.16b, v1.16b
; CHECK-NEXT:    sshl v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = ashr <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @ashr_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = ashr <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @ashr_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: ashr_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    asr z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    asr z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: ashr_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    asr z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = ashr <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @ashr_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: ashr_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = ashr <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @ashr_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: ashr_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    asr z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = ashr <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.4h, v1.4h
; CHECK-NEXT:    sshl v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = ashr <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.8h, v1.8h
; CHECK-NEXT:    sshl v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = ashr <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @ashr_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = ashr <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @ashr_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: ashr_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    asr z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    asr z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: ashr_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    asr z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = ashr <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @ashr_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: ashr_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = ashr <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @ashr_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: ashr_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    asr z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = ashr <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.2s, v1.2s
; CHECK-NEXT:    sshl v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = ashr <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.4s, v1.4s
; CHECK-NEXT:    sshl v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = ashr <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @ashr_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = ashr <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @ashr_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: ashr_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    asr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    asr z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: ashr_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    asr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = ashr <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @ashr_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: ashr_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = ashr <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @ashr_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: ashr_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    asr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = ashr <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg d1, d1
; CHECK-NEXT:    sshl d0, d0, d1
; CHECK-NEXT:    ret
  %res = ashr <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.2d, v1.2d
; CHECK-NEXT:    sshl v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = ashr <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @ashr_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: ashr_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = ashr <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @ashr_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: ashr_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    asr z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    asr z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: ashr_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    asr z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = ashr <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @ashr_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: ashr_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = ashr <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @ashr_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: ashr_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    asr z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = ashr <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

;
; LSHR
;
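; The lshr tests follow the same pattern as ashr: NEON-sized vectors negate
; the shift amounts and use USHL (the logical form, so zeroes are shifted
; in), while wider vectors are expected to use the predicated SVE LSR, split
; in half with MOVPRFX where the type exceeds the guaranteed register size.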

; Don't use SVE for 64-bit vectors.
define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.8b, v1.8b
; CHECK-NEXT:    ushl v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = lshr <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.16b, v1.16b
; CHECK-NEXT:    ushl v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = lshr <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @lshr_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = lshr <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @lshr_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: lshr_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    lsr z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: lshr_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = lshr <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @lshr_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: lshr_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = lshr <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @lshr_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: lshr_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = lshr <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.4h, v1.4h
; CHECK-NEXT:    ushl v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = lshr <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.8h, v1.8h
; CHECK-NEXT:    ushl v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = lshr <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @lshr_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = lshr <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @lshr_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: lshr_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    lsr z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: lshr_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = lshr <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @lshr_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: lshr_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = lshr <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @lshr_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: lshr_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = lshr <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.2s, v1.2s
; CHECK-NEXT:    ushl v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = lshr <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.4s, v1.4s
; CHECK-NEXT:    ushl v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = lshr <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @lshr_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = lshr <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @lshr_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: lshr_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    lsr z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: lshr_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = lshr <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @lshr_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: lshr_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = lshr <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @lshr_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: lshr_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = lshr <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg d1, d1
; CHECK-NEXT:    ushl d0, d0, d1
; CHECK-NEXT:    ret
  %res = lshr <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    neg v1.2d, v1.2d
; CHECK-NEXT:    ushl v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = lshr <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @lshr_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: lshr_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = lshr <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @lshr_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: lshr_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    lsr z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: lshr_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = lshr <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @lshr_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: lshr_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = lshr <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @lshr_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: lshr_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = lshr <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

;
; SHL
;
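; Left shifts need no negation trick: NEON's USHL handles shl directly (a
; left shift by a positive amount is the same for signed and unsigned data),
; and wider vectors are expected to use the predicated SVE LSL.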

; Don't use SVE for 64-bit vectors.
define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushl v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = shl <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushl v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = shl <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @shl_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = shl <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @shl_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: shl_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    lsl z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: shl_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = shl <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @shl_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: shl_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = shl <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @shl_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: shl_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = shl <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushl v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = shl <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushl v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = shl <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @shl_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = shl <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @shl_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: shl_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    lsl z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: shl_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = shl <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @shl_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: shl_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = shl <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @shl_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: shl_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = shl <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushl v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = shl <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushl v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = shl <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @shl_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = shl <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @shl_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: shl_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    lsl z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: shl_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = shl <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @shl_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: shl_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = shl <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @shl_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: shl_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = shl <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushl d0, d0, d1
; CHECK-NEXT:    ret
  %res = shl <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushl v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = shl <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @shl_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: shl_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = shl <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @shl_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: shl_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    lsl z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: shl_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = shl <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @shl_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: shl_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = shl <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @shl_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: shl_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    lsl z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = shl <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

attributes #0 = { "target-features"="+sve" }