; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; NOTE(review): CHECK lines below are tool-generated; regenerate with the
; script above rather than editing them by hand.
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; ADD
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = add <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = add <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @add_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    add z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = add <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

; No vscale_range: exercises both the 256-bit (split into two ops) and
; >=512-bit (single op) lowerings via the two RUN-line prefixes.
define void @add_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: add_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    add z0.b, z0.b, z1.b
; VBITS_GE_256-NEXT:    add z1.b, z2.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: add_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    add z0.b, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = add <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @add_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    add z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = add <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @add_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: add_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    add z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = add <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = add <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
; NOTE(review): ADD, i16 and i32 element types. CHECK lines are autogenerated.
define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = add <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @add_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    add z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = add <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @add_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: add_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    add z0.h, z0.h, z1.h
; VBITS_GE_256-NEXT:    add z1.h, z2.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: add_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    add z0.h, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = add <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @add_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    add z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = add <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @add_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: add_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    add z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = add <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = add <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
; NOTE(review): ADD, i32 element type. CHECK lines are autogenerated.
define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = add <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @add_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    add z0.s, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = add <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @add_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: add_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    add z0.s, z0.s, z1.s
; VBITS_GE_256-NEXT:    add z1.s, z2.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: add_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    add z0.s, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = add <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @add_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    add z0.s, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = add <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @add_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: add_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    add z0.s, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = add <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add d0, d0, d1
; CHECK-NEXT:    ret
  %res = add <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Don't use SVE for 128-bit vectors.
; NOTE(review): ADD, i64 element type, then start of the MUL section.
define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = add <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @add_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    add z0.d, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = add <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @add_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: add_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    add z0.d, z0.d, z1.d
; VBITS_GE_256-NEXT:    add z1.d, z2.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: add_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    add z0.d, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = add <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @add_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    add z0.d, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = add <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

; vscale_range(8,0) caps usable width at 1024 bits here, so this 2048-bit
; operation is deliberately split into two vl16 halves (see CHECK lines).
define void @add_v32i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    mov x8, #16 // =0x10
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x1]
; CHECK-NEXT:    add z0.d, z0.d, z1.d
; CHECK-NEXT:    add z1.d, z2.d, z3.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = add <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

;
; MUL
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = mul <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
; NOTE(review): MUL, i8 element type. SVE mul is predicated (p0/m), hence the
; movprfx in the split VBITS_GE_256 lowering.
define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = mul <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @mul_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = mul <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @mul_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: mul_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    mul z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: mul_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = mul <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @mul_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = mul <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @mul_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = mul <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = mul <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
; NOTE(review): MUL, i16 element type. CHECK lines are autogenerated.
define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = mul <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @mul_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = mul <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @mul_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: mul_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    mul z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: mul_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = mul <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @mul_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = mul <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @mul_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = mul <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = mul <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
; NOTE(review): MUL, i32/i64 element types, then start of the SUB section.
define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = mul <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @mul_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = mul <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @mul_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: mul_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    mul z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: mul_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = mul <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @mul_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = mul <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @mul_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = mul <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Unlike the smaller element types, these sub-128-bit i64 multiplies are
; lowered with SVE (ptrue vl1/vl2 plus register-class "kill" copies), per
; the autogenerated CHECK lines.
define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; CHECK-LABEL: mul_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = mul <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; CHECK-LABEL: mul_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = mul <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @mul_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = mul <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @mul_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: mul_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    mul z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: mul_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = mul <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @mul_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = mul <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @mul_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = mul <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

;
; SUB
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = sub <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = sub <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @sub_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sub z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = sub <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @sub_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: sub_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sub z0.b, z0.b, z1.b
; VBITS_GE_256-NEXT:    sub z1.b, z2.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sub_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    sub z0.b, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = sub <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @sub_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sub z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = sub <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @sub_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sub z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = sub <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
; NOTE(review): SUB, i16 and i32 element types. CHECK lines are autogenerated.
define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = sub <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = sub <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @sub_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sub z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = sub <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @sub_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: sub_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sub z0.h, z0.h, z1.h
; VBITS_GE_256-NEXT:    sub z1.h, z2.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sub_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    sub z0.h, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = sub <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @sub_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sub z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = sub <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @sub_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sub z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = sub <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = sub <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
  %res = sub <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @sub_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: sub z0.s, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = sub <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

; No vscale_range: VBITS_GE_256 must split into two halves, VBITS_GE_512 uses one operation.
define void @sub_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: sub_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: sub z0.s, z0.s, z1.s
; VBITS_GE_256-NEXT: sub z1.s, z2.s, z3.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sub_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: sub z0.s, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = sub <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @sub_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: sub z0.s, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = sub <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @sub_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: sub z0.s, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = sub <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: sub d0, d0, d1
; CHECK-NEXT: ret
  %res = sub <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
  %res = sub <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @sub_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: sub z0.d, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = sub <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

; No vscale_range: VBITS_GE_256 must split into two halves, VBITS_GE_512 uses one operation.
define void @sub_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: sub_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: sub z0.d, z0.d, z1.d
; VBITS_GE_256-NEXT: sub z1.d, z2.d, z3.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sub_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: sub z0.d, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = sub <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @sub_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: sub z0.d, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = sub <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @sub_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: sub z0.d, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = sub <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}


;
; ABS
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @abs_v8i8(<8 x i8> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: abs v0.8b, v0.8b
; CHECK-NEXT: ret
  %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false)
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @abs_v16i8(<16 x i8> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: abs v0.16b, v0.16b
; CHECK-NEXT: ret
  %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false)
  ret <16 x i8> %res
}

define void @abs_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: abs z0.b, p0/m, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i8>, ptr %a
  %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false)
  store <32 x i8> %res, ptr %a
  ret void
}

; No vscale_range: VBITS_GE_256 must split into two halves, VBITS_GE_512 uses one operation.
define void @abs_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: abs_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: abs z0.b, p0/m, z0.b
; VBITS_GE_256-NEXT: abs z1.b, p0/m, z1.b
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: abs_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: abs z0.b, p0/m, z0.b
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <64 x i8>, ptr %a
  %res = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %op1, i1 false)
  store <64 x i8> %res, ptr %a
  ret void
}

define void @abs_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: abs_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: abs z0.b, p0/m, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <128 x i8>, ptr %a
  %res = call <128 x i8> @llvm.abs.v128i8(<128 x i8> %op1, i1 false)
  store <128 x i8> %res, ptr %a
  ret void
}

define void @abs_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: abs_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: abs z0.b, p0/m, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <256 x i8>, ptr %a
  %res = call <256 x i8> @llvm.abs.v256i8(<256 x i8> %op1, i1 false)
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @abs_v4i16(<4 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: abs v0.4h, v0.4h
; CHECK-NEXT: ret
  %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false)
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @abs_v8i16(<8 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: abs v0.8h, v0.8h
; CHECK-NEXT: ret
  %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false)
  ret <8 x i16> %res
}

define void @abs_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: abs z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x i16>, ptr %a
  %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false)
  store <16 x i16> %res, ptr %a
  ret void
}

define void @abs_v32i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v32i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: mov x8, #16 // =0x10
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
; CHECK-NEXT: abs z0.h, p0/m, z0.h
; CHECK-NEXT: abs z1.h, p0/m, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; CHECK-NEXT: st1h { z1.h }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i16>, ptr %a
  %res = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %op1, i1 false)
  store <32 x i16> %res, ptr %a
  ret void
}

define void @abs_v64i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: mov x8, #32 // =0x20
; CHECK-NEXT: mov x9, #48 // =0x30
; CHECK-NEXT: mov x10, #16 // =0x10
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0]
; CHECK-NEXT: abs z0.h, p0/m, z0.h
; CHECK-NEXT: abs z1.h, p0/m, z1.h
; CHECK-NEXT: abs z2.h, p0/m, z2.h
; CHECK-NEXT: abs z3.h, p0/m, z3.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; CHECK-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
; CHECK-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
; CHECK-NEXT: st1h { z3.h }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <64 x i16>, ptr %a
  %res = call <64 x i16> @llvm.abs.v64i16(<64 x i16> %op1, i1 false)
  store <64 x i16> %res, ptr %a
  ret void
}

define void @abs_v128i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: mov x8, #96 // =0x60
; CHECK-NEXT: mov x9, #112 // =0x70
; CHECK-NEXT: mov x10, #64 // =0x40
; CHECK-NEXT: mov x11, #80 // =0x50
; CHECK-NEXT: mov x12, #32 // =0x20
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; CHECK-NEXT: mov x13, #48 // =0x30
; CHECK-NEXT: mov x14, #16 // =0x10
; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
; CHECK-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1]
; CHECK-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1]
; CHECK-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1]
; CHECK-NEXT: abs z0.h, p0/m, z0.h
; CHECK-NEXT: abs z1.h, p0/m, z1.h
; CHECK-NEXT: abs z2.h, p0/m, z2.h
; CHECK-NEXT: abs z3.h, p0/m, z3.h
; CHECK-NEXT: abs z4.h, p0/m, z4.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
; CHECK-NEXT: movprfx z1, z5
; CHECK-NEXT: abs z1.h, p0/m, z5.h
; CHECK-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
; CHECK-NEXT: movprfx z2, z6
; CHECK-NEXT: abs z2.h, p0/m, z6.h
; CHECK-NEXT: abs z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z3.h }, p0, [x0, x11, lsl #1]
; CHECK-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1]
; CHECK-NEXT: st1h { z1.h }, p0, [x0, x13, lsl #1]
; CHECK-NEXT: st1h { z2.h }, p0, [x0, x14, lsl #1]
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <128 x i16>, ptr %a
  %res = call <128 x i16> @llvm.abs.v128i16(<128 x i16> %op1, i1 false)
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @abs_v2i32(<2 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: abs v0.2s, v0.2s
; CHECK-NEXT: ret
  %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @abs_v4i32(<4 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: abs v0.4s, v0.4s
; CHECK-NEXT: ret
  %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
  ret <4 x i32> %res
}

define void @abs_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: abs z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x i32>, ptr %a
  %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false)
  store <8 x i32> %res, ptr %a
  ret void
}

; No vscale_range: VBITS_GE_256 must split into two halves, VBITS_GE_512 uses one operation.
define void @abs_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: abs_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: abs z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT: abs z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: abs_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: abs z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <16 x i32>, ptr %a
  %res = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %op1, i1 false)
  store <16 x i32> %res, ptr %a
  ret void
}

define void @abs_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: abs_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: abs z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i32>, ptr %a
  %res = call <32 x i32> @llvm.abs.v32i32(<32 x i32> %op1, i1 false)
  store <32 x i32> %res, ptr %a
  ret void
}

define void @abs_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: abs_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: abs z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <64 x i32>, ptr %a
  %res = call <64 x i32> @llvm.abs.v64i32(<64 x i32> %op1, i1 false)
  store <64 x i32> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @abs_v1i64(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: abs d0, d0
; CHECK-NEXT: ret
  %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false)
  ret <1 x i64> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x i64> @abs_v2i64(<2 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: abs v0.2d, v0.2d
; CHECK-NEXT: ret
  %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
  ret <2 x i64> %res
}

define void @abs_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: abs z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x i64>, ptr %a
  %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false)
  store <4 x i64> %res, ptr %a
  ret void
}

; No vscale_range: VBITS_GE_256 must split into two halves, VBITS_GE_512 uses one operation.
define void @abs_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: abs_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: abs z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT: abs z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: abs_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: abs z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load <8 x i64>, ptr %a
  %res = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %op1, i1 false)
  store <8 x i64> %res, ptr %a
  ret void
}

define void @abs_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: abs_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: abs z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x i64>, ptr %a
  %res = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %op1, i1 false)
  store <16 x i64> %res, ptr %a
  ret void
}

define void @abs_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: abs_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: abs z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %op1 = load <32 x i64>, ptr %a
  %res = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %op1, i1 false)
  store <32 x i64> %res, ptr %a
  ret void
}

declare <8 x i8> @llvm.abs.v8i8(<8 x i8>, i1)
declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1)
declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1)
declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1)
declare <128 x i8> @llvm.abs.v128i8(<128 x i8>, i1)
declare <256 x i8> @llvm.abs.v256i8(<256 x i8>, i1)
declare <4 x i16> @llvm.abs.v4i16(<4 x i16>, i1)
declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1)
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1)
declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1)
declare <64 x i16> @llvm.abs.v64i16(<64 x i16>, i1)
declare <128 x i16> @llvm.abs.v128i16(<128 x i16>, i1)
declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1)
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1)
declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
declare <32 x i32> @llvm.abs.v32i32(<32 x i32>, i1)
declare <64 x i32> @llvm.abs.v64i32(<64 x i32>, i1)
declare <1 x i64> @llvm.abs.v1i64(<1 x i64>, i1)
declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1)
declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)
declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
declare <32 x i64> @llvm.abs.v32i64(<32 x i64>, i1)

attributes #0 = { "target-features"="+sve" }