; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; FADD
;

; Don't use SVE for 64-bit vectors.
define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = fadd <4 x half> %op1, %op2
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = fadd <8 x half> %op1, %op2
  ret <8 x half> %res
}

define void @fadd_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x half>, ptr %a
  %op2 = load <16 x half>, ptr %b
  %res = fadd <16 x half> %op1, %op2
  store <16 x half> %res, ptr %a
  ret void
}

define void @fadd_v32f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fadd_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    fadd z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fadd_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x half>, ptr %a
  %op2 = load <32 x half>, ptr %b
  %res = fadd <32 x half> %op1, %op2
  store <32 x half> %res, ptr %a
  ret void
}

define void @fadd_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fadd_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x half>, ptr %a
  %op2 = load <64 x half>, ptr %b
  %res = fadd <64 x half> %op1, %op2
  store <64 x half> %res, ptr %a
  ret void
}

define void @fadd_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fadd_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x half>, ptr %a
  %op2 = load <128 x half>, ptr %b
  %res = fadd <128 x half> %op1, %op2
  store <128 x half> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = fadd <2 x float> %op1, %op2
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = fadd <4 x float> %op1, %op2
  ret <4 x float> %res
}

define void @fadd_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x float>, ptr %a
  %op2 = load <8 x float>, ptr %b
  %res = fadd <8 x float> %op1, %op2
  store <8 x float> %res, ptr %a
  ret void
}

define void @fadd_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fadd_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    fadd z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fadd_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x float>, ptr %a
  %op2 = load <16 x float>, ptr %b
  %res = fadd <16 x float> %op1, %op2
  store <16 x float> %res, ptr %a
  ret void
}

define void @fadd_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fadd_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x float>, ptr %a
  %op2 = load <32 x float>, ptr %b
  %res = fadd <32 x float> %op1, %op2
  store <32 x float> %res, ptr %a
  ret void
}

define void @fadd_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fadd_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x float>, ptr %a
  %op2 = load <64 x float>, ptr %b
  %res = fadd <64 x float> %op1, %op2
  store <64 x float> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @fadd_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd d0, d0, d1
; CHECK-NEXT:    ret
  %res = fadd <1 x double> %op1, %op2
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = fadd <2 x double> %op1, %op2
  ret <2 x double> %res
}

define void @fadd_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x double>, ptr %a
  %op2 = load <4 x double>, ptr %b
  %res = fadd <4 x double> %op1, %op2
  store <4 x double> %res, ptr %a
  ret void
}

define void @fadd_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fadd_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    fadd z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fadd_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x double>, ptr %a
  %op2 = load <8 x double>, ptr %b
  %res = fadd <8 x double> %op1, %op2
  store <8 x double> %res, ptr %a
  ret void
}

define void @fadd_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fadd_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x double>, ptr %a
  %op2 = load <16 x double>, ptr %b
  %res = fadd <16 x double> %op1, %op2
  store <16 x double> %res, ptr %a
  ret void
}

define void @fadd_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fadd_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x double>, ptr %a
  %op2 = load <32 x double>, ptr %b
  %res = fadd <32 x double> %op1, %op2
  store <32 x double> %res, ptr %a
  ret void
}

;
; FDIV
;

; Don't use SVE for 64-bit vectors.
define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fdiv v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = fdiv <4 x half> %op1, %op2
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fdiv v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = fdiv <8 x half> %op1, %op2
  ret <8 x half> %res
}

define void @fdiv_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x half>, ptr %a
  %op2 = load <16 x half>, ptr %b
  %res = fdiv <16 x half> %op1, %op2
  store <16 x half> %res, ptr %a
  ret void
}

define void @fdiv_v32f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fdiv_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fdiv z1.h, p0/m, z1.h, z2.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fdiv_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x half>, ptr %a
  %op2 = load <32 x half>, ptr %b
  %res = fdiv <32 x half> %op1, %op2
  store <32 x half> %res, ptr %a
  ret void
}

define void @fdiv_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fdiv_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x half>, ptr %a
  %op2 = load <64 x half>, ptr %b
  %res = fdiv <64 x half> %op1, %op2
  store <64 x half> %res, ptr %a
  ret void
}

define void @fdiv_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fdiv_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x half>, ptr %a
  %op2 = load <128 x half>, ptr %b
  %res = fdiv <128 x half> %op1, %op2
  store <128 x half> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fdiv v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = fdiv <2 x float> %op1, %op2
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fdiv v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = fdiv <4 x float> %op1, %op2
  ret <4 x float> %res
}

define void @fdiv_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x float>, ptr %a
  %op2 = load <8 x float>, ptr %b
  %res = fdiv <8 x float> %op1, %op2
  store <8 x float> %res, ptr %a
  ret void
}

define void @fdiv_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fdiv_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fdiv z1.s, p0/m, z1.s, z2.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fdiv_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x float>, ptr %a
  %op2 = load <16 x float>, ptr %b
  %res = fdiv <16 x float> %op1, %op2
  store <16 x float> %res, ptr %a
  ret void
}

define void @fdiv_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fdiv_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x float>, ptr %a
  %op2 = load <32 x float>, ptr %b
  %res = fdiv <32 x float> %op1, %op2
  store <32 x float> %res, ptr %a
  ret void
}

define void @fdiv_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fdiv_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x float>, ptr %a
  %op2 = load <64 x float>, ptr %b
  %res = fdiv <64 x float> %op1, %op2
  store <64 x float> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @fdiv_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fdiv d0, d0, d1
; CHECK-NEXT:    ret
  %res = fdiv <1 x double> %op1, %op2
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fdiv v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = fdiv <2 x double> %op1, %op2
  ret <2 x double> %res
}

define void @fdiv_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x double>, ptr %a
  %op2 = load <4 x double>, ptr %b
  %res = fdiv <4 x double> %op1, %op2
  store <4 x double> %res, ptr %a
  ret void
}

define void @fdiv_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fdiv_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fdiv z1.d, p0/m, z1.d, z2.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fdiv_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x double>, ptr %a
  %op2 = load <8 x double>, ptr %b
  %res = fdiv <8 x double> %op1, %op2
  store <8 x double> %res, ptr %a
  ret void
}

define void @fdiv_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fdiv_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x double>, ptr %a
  %op2 = load <16 x double>, ptr %b
  %res = fdiv <16 x double> %op1, %op2
  store <16 x double> %res, ptr %a
  ret void
}

define void @fdiv_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fdiv_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x double>, ptr %a
  %op2 = load <32 x double>, ptr %b
  %res = fdiv <32 x double> %op1, %op2
  store <32 x double> %res, ptr %a
  ret void
}

;
; FMA
;

; Don't use SVE for 64-bit vectors.
define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla v2.4h, v1.4h, v0.4h
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
  %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla v2.8h, v1.8h, v0.8h
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %res = call <8 x half> @llvm.fma.v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
  ret <8 x half> %res
}

define void @fma_v16f16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x half>, ptr %a
  %op2 = load <16 x half>, ptr %b
  %op3 = load <16 x half>, ptr %c
  %res = call <16 x half> @llvm.fma.v16f16(<16 x half> %op1, <16 x half> %op2, <16 x half> %op3)
  store <16 x half> %res, ptr %a
  ret void
}

define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 {
; VBITS_GE_256-LABEL: fma_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x2]
; VBITS_GE_256-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
; VBITS_GE_256-NEXT:    movprfx z1, z5
; VBITS_GE_256-NEXT:    fmla z1.h, p0/m, z3.h, z4.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fma_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ld1h { z2.h }, p0/z, [x2]
; VBITS_GE_512-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x half>, ptr %a
  %op2 = load <32 x half>, ptr %b
  %op3 = load <32 x half>, ptr %c
  %res = call <32 x half> @llvm.fma.v32f16(<32 x half> %op1, <32 x half> %op2, <32 x half> %op3)
  store <32 x half> %res, ptr %a
  ret void
}

define void @fma_v64f16(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x half>, ptr %a
  %op2 = load <64 x half>, ptr %b
  %op3 = load <64 x half>, ptr %c
  %res = call <64 x half> @llvm.fma.v64f16(<64 x half> %op1, <64 x half> %op2, <64 x half> %op3)
  store <64 x half> %res, ptr %a
  ret void
}

define void @fma_v128f16(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x half>, ptr %a
  %op2 = load <128 x half>, ptr %b
  %op3 = load <128 x half>, ptr %c
  %res = call <128 x half> @llvm.fma.v128f16(<128 x half> %op1, <128 x half> %op2, <128 x half> %op3)
  store <128 x half> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla v2.2s, v1.2s, v0.2s
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
  %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3)
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla v2.4s, v1.4s, v0.4s
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3)
  ret <4 x float> %res
}

define void @fma_v8f32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x float>, ptr %a
  %op2 = load <8 x float>, ptr %b
  %op3 = load <8 x float>, ptr %c
  %res = call <8 x float> @llvm.fma.v8f32(<8 x float> %op1, <8 x float> %op2, <8 x float> %op3)
  store <8 x float> %res, ptr %a
  ret void
}

define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 {
; VBITS_GE_256-LABEL: fma_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x2]
; VBITS_GE_256-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
; VBITS_GE_256-NEXT:    movprfx z1, z5
; VBITS_GE_256-NEXT:    fmla z1.s, p0/m, z3.s, z4.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fma_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ld1w { z2.s }, p0/z, [x2]
; VBITS_GE_512-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x float>, ptr %a
  %op2 = load <16 x float>, ptr %b
  %op3 = load <16 x float>, ptr %c
  %res = call <16 x float> @llvm.fma.v16f32(<16 x float> %op1, <16 x float> %op2, <16 x float> %op3)
  store <16 x float> %res, ptr %a
  ret void
}

define void @fma_v32f32(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x float>, ptr %a
  %op2 = load <32 x float>, ptr %b
  %op3 = load <32 x float>, ptr %c
  %res = call <32 x float> @llvm.fma.v32f32(<32 x float> %op1, <32 x float> %op2, <32 x float> %op3)
  store <32 x float> %res, ptr %a
  ret void
}

define void @fma_v64f32(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x float>, ptr %a
  %op2 = load <64 x float>, ptr %b
  %op3 = load <64 x float>, ptr %c
  %res = call <64 x float> @llvm.fma.v64f32(<64 x float> %op1, <64 x float> %op2, <64 x float> %op3)
  store <64 x float> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmadd d0, d0, d1, d2
; CHECK-NEXT:    ret
  %res = call <1 x double> @llvm.fma.v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3)
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla v2.2d, v1.2d, v0.2d
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3)
  ret <2 x double> %res
}

define void @fma_v4f64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x double>, ptr %a
  %op2 = load <4 x double>, ptr %b
  %op3 = load <4 x double>, ptr %c
  %res = call <4 x double> @llvm.fma.v4f64(<4 x double> %op1, <4 x double> %op2, <4 x double> %op3)
  store <4 x double> %res, ptr %a
  ret void
}

define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 {
; VBITS_GE_256-LABEL: fma_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x2]
; VBITS_GE_256-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
; VBITS_GE_256-NEXT:    movprfx z1, z5
; VBITS_GE_256-NEXT:    fmla z1.d, p0/m, z3.d, z4.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fma_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x2]
; VBITS_GE_512-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x double>, ptr %a
  %op2 = load <8 x double>, ptr %b
  %op3 = load <8 x double>, ptr %c
  %res = call <8 x double> @llvm.fma.v8f64(<8 x double> %op1, <8 x double> %op2, <8 x double> %op3)
  store <8 x double> %res, ptr %a
  ret void
}

define void @fma_v16f64(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x double>, ptr %a
  %op2 = load <16 x double>, ptr %b
  %op3 = load <16 x double>, ptr %c
  %res = call <16 x double> @llvm.fma.v16f64(<16 x double> %op1, <16 x double> %op2, <16 x double> %op3)
  store <16 x double> %res, ptr %a
  ret void
}

define void @fma_v32f64(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x double>, ptr %a
  %op2 = load <32 x double>, ptr %b
  %op3 = load <32 x double>, ptr %c
  %res = call <32 x double> @llvm.fma.v32f64(<32 x double> %op1, <32 x double> %op2, <32 x double> %op3)
  store <32 x double> %res, ptr %a
  ret void
}

;
; FMUL
;

; Don't use SVE for 64-bit vectors.
define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmul v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = fmul <4 x half> %op1, %op2
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
958define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 { 959; CHECK-LABEL: fmul_v8f16: 960; CHECK: // %bb.0: 961; CHECK-NEXT: fmul v0.8h, v0.8h, v1.8h 962; CHECK-NEXT: ret 963 %res = fmul <8 x half> %op1, %op2 964 ret <8 x half> %res 965} 966 967define void @fmul_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { 968; CHECK-LABEL: fmul_v16f16: 969; CHECK: // %bb.0: 970; CHECK-NEXT: ptrue p0.h, vl16 971; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 972; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] 973; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h 974; CHECK-NEXT: st1h { z0.h }, p0, [x0] 975; CHECK-NEXT: ret 976 %op1 = load <16 x half>, ptr %a 977 %op2 = load <16 x half>, ptr %b 978 %res = fmul <16 x half> %op1, %op2 979 store <16 x half> %res, ptr %a 980 ret void 981} 982 983define void @fmul_v32f16(ptr %a, ptr %b) #0 { 984; VBITS_GE_256-LABEL: fmul_v32f16: 985; VBITS_GE_256: // %bb.0: 986; VBITS_GE_256-NEXT: ptrue p0.h, vl16 987; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 988; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] 989; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] 990; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] 991; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] 992; VBITS_GE_256-NEXT: fmul z0.h, p0/m, z0.h, z1.h 993; VBITS_GE_256-NEXT: movprfx z1, z2 994; VBITS_GE_256-NEXT: fmul z1.h, p0/m, z1.h, z3.h 995; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] 996; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] 997; VBITS_GE_256-NEXT: ret 998; 999; VBITS_GE_512-LABEL: fmul_v32f16: 1000; VBITS_GE_512: // %bb.0: 1001; VBITS_GE_512-NEXT: ptrue p0.h, vl32 1002; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] 1003; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] 1004; VBITS_GE_512-NEXT: fmul z0.h, p0/m, z0.h, z1.h 1005; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] 1006; VBITS_GE_512-NEXT: ret 1007 %op1 = load <32 x half>, ptr %a 1008 %op2 = load <32 x half>, ptr %b 1009 %res = fmul <32 x half> %op1, %op2 1010 store <32 x half> %res, ptr %a 1011 
ret void 1012} 1013 1014define void @fmul_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 { 1015; CHECK-LABEL: fmul_v64f16: 1016; CHECK: // %bb.0: 1017; CHECK-NEXT: ptrue p0.h, vl64 1018; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 1019; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] 1020; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h 1021; CHECK-NEXT: st1h { z0.h }, p0, [x0] 1022; CHECK-NEXT: ret 1023 %op1 = load <64 x half>, ptr %a 1024 %op2 = load <64 x half>, ptr %b 1025 %res = fmul <64 x half> %op1, %op2 1026 store <64 x half> %res, ptr %a 1027 ret void 1028} 1029 1030define void @fmul_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 { 1031; CHECK-LABEL: fmul_v128f16: 1032; CHECK: // %bb.0: 1033; CHECK-NEXT: ptrue p0.h, vl128 1034; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 1035; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] 1036; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h 1037; CHECK-NEXT: st1h { z0.h }, p0, [x0] 1038; CHECK-NEXT: ret 1039 %op1 = load <128 x half>, ptr %a 1040 %op2 = load <128 x half>, ptr %b 1041 %res = fmul <128 x half> %op1, %op2 1042 store <128 x half> %res, ptr %a 1043 ret void 1044} 1045 1046; Don't use SVE for 64-bit vectors. 1047define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 { 1048; CHECK-LABEL: fmul_v2f32: 1049; CHECK: // %bb.0: 1050; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s 1051; CHECK-NEXT: ret 1052 %res = fmul <2 x float> %op1, %op2 1053 ret <2 x float> %res 1054} 1055 1056; Don't use SVE for 128-bit vectors. 
1057define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 { 1058; CHECK-LABEL: fmul_v4f32: 1059; CHECK: // %bb.0: 1060; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s 1061; CHECK-NEXT: ret 1062 %res = fmul <4 x float> %op1, %op2 1063 ret <4 x float> %res 1064} 1065 1066define void @fmul_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { 1067; CHECK-LABEL: fmul_v8f32: 1068; CHECK: // %bb.0: 1069; CHECK-NEXT: ptrue p0.s, vl8 1070; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 1071; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] 1072; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s 1073; CHECK-NEXT: st1w { z0.s }, p0, [x0] 1074; CHECK-NEXT: ret 1075 %op1 = load <8 x float>, ptr %a 1076 %op2 = load <8 x float>, ptr %b 1077 %res = fmul <8 x float> %op1, %op2 1078 store <8 x float> %res, ptr %a 1079 ret void 1080} 1081 1082define void @fmul_v16f32(ptr %a, ptr %b) #0 { 1083; VBITS_GE_256-LABEL: fmul_v16f32: 1084; VBITS_GE_256: // %bb.0: 1085; VBITS_GE_256-NEXT: ptrue p0.s, vl8 1086; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 1087; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] 1088; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] 1089; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] 1090; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] 1091; VBITS_GE_256-NEXT: fmul z0.s, p0/m, z0.s, z1.s 1092; VBITS_GE_256-NEXT: movprfx z1, z2 1093; VBITS_GE_256-NEXT: fmul z1.s, p0/m, z1.s, z3.s 1094; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] 1095; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] 1096; VBITS_GE_256-NEXT: ret 1097; 1098; VBITS_GE_512-LABEL: fmul_v16f32: 1099; VBITS_GE_512: // %bb.0: 1100; VBITS_GE_512-NEXT: ptrue p0.s, vl16 1101; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] 1102; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] 1103; VBITS_GE_512-NEXT: fmul z0.s, p0/m, z0.s, z1.s 1104; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] 1105; VBITS_GE_512-NEXT: ret 1106 %op1 = load <16 x float>, ptr %a 1107 %op2 = load <16 x float>, ptr %b 1108 %res = fmul <16 x float> %op1, 
%op2 1109 store <16 x float> %res, ptr %a 1110 ret void 1111} 1112 1113define void @fmul_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 { 1114; CHECK-LABEL: fmul_v32f32: 1115; CHECK: // %bb.0: 1116; CHECK-NEXT: ptrue p0.s, vl32 1117; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 1118; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] 1119; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s 1120; CHECK-NEXT: st1w { z0.s }, p0, [x0] 1121; CHECK-NEXT: ret 1122 %op1 = load <32 x float>, ptr %a 1123 %op2 = load <32 x float>, ptr %b 1124 %res = fmul <32 x float> %op1, %op2 1125 store <32 x float> %res, ptr %a 1126 ret void 1127} 1128 1129define void @fmul_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 { 1130; CHECK-LABEL: fmul_v64f32: 1131; CHECK: // %bb.0: 1132; CHECK-NEXT: ptrue p0.s, vl64 1133; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 1134; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] 1135; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s 1136; CHECK-NEXT: st1w { z0.s }, p0, [x0] 1137; CHECK-NEXT: ret 1138 %op1 = load <64 x float>, ptr %a 1139 %op2 = load <64 x float>, ptr %b 1140 %res = fmul <64 x float> %op1, %op2 1141 store <64 x float> %res, ptr %a 1142 ret void 1143} 1144 1145; Don't use SVE for 64-bit vectors. 1146define <1 x double> @fmul_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 { 1147; CHECK-LABEL: fmul_v1f64: 1148; CHECK: // %bb.0: 1149; CHECK-NEXT: fmul d0, d0, d1 1150; CHECK-NEXT: ret 1151 %res = fmul <1 x double> %op1, %op2 1152 ret <1 x double> %res 1153} 1154 1155; Don't use SVE for 128-bit vectors. 
1156define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 { 1157; CHECK-LABEL: fmul_v2f64: 1158; CHECK: // %bb.0: 1159; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d 1160; CHECK-NEXT: ret 1161 %res = fmul <2 x double> %op1, %op2 1162 ret <2 x double> %res 1163} 1164 1165define void @fmul_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { 1166; CHECK-LABEL: fmul_v4f64: 1167; CHECK: // %bb.0: 1168; CHECK-NEXT: ptrue p0.d, vl4 1169; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 1170; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] 1171; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d 1172; CHECK-NEXT: st1d { z0.d }, p0, [x0] 1173; CHECK-NEXT: ret 1174 %op1 = load <4 x double>, ptr %a 1175 %op2 = load <4 x double>, ptr %b 1176 %res = fmul <4 x double> %op1, %op2 1177 store <4 x double> %res, ptr %a 1178 ret void 1179} 1180 1181define void @fmul_v8f64(ptr %a, ptr %b) #0 { 1182; VBITS_GE_256-LABEL: fmul_v8f64: 1183; VBITS_GE_256: // %bb.0: 1184; VBITS_GE_256-NEXT: ptrue p0.d, vl4 1185; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 1186; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] 1187; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] 1188; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] 1189; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] 1190; VBITS_GE_256-NEXT: fmul z0.d, p0/m, z0.d, z1.d 1191; VBITS_GE_256-NEXT: movprfx z1, z2 1192; VBITS_GE_256-NEXT: fmul z1.d, p0/m, z1.d, z3.d 1193; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] 1194; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] 1195; VBITS_GE_256-NEXT: ret 1196; 1197; VBITS_GE_512-LABEL: fmul_v8f64: 1198; VBITS_GE_512: // %bb.0: 1199; VBITS_GE_512-NEXT: ptrue p0.d, vl8 1200; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] 1201; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] 1202; VBITS_GE_512-NEXT: fmul z0.d, p0/m, z0.d, z1.d 1203; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] 1204; VBITS_GE_512-NEXT: ret 1205 %op1 = load <8 x double>, ptr %a 1206 %op2 = load <8 x double>, ptr %b 1207 %res = fmul <8 x double> 
%op1, %op2 1208 store <8 x double> %res, ptr %a 1209 ret void 1210} 1211 1212define void @fmul_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { 1213; CHECK-LABEL: fmul_v16f64: 1214; CHECK: // %bb.0: 1215; CHECK-NEXT: ptrue p0.d, vl16 1216; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 1217; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] 1218; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d 1219; CHECK-NEXT: st1d { z0.d }, p0, [x0] 1220; CHECK-NEXT: ret 1221 %op1 = load <16 x double>, ptr %a 1222 %op2 = load <16 x double>, ptr %b 1223 %res = fmul <16 x double> %op1, %op2 1224 store <16 x double> %res, ptr %a 1225 ret void 1226} 1227 1228define void @fmul_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 { 1229; CHECK-LABEL: fmul_v32f64: 1230; CHECK: // %bb.0: 1231; CHECK-NEXT: ptrue p0.d, vl32 1232; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 1233; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] 1234; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d 1235; CHECK-NEXT: st1d { z0.d }, p0, [x0] 1236; CHECK-NEXT: ret 1237 %op1 = load <32 x double>, ptr %a 1238 %op2 = load <32 x double>, ptr %b 1239 %res = fmul <32 x double> %op1, %op2 1240 store <32 x double> %res, ptr %a 1241 ret void 1242} 1243 1244; 1245; FNEG 1246; 1247 1248; Don't use SVE for 64-bit vectors. 1249define <4 x half> @fneg_v4f16(<4 x half> %op) vscale_range(2,0) #0 { 1250; CHECK-LABEL: fneg_v4f16: 1251; CHECK: // %bb.0: 1252; CHECK-NEXT: fneg v0.4h, v0.4h 1253; CHECK-NEXT: ret 1254 %res = fneg <4 x half> %op 1255 ret <4 x half> %res 1256} 1257 1258; Don't use SVE for 128-bit vectors. 
1259define <8 x half> @fneg_v8f16(<8 x half> %op) vscale_range(2,0) #0 { 1260; CHECK-LABEL: fneg_v8f16: 1261; CHECK: // %bb.0: 1262; CHECK-NEXT: fneg v0.8h, v0.8h 1263; CHECK-NEXT: ret 1264 %res = fneg <8 x half> %op 1265 ret <8 x half> %res 1266} 1267 1268define void @fneg_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { 1269; CHECK-LABEL: fneg_v16f16: 1270; CHECK: // %bb.0: 1271; CHECK-NEXT: ptrue p0.h, vl16 1272; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 1273; CHECK-NEXT: fneg z0.h, p0/m, z0.h 1274; CHECK-NEXT: st1h { z0.h }, p0, [x0] 1275; CHECK-NEXT: ret 1276 %op = load <16 x half>, ptr %a 1277 %res = fneg <16 x half> %op 1278 store <16 x half> %res, ptr %a 1279 ret void 1280} 1281 1282define void @fneg_v32f16(ptr %a) #0 { 1283; VBITS_GE_256-LABEL: fneg_v32f16: 1284; VBITS_GE_256: // %bb.0: 1285; VBITS_GE_256-NEXT: ptrue p0.h, vl16 1286; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 1287; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] 1288; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] 1289; VBITS_GE_256-NEXT: fneg z0.h, p0/m, z0.h 1290; VBITS_GE_256-NEXT: fneg z1.h, p0/m, z1.h 1291; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] 1292; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] 1293; VBITS_GE_256-NEXT: ret 1294; 1295; VBITS_GE_512-LABEL: fneg_v32f16: 1296; VBITS_GE_512: // %bb.0: 1297; VBITS_GE_512-NEXT: ptrue p0.h, vl32 1298; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] 1299; VBITS_GE_512-NEXT: fneg z0.h, p0/m, z0.h 1300; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] 1301; VBITS_GE_512-NEXT: ret 1302 %op = load <32 x half>, ptr %a 1303 %res = fneg <32 x half> %op 1304 store <32 x half> %res, ptr %a 1305 ret void 1306} 1307 1308define void @fneg_v64f16(ptr %a) vscale_range(8,0) #0 { 1309; CHECK-LABEL: fneg_v64f16: 1310; CHECK: // %bb.0: 1311; CHECK-NEXT: ptrue p0.h, vl64 1312; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 1313; CHECK-NEXT: fneg z0.h, p0/m, z0.h 1314; CHECK-NEXT: st1h { z0.h }, p0, [x0] 1315; CHECK-NEXT: ret 1316 %op = load <64 x half>, ptr %a 1317 
%res = fneg <64 x half> %op 1318 store <64 x half> %res, ptr %a 1319 ret void 1320} 1321 1322define void @fneg_v128f16(ptr %a) vscale_range(16,0) #0 { 1323; CHECK-LABEL: fneg_v128f16: 1324; CHECK: // %bb.0: 1325; CHECK-NEXT: ptrue p0.h, vl128 1326; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 1327; CHECK-NEXT: fneg z0.h, p0/m, z0.h 1328; CHECK-NEXT: st1h { z0.h }, p0, [x0] 1329; CHECK-NEXT: ret 1330 %op = load <128 x half>, ptr %a 1331 %res = fneg <128 x half> %op 1332 store <128 x half> %res, ptr %a 1333 ret void 1334} 1335 1336; Don't use SVE for 64-bit vectors. 1337define <2 x float> @fneg_v2f32(<2 x float> %op) vscale_range(2,0) #0 { 1338; CHECK-LABEL: fneg_v2f32: 1339; CHECK: // %bb.0: 1340; CHECK-NEXT: fneg v0.2s, v0.2s 1341; CHECK-NEXT: ret 1342 %res = fneg <2 x float> %op 1343 ret <2 x float> %res 1344} 1345 1346; Don't use SVE for 128-bit vectors. 1347define <4 x float> @fneg_v4f32(<4 x float> %op) vscale_range(2,0) #0 { 1348; CHECK-LABEL: fneg_v4f32: 1349; CHECK: // %bb.0: 1350; CHECK-NEXT: fneg v0.4s, v0.4s 1351; CHECK-NEXT: ret 1352 %res = fneg <4 x float> %op 1353 ret <4 x float> %res 1354} 1355 1356define void @fneg_v8f32(ptr %a) vscale_range(2,0) #0 { 1357; CHECK-LABEL: fneg_v8f32: 1358; CHECK: // %bb.0: 1359; CHECK-NEXT: ptrue p0.s, vl8 1360; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 1361; CHECK-NEXT: fneg z0.s, p0/m, z0.s 1362; CHECK-NEXT: st1w { z0.s }, p0, [x0] 1363; CHECK-NEXT: ret 1364 %op = load <8 x float>, ptr %a 1365 %res = fneg <8 x float> %op 1366 store <8 x float> %res, ptr %a 1367 ret void 1368} 1369 1370define void @fneg_v16f32(ptr %a) #0 { 1371; VBITS_GE_256-LABEL: fneg_v16f32: 1372; VBITS_GE_256: // %bb.0: 1373; VBITS_GE_256-NEXT: ptrue p0.s, vl8 1374; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 1375; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] 1376; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] 1377; VBITS_GE_256-NEXT: fneg z0.s, p0/m, z0.s 1378; VBITS_GE_256-NEXT: fneg z1.s, p0/m, z1.s 1379; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, 
x8, lsl #2] 1380; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] 1381; VBITS_GE_256-NEXT: ret 1382; 1383; VBITS_GE_512-LABEL: fneg_v16f32: 1384; VBITS_GE_512: // %bb.0: 1385; VBITS_GE_512-NEXT: ptrue p0.s, vl16 1386; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] 1387; VBITS_GE_512-NEXT: fneg z0.s, p0/m, z0.s 1388; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] 1389; VBITS_GE_512-NEXT: ret 1390 %op = load <16 x float>, ptr %a 1391 %res = fneg <16 x float> %op 1392 store <16 x float> %res, ptr %a 1393 ret void 1394} 1395 1396define void @fneg_v32f32(ptr %a) vscale_range(8,0) #0 { 1397; CHECK-LABEL: fneg_v32f32: 1398; CHECK: // %bb.0: 1399; CHECK-NEXT: ptrue p0.s, vl32 1400; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 1401; CHECK-NEXT: fneg z0.s, p0/m, z0.s 1402; CHECK-NEXT: st1w { z0.s }, p0, [x0] 1403; CHECK-NEXT: ret 1404 %op = load <32 x float>, ptr %a 1405 %res = fneg <32 x float> %op 1406 store <32 x float> %res, ptr %a 1407 ret void 1408} 1409 1410define void @fneg_v64f32(ptr %a) vscale_range(16,0) #0 { 1411; CHECK-LABEL: fneg_v64f32: 1412; CHECK: // %bb.0: 1413; CHECK-NEXT: ptrue p0.s, vl64 1414; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 1415; CHECK-NEXT: fneg z0.s, p0/m, z0.s 1416; CHECK-NEXT: st1w { z0.s }, p0, [x0] 1417; CHECK-NEXT: ret 1418 %op = load <64 x float>, ptr %a 1419 %res = fneg <64 x float> %op 1420 store <64 x float> %res, ptr %a 1421 ret void 1422} 1423 1424; Don't use SVE for 64-bit vectors. 1425define <1 x double> @fneg_v1f64(<1 x double> %op) vscale_range(2,0) #0 { 1426; CHECK-LABEL: fneg_v1f64: 1427; CHECK: // %bb.0: 1428; CHECK-NEXT: fneg d0, d0 1429; CHECK-NEXT: ret 1430 %res = fneg <1 x double> %op 1431 ret <1 x double> %res 1432} 1433 1434; Don't use SVE for 128-bit vectors. 
1435define <2 x double> @fneg_v2f64(<2 x double> %op) vscale_range(2,0) #0 { 1436; CHECK-LABEL: fneg_v2f64: 1437; CHECK: // %bb.0: 1438; CHECK-NEXT: fneg v0.2d, v0.2d 1439; CHECK-NEXT: ret 1440 %res = fneg <2 x double> %op 1441 ret <2 x double> %res 1442} 1443 1444define void @fneg_v4f64(ptr %a) vscale_range(2,0) #0 { 1445; CHECK-LABEL: fneg_v4f64: 1446; CHECK: // %bb.0: 1447; CHECK-NEXT: ptrue p0.d, vl4 1448; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 1449; CHECK-NEXT: fneg z0.d, p0/m, z0.d 1450; CHECK-NEXT: st1d { z0.d }, p0, [x0] 1451; CHECK-NEXT: ret 1452 %op = load <4 x double>, ptr %a 1453 %res = fneg <4 x double> %op 1454 store <4 x double> %res, ptr %a 1455 ret void 1456} 1457 1458define void @fneg_v8f64(ptr %a) #0 { 1459; VBITS_GE_256-LABEL: fneg_v8f64: 1460; VBITS_GE_256: // %bb.0: 1461; VBITS_GE_256-NEXT: ptrue p0.d, vl4 1462; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 1463; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] 1464; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] 1465; VBITS_GE_256-NEXT: fneg z0.d, p0/m, z0.d 1466; VBITS_GE_256-NEXT: fneg z1.d, p0/m, z1.d 1467; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] 1468; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] 1469; VBITS_GE_256-NEXT: ret 1470; 1471; VBITS_GE_512-LABEL: fneg_v8f64: 1472; VBITS_GE_512: // %bb.0: 1473; VBITS_GE_512-NEXT: ptrue p0.d, vl8 1474; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] 1475; VBITS_GE_512-NEXT: fneg z0.d, p0/m, z0.d 1476; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] 1477; VBITS_GE_512-NEXT: ret 1478 %op = load <8 x double>, ptr %a 1479 %res = fneg <8 x double> %op 1480 store <8 x double> %res, ptr %a 1481 ret void 1482} 1483 1484define void @fneg_v16f64(ptr %a) vscale_range(8,0) #0 { 1485; CHECK-LABEL: fneg_v16f64: 1486; CHECK: // %bb.0: 1487; CHECK-NEXT: ptrue p0.d, vl16 1488; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 1489; CHECK-NEXT: fneg z0.d, p0/m, z0.d 1490; CHECK-NEXT: st1d { z0.d }, p0, [x0] 1491; CHECK-NEXT: ret 1492 %op = load <16 x double>, ptr %a 1493 %res 
= fneg <16 x double> %op 1494 store <16 x double> %res, ptr %a 1495 ret void 1496} 1497 1498define void @fneg_v32f64(ptr %a) vscale_range(16,0) #0 { 1499; CHECK-LABEL: fneg_v32f64: 1500; CHECK: // %bb.0: 1501; CHECK-NEXT: ptrue p0.d, vl32 1502; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 1503; CHECK-NEXT: fneg z0.d, p0/m, z0.d 1504; CHECK-NEXT: st1d { z0.d }, p0, [x0] 1505; CHECK-NEXT: ret 1506 %op = load <32 x double>, ptr %a 1507 %res = fneg <32 x double> %op 1508 store <32 x double> %res, ptr %a 1509 ret void 1510} 1511 1512; 1513; FSQRT 1514; 1515 1516; Don't use SVE for 64-bit vectors. 1517define <4 x half> @fsqrt_v4f16(<4 x half> %op) vscale_range(2,0) #0 { 1518; CHECK-LABEL: fsqrt_v4f16: 1519; CHECK: // %bb.0: 1520; CHECK-NEXT: fsqrt v0.4h, v0.4h 1521; CHECK-NEXT: ret 1522 %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %op) 1523 ret <4 x half> %res 1524} 1525 1526; Don't use SVE for 128-bit vectors. 1527define <8 x half> @fsqrt_v8f16(<8 x half> %op) vscale_range(2,0) #0 { 1528; CHECK-LABEL: fsqrt_v8f16: 1529; CHECK: // %bb.0: 1530; CHECK-NEXT: fsqrt v0.8h, v0.8h 1531; CHECK-NEXT: ret 1532 %res = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %op) 1533 ret <8 x half> %res 1534} 1535 1536define void @fsqrt_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { 1537; CHECK-LABEL: fsqrt_v16f16: 1538; CHECK: // %bb.0: 1539; CHECK-NEXT: ptrue p0.h, vl16 1540; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 1541; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h 1542; CHECK-NEXT: st1h { z0.h }, p0, [x0] 1543; CHECK-NEXT: ret 1544 %op = load <16 x half>, ptr %a 1545 %res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op) 1546 store <16 x half> %res, ptr %a 1547 ret void 1548} 1549 1550define void @fsqrt_v32f16(ptr %a) #0 { 1551; VBITS_GE_256-LABEL: fsqrt_v32f16: 1552; VBITS_GE_256: // %bb.0: 1553; VBITS_GE_256-NEXT: ptrue p0.h, vl16 1554; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 1555; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] 1556; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] 1557; 
VBITS_GE_256-NEXT: fsqrt z0.h, p0/m, z0.h 1558; VBITS_GE_256-NEXT: fsqrt z1.h, p0/m, z1.h 1559; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] 1560; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] 1561; VBITS_GE_256-NEXT: ret 1562; 1563; VBITS_GE_512-LABEL: fsqrt_v32f16: 1564; VBITS_GE_512: // %bb.0: 1565; VBITS_GE_512-NEXT: ptrue p0.h, vl32 1566; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] 1567; VBITS_GE_512-NEXT: fsqrt z0.h, p0/m, z0.h 1568; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] 1569; VBITS_GE_512-NEXT: ret 1570 %op = load <32 x half>, ptr %a 1571 %res = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %op) 1572 store <32 x half> %res, ptr %a 1573 ret void 1574} 1575 1576define void @fsqrt_v64f16(ptr %a) vscale_range(8,0) #0 { 1577; CHECK-LABEL: fsqrt_v64f16: 1578; CHECK: // %bb.0: 1579; CHECK-NEXT: ptrue p0.h, vl64 1580; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 1581; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h 1582; CHECK-NEXT: st1h { z0.h }, p0, [x0] 1583; CHECK-NEXT: ret 1584 %op = load <64 x half>, ptr %a 1585 %res = call <64 x half> @llvm.sqrt.v64f16(<64 x half> %op) 1586 store <64 x half> %res, ptr %a 1587 ret void 1588} 1589 1590define void @fsqrt_v128f16(ptr %a) vscale_range(16,0) #0 { 1591; CHECK-LABEL: fsqrt_v128f16: 1592; CHECK: // %bb.0: 1593; CHECK-NEXT: ptrue p0.h, vl128 1594; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 1595; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h 1596; CHECK-NEXT: st1h { z0.h }, p0, [x0] 1597; CHECK-NEXT: ret 1598 %op = load <128 x half>, ptr %a 1599 %res = call <128 x half> @llvm.sqrt.v128f16(<128 x half> %op) 1600 store <128 x half> %res, ptr %a 1601 ret void 1602} 1603 1604; Don't use SVE for 64-bit vectors. 1605define <2 x float> @fsqrt_v2f32(<2 x float> %op) vscale_range(2,0) #0 { 1606; CHECK-LABEL: fsqrt_v2f32: 1607; CHECK: // %bb.0: 1608; CHECK-NEXT: fsqrt v0.2s, v0.2s 1609; CHECK-NEXT: ret 1610 %res = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %op) 1611 ret <2 x float> %res 1612} 1613 1614; Don't use SVE for 128-bit vectors. 
1615define <4 x float> @fsqrt_v4f32(<4 x float> %op) vscale_range(2,0) #0 { 1616; CHECK-LABEL: fsqrt_v4f32: 1617; CHECK: // %bb.0: 1618; CHECK-NEXT: fsqrt v0.4s, v0.4s 1619; CHECK-NEXT: ret 1620 %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %op) 1621 ret <4 x float> %res 1622} 1623 1624define void @fsqrt_v8f32(ptr %a) vscale_range(2,0) #0 { 1625; CHECK-LABEL: fsqrt_v8f32: 1626; CHECK: // %bb.0: 1627; CHECK-NEXT: ptrue p0.s, vl8 1628; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 1629; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s 1630; CHECK-NEXT: st1w { z0.s }, p0, [x0] 1631; CHECK-NEXT: ret 1632 %op = load <8 x float>, ptr %a 1633 %res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op) 1634 store <8 x float> %res, ptr %a 1635 ret void 1636} 1637 1638define void @fsqrt_v16f32(ptr %a) #0 { 1639; VBITS_GE_256-LABEL: fsqrt_v16f32: 1640; VBITS_GE_256: // %bb.0: 1641; VBITS_GE_256-NEXT: ptrue p0.s, vl8 1642; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 1643; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] 1644; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] 1645; VBITS_GE_256-NEXT: fsqrt z0.s, p0/m, z0.s 1646; VBITS_GE_256-NEXT: fsqrt z1.s, p0/m, z1.s 1647; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] 1648; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] 1649; VBITS_GE_256-NEXT: ret 1650; 1651; VBITS_GE_512-LABEL: fsqrt_v16f32: 1652; VBITS_GE_512: // %bb.0: 1653; VBITS_GE_512-NEXT: ptrue p0.s, vl16 1654; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] 1655; VBITS_GE_512-NEXT: fsqrt z0.s, p0/m, z0.s 1656; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] 1657; VBITS_GE_512-NEXT: ret 1658 %op = load <16 x float>, ptr %a 1659 %res = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %op) 1660 store <16 x float> %res, ptr %a 1661 ret void 1662} 1663 1664define void @fsqrt_v32f32(ptr %a) vscale_range(8,0) #0 { 1665; CHECK-LABEL: fsqrt_v32f32: 1666; CHECK: // %bb.0: 1667; CHECK-NEXT: ptrue p0.s, vl32 1668; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 1669; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s 1670; 
CHECK-NEXT: st1w { z0.s }, p0, [x0] 1671; CHECK-NEXT: ret 1672 %op = load <32 x float>, ptr %a 1673 %res = call <32 x float> @llvm.sqrt.v32f32(<32 x float> %op) 1674 store <32 x float> %res, ptr %a 1675 ret void 1676} 1677 1678define void @fsqrt_v64f32(ptr %a) vscale_range(16,0) #0 { 1679; CHECK-LABEL: fsqrt_v64f32: 1680; CHECK: // %bb.0: 1681; CHECK-NEXT: ptrue p0.s, vl64 1682; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 1683; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s 1684; CHECK-NEXT: st1w { z0.s }, p0, [x0] 1685; CHECK-NEXT: ret 1686 %op = load <64 x float>, ptr %a 1687 %res = call <64 x float> @llvm.sqrt.v64f32(<64 x float> %op) 1688 store <64 x float> %res, ptr %a 1689 ret void 1690} 1691 1692; Don't use SVE for 64-bit vectors. 1693define <1 x double> @fsqrt_v1f64(<1 x double> %op) vscale_range(2,0) #0 { 1694; CHECK-LABEL: fsqrt_v1f64: 1695; CHECK: // %bb.0: 1696; CHECK-NEXT: fsqrt d0, d0 1697; CHECK-NEXT: ret 1698 %res = call <1 x double> @llvm.sqrt.v1f64(<1 x double> %op) 1699 ret <1 x double> %res 1700} 1701 1702; Don't use SVE for 128-bit vectors. 
1703define <2 x double> @fsqrt_v2f64(<2 x double> %op) vscale_range(2,0) #0 { 1704; CHECK-LABEL: fsqrt_v2f64: 1705; CHECK: // %bb.0: 1706; CHECK-NEXT: fsqrt v0.2d, v0.2d 1707; CHECK-NEXT: ret 1708 %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %op) 1709 ret <2 x double> %res 1710} 1711 1712define void @fsqrt_v4f64(ptr %a) vscale_range(2,0) #0 { 1713; CHECK-LABEL: fsqrt_v4f64: 1714; CHECK: // %bb.0: 1715; CHECK-NEXT: ptrue p0.d, vl4 1716; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 1717; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d 1718; CHECK-NEXT: st1d { z0.d }, p0, [x0] 1719; CHECK-NEXT: ret 1720 %op = load <4 x double>, ptr %a 1721 %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op) 1722 store <4 x double> %res, ptr %a 1723 ret void 1724} 1725 1726define void @fsqrt_v8f64(ptr %a) #0 { 1727; VBITS_GE_256-LABEL: fsqrt_v8f64: 1728; VBITS_GE_256: // %bb.0: 1729; VBITS_GE_256-NEXT: ptrue p0.d, vl4 1730; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 1731; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] 1732; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] 1733; VBITS_GE_256-NEXT: fsqrt z0.d, p0/m, z0.d 1734; VBITS_GE_256-NEXT: fsqrt z1.d, p0/m, z1.d 1735; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] 1736; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] 1737; VBITS_GE_256-NEXT: ret 1738; 1739; VBITS_GE_512-LABEL: fsqrt_v8f64: 1740; VBITS_GE_512: // %bb.0: 1741; VBITS_GE_512-NEXT: ptrue p0.d, vl8 1742; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] 1743; VBITS_GE_512-NEXT: fsqrt z0.d, p0/m, z0.d 1744; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] 1745; VBITS_GE_512-NEXT: ret 1746 %op = load <8 x double>, ptr %a 1747 %res = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %op) 1748 store <8 x double> %res, ptr %a 1749 ret void 1750} 1751 1752define void @fsqrt_v16f64(ptr %a) vscale_range(8,0) #0 { 1753; CHECK-LABEL: fsqrt_v16f64: 1754; CHECK: // %bb.0: 1755; CHECK-NEXT: ptrue p0.d, vl16 1756; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 1757; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d 
1758; CHECK-NEXT: st1d { z0.d }, p0, [x0] 1759; CHECK-NEXT: ret 1760 %op = load <16 x double>, ptr %a 1761 %res = call <16 x double> @llvm.sqrt.v16f64(<16 x double> %op) 1762 store <16 x double> %res, ptr %a 1763 ret void 1764} 1765 1766define void @fsqrt_v32f64(ptr %a) vscale_range(16,0) #0 { 1767; CHECK-LABEL: fsqrt_v32f64: 1768; CHECK: // %bb.0: 1769; CHECK-NEXT: ptrue p0.d, vl32 1770; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 1771; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d 1772; CHECK-NEXT: st1d { z0.d }, p0, [x0] 1773; CHECK-NEXT: ret 1774 %op = load <32 x double>, ptr %a 1775 %res = call <32 x double> @llvm.sqrt.v32f64(<32 x double> %op) 1776 store <32 x double> %res, ptr %a 1777 ret void 1778} 1779 1780; 1781; FSUB 1782; 1783 1784; Don't use SVE for 64-bit vectors. 1785define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 { 1786; CHECK-LABEL: fsub_v4f16: 1787; CHECK: // %bb.0: 1788; CHECK-NEXT: fsub v0.4h, v0.4h, v1.4h 1789; CHECK-NEXT: ret 1790 %res = fsub <4 x half> %op1, %op2 1791 ret <4 x half> %res 1792} 1793 1794; Don't use SVE for 128-bit vectors. 
1795define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 { 1796; CHECK-LABEL: fsub_v8f16: 1797; CHECK: // %bb.0: 1798; CHECK-NEXT: fsub v0.8h, v0.8h, v1.8h 1799; CHECK-NEXT: ret 1800 %res = fsub <8 x half> %op1, %op2 1801 ret <8 x half> %res 1802} 1803 1804define void @fsub_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { 1805; CHECK-LABEL: fsub_v16f16: 1806; CHECK: // %bb.0: 1807; CHECK-NEXT: ptrue p0.h, vl16 1808; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 1809; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] 1810; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h 1811; CHECK-NEXT: st1h { z0.h }, p0, [x0] 1812; CHECK-NEXT: ret 1813 %op1 = load <16 x half>, ptr %a 1814 %op2 = load <16 x half>, ptr %b 1815 %res = fsub <16 x half> %op1, %op2 1816 store <16 x half> %res, ptr %a 1817 ret void 1818} 1819 1820define void @fsub_v32f16(ptr %a, ptr %b) #0 { 1821; VBITS_GE_256-LABEL: fsub_v32f16: 1822; VBITS_GE_256: // %bb.0: 1823; VBITS_GE_256-NEXT: ptrue p0.h, vl16 1824; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 1825; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] 1826; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] 1827; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] 1828; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] 1829; VBITS_GE_256-NEXT: fsub z0.h, p0/m, z0.h, z1.h 1830; VBITS_GE_256-NEXT: movprfx z1, z2 1831; VBITS_GE_256-NEXT: fsub z1.h, p0/m, z1.h, z3.h 1832; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] 1833; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] 1834; VBITS_GE_256-NEXT: ret 1835; 1836; VBITS_GE_512-LABEL: fsub_v32f16: 1837; VBITS_GE_512: // %bb.0: 1838; VBITS_GE_512-NEXT: ptrue p0.h, vl32 1839; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] 1840; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] 1841; VBITS_GE_512-NEXT: fsub z0.h, p0/m, z0.h, z1.h 1842; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] 1843; VBITS_GE_512-NEXT: ret 1844 %op1 = load <32 x half>, ptr %a 1845 %op2 = load <32 x half>, ptr %b 1846 %res = fsub <32 x half> %op1, %op2 
1847 store <32 x half> %res, ptr %a 1848 ret void 1849} 1850 1851define void @fsub_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 { 1852; CHECK-LABEL: fsub_v64f16: 1853; CHECK: // %bb.0: 1854; CHECK-NEXT: ptrue p0.h, vl64 1855; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 1856; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] 1857; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h 1858; CHECK-NEXT: st1h { z0.h }, p0, [x0] 1859; CHECK-NEXT: ret 1860 %op1 = load <64 x half>, ptr %a 1861 %op2 = load <64 x half>, ptr %b 1862 %res = fsub <64 x half> %op1, %op2 1863 store <64 x half> %res, ptr %a 1864 ret void 1865} 1866 1867define void @fsub_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 { 1868; CHECK-LABEL: fsub_v128f16: 1869; CHECK: // %bb.0: 1870; CHECK-NEXT: ptrue p0.h, vl128 1871; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 1872; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] 1873; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h 1874; CHECK-NEXT: st1h { z0.h }, p0, [x0] 1875; CHECK-NEXT: ret 1876 %op1 = load <128 x half>, ptr %a 1877 %op2 = load <128 x half>, ptr %b 1878 %res = fsub <128 x half> %op1, %op2 1879 store <128 x half> %res, ptr %a 1880 ret void 1881} 1882 1883; Don't use SVE for 64-bit vectors. 1884define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 { 1885; CHECK-LABEL: fsub_v2f32: 1886; CHECK: // %bb.0: 1887; CHECK-NEXT: fsub v0.2s, v0.2s, v1.2s 1888; CHECK-NEXT: ret 1889 %res = fsub <2 x float> %op1, %op2 1890 ret <2 x float> %res 1891} 1892 1893; Don't use SVE for 128-bit vectors. 
define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fsub v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = fsub <4 x float> %op1, %op2
  ret <4 x float> %res
}

define void @fsub_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x float>, ptr %a
  %op2 = load <8 x float>, ptr %b
  %res = fsub <8 x float> %op1, %op2
  store <8 x float> %res, ptr %a
  ret void
}

; 16 x float needs 512 bits: split into two 256-bit halves for VBITS_GE_256,
; single SVE op for VBITS_GE_512.
define void @fsub_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fsub_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    fsub z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fsub_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x float>, ptr %a
  %op2 = load <16 x float>, ptr %b
  %res = fsub <16 x float> %op1, %op2
  store <16 x float> %res, ptr %a
  ret void
}

define void @fsub_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fsub_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x float>, ptr %a
  %op2 = load <32 x float>, ptr %b
  %res = fsub <32 x float> %op1, %op2
  store <32 x float> %res, ptr %a
  ret void
}

define void @fsub_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fsub_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fsub z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x float>, ptr %a
  %op2 = load <64 x float>, ptr %b
  %res = fsub <64 x float> %op1, %op2
  store <64 x float> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @fsub_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fsub d0, d0, d1
; CHECK-NEXT:    ret
  %res = fsub <1 x double> %op1, %op2
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fsub v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = fsub <2 x double> %op1, %op2
  ret <2 x double> %res
}

define void @fsub_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fsub_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x double>, ptr %a
  %op2 = load <4 x double>, ptr %b
  %res = fsub <4 x double> %op1, %op2
  store <4 x double> %res, ptr %a
  ret void
}

; 8 x double needs 512 bits: split into two 256-bit halves for VBITS_GE_256,
; single SVE op for VBITS_GE_512.
define void @fsub_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fsub_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    fsub z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fsub_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x double>, ptr %a
  %op2 = load <8 x double>, ptr %b
  %res = fsub <8 x double> %op1, %op2
  store <8 x double> %res, ptr %a
  ret void
}

define void @fsub_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fsub_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x double>, ptr %a
  %op2 = load <16 x double>, ptr %b
  %res = fsub <16 x double> %op1, %op2
  store <16 x double> %res, ptr %a
  ret void
}

define void @fsub_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fsub_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fsub z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x double>, ptr %a
  %op2 = load <32 x double>, ptr %b
  %res = fsub <32 x double> %op1, %op2
  store <32 x double> %res, ptr %a
  ret void
}

;
; FABS
;

; Don't use SVE for 64-bit vectors.
define <4 x half> @fabs_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fabs v0.4h, v0.4h
; CHECK-NEXT:    ret
  %res = call <4 x half> @llvm.fabs.v4f16(<4 x half> %op)
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fabs_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fabs v0.8h, v0.8h
; CHECK-NEXT:    ret
  %res = call <8 x half> @llvm.fabs.v8f16(<8 x half> %op)
  ret <8 x half> %res
}

define void @fabs_v16f16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <16 x half>, ptr %a
  %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op)
  store <16 x half> %res, ptr %a
  ret void
}

; 32 x half does not fit in a 256-bit register: expect two in-place fabs ops
; (no movprfx needed for a unary op) for VBITS_GE_256, a single SVE op for
; VBITS_GE_512.
define void @fabs_v32f16(ptr %a) #0 {
; VBITS_GE_256-LABEL: fabs_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fabs z0.h, p0/m, z0.h
; VBITS_GE_256-NEXT:    fabs z1.h, p0/m, z1.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fabs_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fabs z0.h, p0/m, z0.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x half>, ptr %a
  %res = call <32 x half> @llvm.fabs.v32f16(<32 x half> %op)
  store <32 x half> %res, ptr %a
  ret void
}

define void @fabs_v64f16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fabs_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <64 x half>, ptr %a
  %res = call <64 x half> @llvm.fabs.v64f16(<64 x half> %op)
  store <64 x half> %res, ptr %a
  ret void
}

define void @fabs_v128f16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fabs_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <128 x half>, ptr %a
  %res = call <128 x half> @llvm.fabs.v128f16(<128 x half> %op)
  store <128 x half> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @fabs_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fabs v0.2s, v0.2s
; CHECK-NEXT:    ret
  %res = call <2 x float> @llvm.fabs.v2f32(<2 x float> %op)
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fabs_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fabs v0.4s, v0.4s
; CHECK-NEXT:    ret
  %res = call <4 x float> @llvm.fabs.v4f32(<4 x float> %op)
  ret <4 x float> %res
}

define void @fabs_v8f32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <8 x float>, ptr %a
  %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op)
  store <8 x float> %res, ptr %a
  ret void
}

; 16 x float needs 512 bits: two in-place fabs ops for VBITS_GE_256, a single
; SVE op for VBITS_GE_512.
define void @fabs_v16f32(ptr %a) #0 {
; VBITS_GE_256-LABEL: fabs_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fabs z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT:    fabs z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fabs_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fabs z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x float>, ptr %a
  %res = call <16 x float> @llvm.fabs.v16f32(<16 x float> %op)
  store <16 x float> %res, ptr %a
  ret void
}

define void @fabs_v32f32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fabs_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x float>, ptr %a
  %res = call <32 x float> @llvm.fabs.v32f32(<32 x float> %op)
  store <32 x float> %res, ptr %a
  ret void
}

define void @fabs_v64f32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fabs_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <64 x float>, ptr %a
  %res = call <64 x float> @llvm.fabs.v64f32(<64 x float> %op)
  store <64 x float> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @fabs_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fabs d0, d0
; CHECK-NEXT:    ret
  %res = call <1 x double> @llvm.fabs.v1f64(<1 x double> %op)
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fabs_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fabs v0.2d, v0.2d
; CHECK-NEXT:    ret
  %res = call <2 x double> @llvm.fabs.v2f64(<2 x double> %op)
  ret <2 x double> %res
}

define void @fabs_v4f64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fabs_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <4 x double>, ptr %a
  %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op)
  store <4 x double> %res, ptr %a
  ret void
}

; 8 x double needs 512 bits: two in-place fabs ops for VBITS_GE_256, a single
; SVE op for VBITS_GE_512.
define void @fabs_v8f64(ptr %a) #0 {
; VBITS_GE_256-LABEL: fabs_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fabs z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT:    fabs z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fabs_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fabs z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x double>, ptr %a
  %res = call <8 x double> @llvm.fabs.v8f64(<8 x double> %op)
  store <8 x double> %res, ptr %a
  ret void
}

define void @fabs_v16f64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fabs_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <16 x double>, ptr %a
  %res = call <16 x double> @llvm.fabs.v16f64(<16 x double> %op)
  store <16 x double> %res, ptr %a
  ret void
}

define void @fabs_v32f64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fabs_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fabs z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op = load <32 x double>, ptr %a
  %res = call <32 x double> @llvm.fabs.v32f64(<32 x double> %op)
  store <32 x double> %res, ptr %a
  ret void
}

attributes #0 = { "target-features"="+sve" }

; Declarations for the llvm.fma, llvm.sqrt and llvm.fabs intrinsics referenced
; by tests in this file.
declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)
declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>)
declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)
declare <64 x half> @llvm.fma.v64f16(<64 x half>, <64 x half>, <64 x half>)
declare <128 x half> @llvm.fma.v128f16(<128 x half>, <128 x half>, <128 x half>)
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)
declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>)
declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)
declare <64 x float> @llvm.fma.v64f32(<64 x float>, <64 x float>, <64 x float>)
declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>)
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
declare <16 x double> @llvm.fma.v16f64(<16 x double>, <16 x double>, <16 x double>)
declare <32 x double> @llvm.fma.v32f64(<32 x double>, <32 x double>, <32 x double>)

declare <4 x half> @llvm.sqrt.v4f16(<4 x half>)
declare <8 x half> @llvm.sqrt.v8f16(<8 x half>)
declare <16 x half> @llvm.sqrt.v16f16(<16 x half>)
declare <32 x half> @llvm.sqrt.v32f16(<32 x half>)
declare <64 x half> @llvm.sqrt.v64f16(<64 x half>)
declare <128 x half> @llvm.sqrt.v128f16(<128 x half>)
declare <2 x float> @llvm.sqrt.v2f32(<2 x float>)
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
declare <32 x float> @llvm.sqrt.v32f32(<32 x float>)
declare <64 x float> @llvm.sqrt.v64f32(<64 x float>)
declare <1 x double> @llvm.sqrt.v1f64(<1 x double>)
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
declare <16 x double> @llvm.sqrt.v16f64(<16 x double>)
declare <32 x double> @llvm.sqrt.v32f64(<32 x double>)

declare <4 x half> @llvm.fabs.v4f16(<4 x half>)
declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
declare <16 x half> @llvm.fabs.v16f16(<16 x half>)
declare <32 x half> @llvm.fabs.v32f16(<32 x half>)
declare <64 x half> @llvm.fabs.v64f16(<64 x half>)
declare <128 x half> @llvm.fabs.v128f16(<128 x half>)
declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare <8 x float> @llvm.fabs.v8f32(<8 x float>)
declare <16 x float> @llvm.fabs.v16f32(<16 x float>)
declare <32 x float> @llvm.fabs.v32f32(<32 x float>)
declare <64 x float> @llvm.fabs.v64f32(<64 x float>)
declare <1 x double> @llvm.fabs.v1f64(<1 x double>)
declare <2 x double> @llvm.fabs.v2f64(<2 x double>)
declare <4 x double> @llvm.fabs.v4f64(<4 x double>)
declare <8 x double> @llvm.fabs.v8f64(<8 x double>)
declare <16 x double> @llvm.fabs.v16f64(<16 x double>)
declare <32 x double> @llvm.fabs.v32f64(<32 x double>)