; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; FADDA
;
; In-order fadd reductions: the reduce calls in this section carry no 'fast'
; flag, so they are lowered to the sequential SVE FADDA instruction (see the
; CHECK lines below) rather than a tree reduction.

; No single instruction NEON support. Use SVE.
define half @fadda_v4f16(half %start, <4 x half> %a) vscale_range(1,0) #0 {
; CHECK-LABEL: fadda_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    fadda h0, p0, h0, z1.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
  ret half %res
}

; No single instruction NEON support. Use SVE.
define half @fadda_v8f16(half %start, <8 x half> %a) vscale_range(1,0) #0 {
; CHECK-LABEL: fadda_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    fadda h0, p0, h0, z1.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
  ret half %res
}

define half @fadda_v16f16(half %start, ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fadda_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
; CHECK-NEXT:    fadda h0, p0, h0, z1.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %op = load <16 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
  ret half %res
}

; No vscale_range hint here: with only a 256-bit guarantee the 512-bit vector
; is reduced as two halves (two ld1h + two fadda in the VBITS_GE_256 lines).
define half @fadda_v32f16(half %start, ptr %a) #0 {
; VBITS_GE_256-LABEL: fadda_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 def $z0
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fadda h0, p0, h0, z1.h
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    fadda h0, p0, h0, z1.h
; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fadda_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 def $z0
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fadda h0, p0, h0, z1.h
; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
  ret half %res
}

define half @fadda_v64f16(half %start, ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fadda_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
; CHECK-NEXT:    fadda h0, p0, h0, z1.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %op = load <64 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
  ret half %res
}

define half @fadda_v128f16(half %start, ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fadda_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
; CHECK-NEXT:    fadda h0, p0, h0, z1.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %op = load <128 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
  ret half %res
}

; No single instruction NEON support. Use SVE.
define float @fadda_v2f32(float %start, <2 x float> %a) vscale_range(1,0) #0 {
; CHECK-LABEL: fadda_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    fadda s0, p0, s0, z1.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
  ret float %res
}

; No single instruction NEON support. Use SVE.
define float @fadda_v4f32(float %start, <4 x float> %a) vscale_range(1,0) #0 {
; CHECK-LABEL: fadda_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    fadda s0, p0, s0, z1.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
  ret float %res
}

define float @fadda_v8f32(float %start, ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fadda_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
; CHECK-NEXT:    fadda s0, p0, s0, z1.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %op = load <8 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
  ret float %res
}

; No vscale_range hint: the 256-bit run splits the reduction into two
; sequential halves (two ld1w + two fadda).
define float @fadda_v16f32(float %start, ptr %a) #0 {
; VBITS_GE_256-LABEL: fadda_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 def $z0
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fadda s0, p0, s0, z1.s
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    fadda s0, p0, s0, z1.s
; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fadda_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 def $z0
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fadda s0, p0, s0, z1.s
; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
  ret float %res
}

define float @fadda_v32f32(float %start, ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fadda_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
; CHECK-NEXT:    fadda s0, p0, s0, z1.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %op = load <32 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
  ret float %res
}

define float @fadda_v64f32(float %start, ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fadda_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
; CHECK-NEXT:    fadda s0, p0, s0, z1.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %op = load <64 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
  ret float %res
}

; No single instruction NEON support. Use SVE.
; NOTE(review): the CHECK lines below show this single-element case folds to
; one scalar fadd, not an SVE FADDA — the comment above may be stale.
define double @fadda_v1f64(double %start, <1 x double> %a) vscale_range(1,0) #0 {
; CHECK-LABEL: fadda_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd d0, d0, d1
; CHECK-NEXT:    ret
  %res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
  ret double %res
}

; No single instruction NEON support. Use SVE.
define double @fadda_v2f64(double %start, <2 x double> %a) vscale_range(1,0) #0 {
; CHECK-LABEL: fadda_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    fadda d0, p0, d0, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
  ret double %res
}

define double @fadda_v4f64(double %start, ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fadda_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
; CHECK-NEXT:    fadda d0, p0, d0, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %op = load <4 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
  ret double %res
}

; No vscale_range hint: the 256-bit run splits the reduction into two
; sequential halves (two ld1d + two fadda).
define double @fadda_v8f64(double %start, ptr %a) #0 {
; VBITS_GE_256-LABEL: fadda_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fadda d0, p0, d0, z1.d
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    fadda d0, p0, d0, z1.d
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fadda_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fadda d0, p0, d0, z1.d
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
  ret double %res
}

define double @fadda_v16f64(double %start, ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fadda_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
; CHECK-NEXT:    fadda d0, p0, d0, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %op = load <16 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
  ret double %res
}

define double @fadda_v32f64(double %start, ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fadda_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
; CHECK-NEXT:    fadda d0, p0, d0, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %op = load <32 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
  ret double %res
}

;
; FADDV
;
; Unordered fadd reductions: the reduce calls below are marked 'fast', so the
; vector part lowers to SVE FADDV (or a NEON pairwise add) followed by a
; scalar fadd with the %start operand.

; No single instruction NEON support for 4 element vectors.
define half @faddv_v4f16(half %start, <4 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    faddv h1, p0, z1.h
; CHECK-NEXT:    fadd h0, h0, h1
; CHECK-NEXT:    ret
  %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
  ret half %res
}

; No single instruction NEON support for 8 element vectors.
define half @faddv_v8f16(half %start, <8 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    faddv h1, p0, z1.h
; CHECK-NEXT:    fadd h0, h0, h1
; CHECK-NEXT:    ret
  %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
  ret half %res
}

define half @faddv_v16f16(half %start, ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
; CHECK-NEXT:    faddv h1, p0, z1.h
; CHECK-NEXT:    fadd h0, h0, h1
; CHECK-NEXT:    ret
  %op = load <16 x half>, ptr %a
  %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
  ret half %res
}

; No vscale_range hint: with 'fast' math the 256-bit run may combine the two
; halves with a vector fadd before a single faddv.
define half @faddv_v32f16(half %start, ptr %a) #0 {
; VBITS_GE_256-LABEL: faddv_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fadd z1.h, p0/m, z1.h, z2.h
; VBITS_GE_256-NEXT:    faddv h1, p0, z1.h
; VBITS_GE_256-NEXT:    fadd h0, h0, h1
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: faddv_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    faddv h1, p0, z1.h
; VBITS_GE_512-NEXT:    fadd h0, h0, h1
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x half>, ptr %a
  %res = call fast half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
  ret half %res
}

define half @faddv_v64f16(half %start, ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: faddv_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
; CHECK-NEXT:    faddv h1, p0, z1.h
; CHECK-NEXT:    fadd h0, h0, h1
; CHECK-NEXT:    ret
  %op = load <64 x half>, ptr %a
  %res = call fast half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
  ret half %res
}

define half @faddv_v128f16(half %start, ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: faddv_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
; CHECK-NEXT:    faddv h1, p0, z1.h
; CHECK-NEXT:    fadd h0, h0, h1
; CHECK-NEXT:    ret
  %op = load <128 x half>, ptr %a
  %res = call fast half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
  ret half %res
}

; Don't use SVE for 2 element vectors.
define float @faddv_v2f32(float %start, <2 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    faddp s1, v1.2s
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ret
  %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
  ret float %res
}

; No single instruction NEON support for 4 element vectors.
define float @faddv_v4f32(float %start, <4 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    faddv s1, p0, z1.s
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ret
  %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
  ret float %res
}

define float @faddv_v8f32(float %start, ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
; CHECK-NEXT:    faddv s1, p0, z1.s
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ret
  %op = load <8 x float>, ptr %a
  %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
  ret float %res
}

; No vscale_range hint: with 'fast' math the 256-bit run combines the two
; halves with a vector fadd before a single faddv.
define float @faddv_v16f32(float %start, ptr %a) #0 {
; VBITS_GE_256-LABEL: faddv_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fadd z1.s, p0/m, z1.s, z2.s
; VBITS_GE_256-NEXT:    faddv s1, p0, z1.s
; VBITS_GE_256-NEXT:    fadd s0, s0, s1
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: faddv_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    faddv s1, p0, z1.s
; VBITS_GE_512-NEXT:    fadd s0, s0, s1
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x float>, ptr %a
  %res = call fast float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
  ret float %res
}

define float @faddv_v32f32(float %start, ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: faddv_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
; CHECK-NEXT:    faddv s1, p0, z1.s
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ret
  %op = load <32 x float>, ptr %a
  %res = call fast float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
  ret float %res
}

define float @faddv_v64f32(float %start, ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: faddv_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0]
; CHECK-NEXT:    faddv s1, p0, z1.s
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ret
  %op = load <64 x float>, ptr %a
  %res = call fast float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
  ret float %res
}

; Don't use SVE for 1 element vectors.
define double @faddv_v1f64(double %start, <1 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd d0, d0, d1
; CHECK-NEXT:    ret
  %res = call fast double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
  ret double %res
}

; Don't use SVE for 2 element vectors.
define double @faddv_v2f64(double %start, <2 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    faddp d1, v1.2d
; CHECK-NEXT:    fadd d0, d0, d1
; CHECK-NEXT:    ret
  %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
  ret double %res
}

define double @faddv_v4f64(double %start, ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: faddv_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
; CHECK-NEXT:    faddv d1, p0, z1.d
; CHECK-NEXT:    fadd d0, d0, d1
; CHECK-NEXT:    ret
  %op = load <4 x double>, ptr %a
  %res = call fast double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
  ret double %res
}

; No vscale_range hint: with 'fast' math the 256-bit run combines the two
; halves with a vector fadd before a single faddv.
define double @faddv_v8f64(double %start, ptr %a) #0 {
; VBITS_GE_256-LABEL: faddv_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fadd z1.d, p0/m, z1.d, z2.d
; VBITS_GE_256-NEXT:    faddv d1, p0, z1.d
; VBITS_GE_256-NEXT:    fadd d0, d0, d1
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: faddv_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    faddv d1, p0, z1.d
; VBITS_GE_512-NEXT:    fadd d0, d0, d1
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x double>, ptr %a
  %res = call fast double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
  ret double %res
}

define double @faddv_v16f64(double %start, ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: faddv_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
; CHECK-NEXT:    faddv d1, p0, z1.d
; CHECK-NEXT:    fadd d0, d0, d1
; CHECK-NEXT:    ret
  %op = load <16 x double>, ptr %a
  %res = call fast double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
  ret double %res
}

define double @faddv_v32f64(double %start, ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: faddv_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
; CHECK-NEXT:    faddv d1, p0, z1.d
; CHECK-NEXT:    fadd d0, d0, d1
; CHECK-NEXT:    ret
  %op = load <32 x double>, ptr %a
  %res = call fast double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
  ret double %res
}

;
; FMAXNMV
;

; No NEON 16-bit vector FMAXNMV support. Use SVE.
; NOTE(review): the CHECK lines below show the NEON fmaxnmv form is selected
; here — the comment above may be stale.
define half @fmaxv_v4f16(<4 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmaxnmv h0, v0.4h
; CHECK-NEXT:    ret
  %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
  ret half %res
}

; No NEON 16-bit vector FMAXNMV support. Use SVE.
define half @fmaxv_v8f16(<8 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmaxnmv h0, v0.8h
; CHECK-NEXT:    ret
  %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a)
  ret half %res
}

define half @fmaxv_v16f16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %op = load <16 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op)
  ret half %res
}

; No vscale_range hint: the 256-bit run combines the two halves with a
; vector fmaxnm before a single fmaxnmv.
define half @fmaxv_v32f16(ptr %a) #0 {
; VBITS_GE_256-LABEL: fmaxv_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fmaxnm z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    fmaxnmv h0, p0, z0.h
; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fmaxv_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fmaxnmv h0, p0, z0.h
; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fmax.v32f16(<32 x half> %op)
  ret half %res
}

define half @fmaxv_v64f16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fmaxv_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %op = load <64 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fmax.v64f16(<64 x half> %op)
  ret half %res
}

define half @fmaxv_v128f16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fmaxv_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fmaxnmv h0, p0, z0.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %op = load <128 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fmax.v128f16(<128 x half> %op)
  ret half %res
}

; Don't use SVE for 64-bit f32 vectors.
define float @fmaxv_v2f32(<2 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmaxnmp s0, v0.2s
; CHECK-NEXT:    ret
  %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
  ret float %res
}

; Don't use SVE for 128-bit f32 vectors.
define float @fmaxv_v4f32(<4 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmaxnmv s0, v0.4s
; CHECK-NEXT:    ret
  %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  ret float %res
}

define float @fmaxv_v8f32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %op = load <8 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op)
  ret float %res
}

; No vscale_range hint: the 256-bit run combines the two halves with a
; vector fmaxnm before a single fmaxnmv.
define float @fmaxv_v16f32(ptr %a) #0 {
; VBITS_GE_256-LABEL: fmaxv_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fmaxnm z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    fmaxnmv s0, p0, z0.s
; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fmaxv_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fmaxnmv s0, p0, z0.s
; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %op)
  ret float %res
}

define float @fmaxv_v32f32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fmaxv_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %op = load <32 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> %op)
  ret float %res
}

define float @fmaxv_v64f32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fmaxv_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fmaxnmv s0, p0, z0.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %op = load <64 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fmax.v64f32(<64 x float> %op)
  ret float %res
}

; Nothing to do for single element vectors.
define double @fmaxv_v1f64(<1 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  %res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
  ret double %res
}

; Don't use SVE for 128-bit f64 vectors.
define double @fmaxv_v2f64(<2 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmaxnmp d0, v0.2d
; CHECK-NEXT:    ret
  %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
  ret double %res
}

define double @fmaxv_v4f64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaxv_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %op = load <4 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op)
  ret double %res
}

; No vscale_range hint: the 256-bit run combines the two halves with a
; vector fmaxnm before a single fmaxnmv.
define double @fmaxv_v8f64(ptr %a) #0 {
; VBITS_GE_256-LABEL: fmaxv_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fmaxnm z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    fmaxnmv d0, p0, z0.d
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fmaxv_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fmaxnmv d0, p0, z0.d
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %op)
  ret double %res
}

define double @fmaxv_v16f64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fmaxv_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %op = load <16 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %op)
  ret double %res
}

define double @fmaxv_v32f64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fmaxv_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fmaxnmv d0, p0, z0.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %op = load <32 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fmax.v32f64(<32 x double> %op)
  ret double %res
}

;
; FMINNMV
;

; No NEON 16-bit vector FMINNMV support. Use SVE.
; NOTE(review): the CHECK lines below show the NEON fminnmv form is selected
; here — the comment above may be stale.
define half @fminv_v4f16(<4 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fminnmv h0, v0.4h
; CHECK-NEXT:    ret
  %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
  ret half %res
}

; No NEON 16-bit vector FMINNMV support. Use SVE.
define half @fminv_v8f16(<8 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fminnmv h0, v0.8h
; CHECK-NEXT:    ret
  %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a)
  ret half %res
}

define half @fminv_v16f16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fminnmv h0, p0, z0.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %op = load <16 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op)
  ret half %res
}

; No vscale_range hint: the 256-bit run combines the two halves with a
; vector fminnm before a single fminnmv.
define half @fminv_v32f16(ptr %a) #0 {
; VBITS_GE_256-LABEL: fminv_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fminnm z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    fminnmv h0, p0, z0.h
; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fminv_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fminnmv h0, p0, z0.h
; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fmin.v32f16(<32 x half> %op)
  ret half %res
}

define half @fminv_v64f16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fminv_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fminnmv h0, p0, z0.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %op = load <64 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fmin.v64f16(<64 x half> %op)
  ret half %res
}

define half @fminv_v128f16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fminv_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fminnmv h0, p0, z0.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %op = load <128 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fmin.v128f16(<128 x half> %op)
  ret half %res
}

; Don't use SVE for 64-bit f32 vectors.
define float @fminv_v2f32(<2 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fminnmp s0, v0.2s
; CHECK-NEXT:    ret
  %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a)
  ret float %res
}

; Don't use SVE for 128-bit f32 vectors.
define float @fminv_v4f32(<4 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fminnmv s0, v0.4s
; CHECK-NEXT:    ret
  %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
  ret float %res
}

define float @fminv_v8f32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fminnmv s0, p0, z0.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %op = load <8 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op)
  ret float %res
}

; 16 x f32 needs two 256-bit SVE registers: combine with FMINNM, then reduce.
define float @fminv_v16f32(ptr %a) #0 {
; VBITS_GE_256-LABEL: fminv_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fminnm z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    fminnmv s0, p0, z0.s
; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fminv_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fminnmv s0, p0, z0.s
; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %op)
  ret float %res
}

define float @fminv_v32f32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fminv_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fminnmv s0, p0, z0.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %op = load <32 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> %op)
  ret float %res
}

define float @fminv_v64f32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fminv_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fminnmv s0, p0, z0.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %op = load <64 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fmin.v64f32(<64 x float> %op)
  ret float %res
}

; Nothing to do for single element vectors.
define double @fminv_v1f64(<1 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  %res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
  ret double %res
}

; Don't use SVE for 128-bit f64 vectors.
define double @fminv_v2f64(<2 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fminnmp d0, v0.2d
; CHECK-NEXT:    ret
  %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
  ret double %res
}

define double @fminv_v4f64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminv_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fminnmv d0, p0, z0.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %op = load <4 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op)
  ret double %res
}

define double @fminv_v8f64(ptr %a) #0 {
; VBITS_GE_256-LABEL: fminv_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fminnm z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    fminnmv d0, p0, z0.d
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fminv_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fminnmv d0, p0, z0.d
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %op)
  ret double %res
}

define double @fminv_v16f64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fminv_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fminnmv d0, p0, z0.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %op = load <16 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %op)
  ret double %res
}

define double @fminv_v32f64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fminv_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fminnmv d0, p0, z0.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %op = load <32 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fmin.v32f64(<32 x double> %op)
  ret double %res
}

;
; FMAXV
;

define half @fmaximumv_v4f16(<4 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaximumv_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmaxv h0, v0.4h
; CHECK-NEXT:    ret
  %res = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %a)
  ret half %res
}

define half @fmaximumv_v8f16(<8 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaximumv_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmaxv h0, v0.8h
; CHECK-NEXT:    ret
  %res = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> %a)
  ret half %res
}

define half @fmaximumv_v16f16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaximumv_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fmaxv h0, p0, z0.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %op = load <16 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> %op)
  ret half %res
}

define half @fmaximumv_v32f16(ptr %a) #0 {
; VBITS_GE_256-LABEL: fmaximumv_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fmax z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    fmaxv h0, p0, z0.h
; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fmaximumv_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fmaxv h0, p0, z0.h
; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fmaximum.v32f16(<32 x half> %op)
  ret half %res
}

define half @fmaximumv_v64f16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fmaximumv_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fmaxv h0, p0, z0.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %op = load <64 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fmaximum.v64f16(<64 x half> %op)
  ret half %res
}

define half @fmaximumv_v128f16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fmaximumv_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fmaxv h0, p0, z0.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %op = load <128 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fmaximum.v128f16(<128 x half> %op)
  ret half %res
}

; Don't use SVE for 64-bit f32 vectors.
define float @fmaximumv_v2f32(<2 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaximumv_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmaxp s0, v0.2s
; CHECK-NEXT:    ret
  %res = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> %a)
  ret float %res
}

; Don't use SVE for 128-bit f32 vectors.
define float @fmaximumv_v4f32(<4 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaximumv_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmaxv s0, v0.4s
; CHECK-NEXT:    ret
  %res = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a)
  ret float %res
}

define float @fmaximumv_v8f32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaximumv_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fmaxv s0, p0, z0.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %op = load <8 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %op)
  ret float %res
}

define float @fmaximumv_v16f32(ptr %a) #0 {
; VBITS_GE_256-LABEL: fmaximumv_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fmax z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    fmaxv s0, p0, z0.s
; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fmaximumv_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fmaxv s0, p0, z0.s
; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> %op)
  ret float %res
}

define float @fmaximumv_v32f32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fmaximumv_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fmaxv s0, p0, z0.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %op = load <32 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> %op)
  ret float %res
}

define float @fmaximumv_v64f32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fmaximumv_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fmaxv s0, p0, z0.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %op = load <64 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> %op)
  ret float %res
}

; Nothing to do for single element vectors.
define double @fmaximumv_v1f64(<1 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaximumv_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  %res = call double @llvm.vector.reduce.fmaximum.v1f64(<1 x double> %a)
  ret double %res
}

; Don't use SVE for 128-bit f64 vectors.
define double @fmaximumv_v2f64(<2 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaximumv_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmaxp d0, v0.2d
; CHECK-NEXT:    ret
  %res = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %a)
  ret double %res
}

define double @fmaximumv_v4f64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fmaximumv_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fmaxv d0, p0, z0.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %op = load <4 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %op)
  ret double %res
}

define double @fmaximumv_v8f64(ptr %a) #0 {
; VBITS_GE_256-LABEL: fmaximumv_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fmax z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    fmaxv d0, p0, z0.d
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fmaximumv_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fmaxv d0, p0, z0.d
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> %op)
  ret double %res
}

define double @fmaximumv_v16f64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fmaximumv_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fmaxv d0, p0, z0.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %op = load <16 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> %op)
  ret double %res
}

define double @fmaximumv_v32f64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fmaximumv_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fmaxv d0, p0, z0.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %op = load <32 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> %op)
  ret double %res
}

;
; FMINV
;

define half @fminimumv_v4f16(<4 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminimumv_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fminv h0, v0.4h
; CHECK-NEXT:    ret
  %res = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> %a)
  ret half %res
}

define half @fminimumv_v8f16(<8 x half> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminimumv_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fminv h0, v0.8h
; CHECK-NEXT:    ret
  %res = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> %a)
  ret half %res
}

define half @fminimumv_v16f16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminimumv_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fminv h0, p0, z0.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %op = load <16 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> %op)
  ret half %res
}

define half @fminimumv_v32f16(ptr %a) #0 {
; VBITS_GE_256-LABEL: fminimumv_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fmin z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    fminv h0, p0, z0.h
; VBITS_GE_256-NEXT:    // kill: def $h0 killed $h0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fminimumv_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fminv h0, p0, z0.h
; VBITS_GE_512-NEXT:    // kill: def $h0 killed $h0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fminimum.v32f16(<32 x half> %op)
  ret half %res
}

define half @fminimumv_v64f16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fminimumv_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fminv h0, p0, z0.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %op = load <64 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fminimum.v64f16(<64 x half> %op)
  ret half %res
}

define half @fminimumv_v128f16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fminimumv_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fminv h0, p0, z0.h
; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT:    ret
  %op = load <128 x half>, ptr %a
  %res = call half @llvm.vector.reduce.fminimum.v128f16(<128 x half> %op)
  ret half %res
}

; Don't use SVE for 64-bit f32 vectors.
define float @fminimumv_v2f32(<2 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminimumv_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fminp s0, v0.2s
; CHECK-NEXT:    ret
  %res = call float @llvm.vector.reduce.fminimum.v2f32(<2 x float> %a)
  ret float %res
}

; Don't use SVE for 128-bit f32 vectors.
define float @fminimumv_v4f32(<4 x float> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminimumv_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fminv s0, v0.4s
; CHECK-NEXT:    ret
  %res = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a)
  ret float %res
}

define float @fminimumv_v8f32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminimumv_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fminv s0, p0, z0.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %op = load <8 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %op)
  ret float %res
}

define float @fminimumv_v16f32(ptr %a) #0 {
; VBITS_GE_256-LABEL: fminimumv_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fmin z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    fminv s0, p0, z0.s
; VBITS_GE_256-NEXT:    // kill: def $s0 killed $s0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fminimumv_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fminv s0, p0, z0.s
; VBITS_GE_512-NEXT:    // kill: def $s0 killed $s0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fminimum.v16f32(<16 x float> %op)
  ret float %res
}

define float @fminimumv_v32f32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fminimumv_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fminv s0, p0, z0.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %op = load <32 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fminimum.v32f32(<32 x float> %op)
  ret float %res
}

define float @fminimumv_v64f32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fminimumv_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fminv s0, p0, z0.s
; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT:    ret
  %op = load <64 x float>, ptr %a
  %res = call float @llvm.vector.reduce.fminimum.v64f32(<64 x float> %op)
  ret float %res
}

; Nothing to do for single element vectors.
define double @fminimumv_v1f64(<1 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminimumv_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  %res = call double @llvm.vector.reduce.fminimum.v1f64(<1 x double> %a)
  ret double %res
}

; Don't use SVE for 128-bit f64 vectors.
define double @fminimumv_v2f64(<2 x double> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminimumv_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fminp d0, v0.2d
; CHECK-NEXT:    ret
  %res = call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %a)
  ret double %res
}

define double @fminimumv_v4f64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fminimumv_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fminv d0, p0, z0.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %op = load <4 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %op)
  ret double %res
}

define double @fminimumv_v8f64(ptr %a) #0 {
; VBITS_GE_256-LABEL: fminimumv_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fmin z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    fminv d0, p0, z0.d
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fminimumv_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fminv d0, p0, z0.d
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fminimum.v8f64(<8 x double> %op)
  ret double %res
}

define double @fminimumv_v16f64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: fminimumv_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fminv d0, p0, z0.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %op = load <16 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fminimum.v16f64(<16 x double> %op)
  ret double %res
}

define double @fminimumv_v32f64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: fminimumv_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fminv d0, p0, z0.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %op = load <32 x double>, ptr %a
  %res = call double @llvm.vector.reduce.fminimum.v32f64(<32 x double> %op)
  ret double %res
}

attributes #0 = { "target-features"="+sve" }

; Declarations for the reduction intrinsics exercised above.
declare half @llvm.vector.reduce.fadd.v4f16(half, <4 x half>)
declare half @llvm.vector.reduce.fadd.v8f16(half, <8 x half>)
declare half @llvm.vector.reduce.fadd.v16f16(half, <16 x half>)
declare half @llvm.vector.reduce.fadd.v32f16(half, <32 x half>)
declare half @llvm.vector.reduce.fadd.v64f16(half, <64 x half>)
declare half @llvm.vector.reduce.fadd.v128f16(half, <128 x half>)

declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>)
declare float @llvm.vector.reduce.fadd.v32f32(float, <32 x float>)
declare float @llvm.vector.reduce.fadd.v64f32(float, <64 x float>)

declare double @llvm.vector.reduce.fadd.v1f64(double, <1 x double>)
declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>)
declare double @llvm.vector.reduce.fadd.v32f64(double, <32 x double>)
declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>)
declare half @llvm.vector.reduce.fmax.v8f16(<8 x half>)
declare half @llvm.vector.reduce.fmax.v16f16(<16 x half>)
declare half @llvm.vector.reduce.fmax.v32f16(<32 x half>)
declare half @llvm.vector.reduce.fmax.v64f16(<64 x half>)
declare half @llvm.vector.reduce.fmax.v128f16(<128 x half>)

declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>)
declare float @llvm.vector.reduce.fmax.v64f32(<64 x float>)

declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>)
declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)
declare double @llvm.vector.reduce.fmax.v32f64(<32 x double>)

declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>)
declare half @llvm.vector.reduce.fmin.v8f16(<8 x half>)
declare half @llvm.vector.reduce.fmin.v16f16(<16 x half>)
declare half @llvm.vector.reduce.fmin.v32f16(<32 x half>)
declare half @llvm.vector.reduce.fmin.v64f16(<64 x half>)
declare half @llvm.vector.reduce.fmin.v128f16(<128 x half>)

declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>)
declare float @llvm.vector.reduce.fmin.v32f32(<32 x float>)
declare float @llvm.vector.reduce.fmin.v64f32(<64 x float>)

declare double @llvm.vector.reduce.fmin.v1f64(<1 x double>)
declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>)
declare double @llvm.vector.reduce.fmin.v32f64(<32 x double>)

declare half @llvm.vector.reduce.fmaximum.v4f16(<4 x half>)
declare half @llvm.vector.reduce.fmaximum.v8f16(<8 x half>)
declare half @llvm.vector.reduce.fmaximum.v16f16(<16 x half>)
declare half @llvm.vector.reduce.fmaximum.v32f16(<32 x half>)
declare half @llvm.vector.reduce.fmaximum.v64f16(<64 x half>)
declare half @llvm.vector.reduce.fmaximum.v128f16(<128 x half>)

declare float @llvm.vector.reduce.fmaximum.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fmaximum.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmaximum.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmaximum.v16f32(<16 x float>)
declare float @llvm.vector.reduce.fmaximum.v32f32(<32 x float>)
declare float @llvm.vector.reduce.fmaximum.v64f32(<64 x float>)

declare double @llvm.vector.reduce.fmaximum.v1f64(<1 x double>)
declare double @llvm.vector.reduce.fmaximum.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmaximum.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmaximum.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmaximum.v16f64(<16 x double>)
declare double @llvm.vector.reduce.fmaximum.v32f64(<32 x double>)

declare half @llvm.vector.reduce.fminimum.v4f16(<4 x half>)
declare half @llvm.vector.reduce.fminimum.v8f16(<8 x half>)
declare half @llvm.vector.reduce.fminimum.v16f16(<16 x half>)
declare half @llvm.vector.reduce.fminimum.v32f16(<32 x half>)
declare half @llvm.vector.reduce.fminimum.v64f16(<64 x half>)
declare half @llvm.vector.reduce.fminimum.v128f16(<128 x half>)

declare float @llvm.vector.reduce.fminimum.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fminimum.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fminimum.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fminimum.v16f32(<16 x float>)
declare float @llvm.vector.reduce.fminimum.v32f32(<32 x float>)
declare float @llvm.vector.reduce.fminimum.v64f32(<64 x float>)

declare double @llvm.vector.reduce.fminimum.v1f64(<1 x double>)
declare double @llvm.vector.reduce.fminimum.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fminimum.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fminimum.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fminimum.v16f64(<16 x double>)
declare double @llvm.vector.reduce.fminimum.v32f64(<32 x double>)