; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16

; Codegen tests for the fast-math form of @llvm.vector.reduce.fadd on AArch64.
; The four llc invocations above cover SelectionDAG and GlobalISel, each with
; and without +fullfp16. Unless a test says otherwise, the start value of the
; reduction is -0.0.

; Reduce a <2 x float> vector.
define float @add_HalfS(<2 x float> %bin.rdx) {
; CHECK-LABEL: add_HalfS:
; CHECK:       // %bb.0:
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %bin.rdx)
  ret float %r
}

; Reduce a <4 x half> vector.
define half @add_HalfH(<4 x half> %bin.rdx) {
; CHECK-SD-NOFP16-LABEL: add_HalfH:
; CHECK-SD-NOFP16:       // %bb.0:
; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fadd s1, s2, s1
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[3]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fadd s0, s1, s0
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    ret
;
; CHECK-SD-FP16-LABEL: add_HalfH:
; CHECK-SD-FP16:       // %bb.0:
; CHECK-SD-FP16-NEXT:    faddp v0.4h, v0.4h, v0.4h
; CHECK-SD-FP16-NEXT:    faddp h0, v0.2h
; CHECK-SD-FP16-NEXT:    ret
;
; CHECK-GI-NOFP16-LABEL: add_HalfH:
; CHECK-GI-NOFP16:       // %bb.0:
; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
; CHECK-GI-NOFP16-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NOFP16-NEXT:    faddp s0, v0.2s
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    ret
;
; CHECK-GI-FP16-LABEL: add_HalfH:
; CHECK-GI-FP16:       // %bb.0:
; CHECK-GI-FP16-NEXT:    faddp v0.4h, v0.4h, v0.4h
; CHECK-GI-FP16-NEXT:    faddp h0, v0.2h
; CHECK-GI-FP16-NEXT:    ret
  %r = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx)
  ret half %r
}


; Reduce an <8 x half> vector.
define half @add_H(<8 x half> %bin.rdx) {
; CHECK-SD-NOFP16-LABEL: add_H:
; CHECK-SD-NOFP16:       // %bb.0:
; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fadd s1, s2, s1
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[3]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[4]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[5]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fadd s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fadd s0, s1, s0
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    ret
;
; CHECK-SD-FP16-LABEL: add_H:
; CHECK-SD-FP16:       // %bb.0:
; CHECK-SD-FP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
; CHECK-SD-FP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
; CHECK-SD-FP16-NEXT:    faddp h0, v0.2h
; CHECK-SD-FP16-NEXT:    ret
;
; CHECK-GI-NOFP16-LABEL: add_H:
; CHECK-GI-NOFP16:       // %bb.0:
; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v1.4s, v0.4s
; CHECK-GI-NOFP16-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NOFP16-NEXT:    faddp s0, v0.2s
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    ret
;
; CHECK-GI-FP16-LABEL: add_H:
; CHECK-GI-FP16:       // %bb.0:
; CHECK-GI-FP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
; CHECK-GI-FP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
; CHECK-GI-FP16-NEXT:    faddp h0, v0.2h
; CHECK-GI-FP16-NEXT:    ret
  %r = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %bin.rdx)
  ret half %r
}

; Reduce a <4 x float> vector.
define float @add_S(<4 x float> %bin.rdx) {
; CHECK-LABEL: add_S:
; CHECK:       // %bb.0:
; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %bin.rdx)
  ret float %r
}

; Reduce a <2 x double> vector.
define double @add_D(<2 x double> %bin.rdx) {
; CHECK-LABEL: add_D:
; CHECK:       // %bb.0:
; CHECK-NEXT:    faddp d0, v0.2d
; CHECK-NEXT:    ret
  %r = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %bin.rdx)
  ret double %r
}

; Reduce a <16 x half> vector; the expected code reads it from v0 and v1.
define half @add_2H(<16 x half> %bin.rdx) {
; CHECK-SD-NOFP16-LABEL: add_2H:
; CHECK-SD-NOFP16:       // %bb.0:
; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
; CHECK-SD-NOFP16-NEXT:    fadd v2.4s, v3.4s, v2.4s
; CHECK-SD-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v2.4s
; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[1]
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h1
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fadd s0, s2, s0
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[2]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[3]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[4]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[5]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[6]
; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s1
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    ret
;
; CHECK-SD-FP16-LABEL: add_2H:
; CHECK-SD-FP16:       // %bb.0:
; CHECK-SD-FP16-NEXT:    fadd v0.8h, v0.8h, v1.8h
; CHECK-SD-FP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
; CHECK-SD-FP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
; CHECK-SD-FP16-NEXT:    faddp h0, v0.2h
; CHECK-SD-FP16-NEXT:    ret
;
; CHECK-GI-NOFP16-LABEL: add_2H:
; CHECK-GI-NOFP16:       // %bb.0:
; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v2.4s, v0.4s
; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v3.4s, v1.4s
; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-GI-NOFP16-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NOFP16-NEXT:    faddp s0, v0.2s
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    ret
;
; CHECK-GI-FP16-LABEL: add_2H:
; CHECK-GI-FP16:       // %bb.0:
; CHECK-GI-FP16-NEXT:    fadd v0.8h, v0.8h, v1.8h
; CHECK-GI-FP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
; CHECK-GI-FP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
; CHECK-GI-FP16-NEXT:    faddp h0, v0.2h
; CHECK-GI-FP16-NEXT:    ret
  %r = call fast half @llvm.vector.reduce.fadd.f16.v16f16(half -0.0, <16 x half> %bin.rdx)
  ret half %r
}

; Reduce an <8 x float> vector; the expected code reads it from v0 and v1.
define float @add_2S(<8 x float> %bin.rdx) {
; CHECK-LABEL: add_2S:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %bin.rdx)
  ret float %r
}

; Reduce a <4 x double> vector; the expected code reads it from v0 and v1.
define double @add_2D(<4 x double> %bin.rdx) {
; CHECK-LABEL: add_2D:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    faddp d0, v0.2d
; CHECK-NEXT:    ret
  %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %bin.rdx)
  ret double %r
}

; Added at least one test where the start value is not -0.0.
; The start value 42.0 is materialized as 0x42280000 and added after the
; vector has been reduced.
define float @add_S_init_42(<4 x float> %bin.rdx) {
; CHECK-LABEL: add_S_init_42:
; CHECK:       // %bb.0:
; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-NEXT:    mov w8, #1109917696 // =0x42280000
; CHECK-NEXT:    fmov s1, w8
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 42.0, <4 x float> %bin.rdx)
  ret float %r
}

; The faddp.4s in the loop should not use v0.4s as second operand,
; because this introduces an unnecessary cross-iteration dependency.
define float @fadd_reduction_v4f32_in_loop(ptr %ptr.start) {
; CHECK-LABEL: fadd_reduction_v4f32_in_loop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    movi d0, #0000000000000000
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB9_1: // %loop
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr q1, [x0, x8]
; CHECK-NEXT:    add x8, x8, #16
; CHECK-NEXT:    cmp w8, #112
; CHECK-NEXT:    faddp v1.4s, v1.4s, v1.4s
; CHECK-NEXT:    faddp s1, v1.2s
; CHECK-NEXT:    fadd s0, s1, s0
; CHECK-NEXT:    b.ne .LBB9_1
; CHECK-NEXT:  // %bb.2: // %exit
; CHECK-NEXT:    ret
entry:
  br label %loop

loop:
  %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
  %ptr = phi ptr [ %ptr.start, %entry ], [ %ptr.next, %loop ]
  %red = phi float [ 0.000000e+00, %entry ], [ %red.next, %loop ]
  %lv = load <4 x float>, ptr %ptr, align 4
  %r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %lv)
  %red.next = fadd fast float %r, %red
  %ec = icmp eq i32 %iv, 7
  %ptr.next = getelementptr inbounds float, ptr %ptr, i64 4
  %iv.next = add nuw nsw i32 %iv, 1
  br i1 %ec, label %exit, label %loop

exit:
  ret float %red.next
}

; The faddp.4h in the loop should not use v0.4h as second operand,
; because this introduces an unnecessary cross-iteration dependency.
define half @fadd_reduction_v4f16_in_loop(ptr %ptr.start) {
; CHECK-SD-NOFP16-LABEL: fadd_reduction_v4f16_in_loop:
; CHECK-SD-NOFP16:       // %bb.0: // %entry
; CHECK-SD-NOFP16-NEXT:    movi d0, #0000000000000000
; CHECK-SD-NOFP16-NEXT:    mov x8, xzr
; CHECK-SD-NOFP16-NEXT:  .LBB10_1: // %loop
; CHECK-SD-NOFP16-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NOFP16-NEXT:    ldr d1, [x0, x8]
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    add x8, x8, #8
; CHECK-SD-NOFP16-NEXT:    cmp w8, #56
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
; CHECK-SD-NOFP16-NEXT:    fcvt s3, h1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fadd s2, s3, s2
; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[2]
; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[3]
; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fadd s1, s2, s1
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fadd s0, s1, s0
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    b.ne .LBB10_1
; CHECK-SD-NOFP16-NEXT:  // %bb.2: // %exit
; CHECK-SD-NOFP16-NEXT:    ret
;
; CHECK-SD-FP16-LABEL: fadd_reduction_v4f16_in_loop:
; CHECK-SD-FP16:       // %bb.0: // %entry
; CHECK-SD-FP16-NEXT:    movi d0, #0000000000000000
; CHECK-SD-FP16-NEXT:    mov x8, xzr
; CHECK-SD-FP16-NEXT:  .LBB10_1: // %loop
; CHECK-SD-FP16-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-FP16-NEXT:    ldr d1, [x0, x8]
; CHECK-SD-FP16-NEXT:    add x8, x8, #8
; CHECK-SD-FP16-NEXT:    cmp w8, #56
; CHECK-SD-FP16-NEXT:    faddp v1.4h, v1.4h, v1.4h
; CHECK-SD-FP16-NEXT:    faddp h1, v1.2h
; CHECK-SD-FP16-NEXT:    fadd h0, h1, h0
; CHECK-SD-FP16-NEXT:    b.ne .LBB10_1
; CHECK-SD-FP16-NEXT:  // %bb.2: // %exit
; CHECK-SD-FP16-NEXT:    ret
;
; CHECK-GI-NOFP16-LABEL: fadd_reduction_v4f16_in_loop:
; CHECK-GI-NOFP16:       // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT:    mov x8, xzr
; CHECK-GI-NOFP16-NEXT:    mov w9, #0 // =0x0
; CHECK-GI-NOFP16-NEXT:  .LBB10_1: // %loop
; CHECK-GI-NOFP16-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NOFP16-NEXT:    ldr d0, [x0, x8]
; CHECK-GI-NOFP16-NEXT:    fmov s1, w9
; CHECK-GI-NOFP16-NEXT:    add x8, x8, #8
; CHECK-GI-NOFP16-NEXT:    cmp w8, #56
; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
; CHECK-GI-NOFP16-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NOFP16-NEXT:    faddp s0, v0.2s
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
; CHECK-GI-NOFP16-NEXT:    fadd s0, s0, s1
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    fmov w9, s0
; CHECK-GI-NOFP16-NEXT:    b.ne .LBB10_1
; CHECK-GI-NOFP16-NEXT:  // %bb.2: // %exit
; CHECK-GI-NOFP16-NEXT:    // kill: def $h0 killed $h0 killed $s0
; CHECK-GI-NOFP16-NEXT:    ret
;
; CHECK-GI-FP16-LABEL: fadd_reduction_v4f16_in_loop:
; CHECK-GI-FP16:       // %bb.0: // %entry
; CHECK-GI-FP16-NEXT:    movi d0, #0000000000000000
; CHECK-GI-FP16-NEXT:    mov x8, xzr
; CHECK-GI-FP16-NEXT:  .LBB10_1: // %loop
; CHECK-GI-FP16-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-FP16-NEXT:    ldr d1, [x0, x8]
; CHECK-GI-FP16-NEXT:    add x8, x8, #8
; CHECK-GI-FP16-NEXT:    cmp w8, #56
; CHECK-GI-FP16-NEXT:    faddp v1.4h, v1.4h, v1.4h
; CHECK-GI-FP16-NEXT:    faddp h1, v1.2h
; CHECK-GI-FP16-NEXT:    fadd h0, h1, h0
; CHECK-GI-FP16-NEXT:    b.ne .LBB10_1
; CHECK-GI-FP16-NEXT:  // %bb.2: // %exit
; CHECK-GI-FP16-NEXT:    ret
entry:
  br label %loop

loop:
  %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
  %ptr = phi ptr [ %ptr.start, %entry ], [ %ptr.next, %loop ]
  %red = phi half [ 0.000000e+00, %entry ], [ %red.next, %loop ]
  %lv = load <4 x half>, ptr %ptr, align 4
  %r = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %lv)
  %red.next = fadd fast half %r, %red
  %ec = icmp eq i32 %iv, 7
  %ptr.next = getelementptr inbounds half, ptr %ptr, i64 4
  %iv.next = add nuw nsw i32 %iv, 1
  br i1 %ec, label %exit, label %loop

exit:
  ret half %red.next
}

; The faddp.8h in the loop should not use v0.8h as second operand,
; because this introduces an unnecessary cross-iteration dependency.
; NOTE(review): %ptr.next advances by 4 halfs (8 bytes) while every iteration
; loads an <8 x half> (16 bytes), so consecutive loads overlap. This looks like
; it was copied from the v4f16 variant; confirm whether it is intentional. The
; expected assembly (add #8 / cmp #56) matches the IR as written, so changing
; the stride requires regenerating the assertions.
define half @fadd_reduction_v8f16_in_loop(ptr %ptr.start) {
; CHECK-SD-NOFP16-LABEL: fadd_reduction_v8f16_in_loop:
; CHECK-SD-NOFP16:       // %bb.0: // %entry
; CHECK-SD-NOFP16-NEXT:    movi d0, #0000000000000000
; CHECK-SD-NOFP16-NEXT:    mov x8, xzr
; CHECK-SD-NOFP16-NEXT:  .LBB11_1: // %loop
; CHECK-SD-NOFP16-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NOFP16-NEXT:    ldr q1, [x0, x8]
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    add x8, x8, #8
; CHECK-SD-NOFP16-NEXT:    cmp w8, #56
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[1]
; CHECK-SD-NOFP16-NEXT:    fcvt s3, h1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fadd s2, s3, s2
; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[2]
; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[3]
; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[4]
; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[5]
; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
; CHECK-SD-NOFP16-NEXT:    mov h3, v1.h[6]
; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fadd s2, s2, s3
; CHECK-SD-NOFP16-NEXT:    fcvt h2, s2
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fadd s1, s2, s1
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fadd s0, s1, s0
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    b.ne .LBB11_1
; CHECK-SD-NOFP16-NEXT:  // %bb.2: // %exit
; CHECK-SD-NOFP16-NEXT:    ret
;
; CHECK-SD-FP16-LABEL: fadd_reduction_v8f16_in_loop:
; CHECK-SD-FP16:       // %bb.0: // %entry
; CHECK-SD-FP16-NEXT:    movi d0, #0000000000000000
; CHECK-SD-FP16-NEXT:    mov x8, xzr
; CHECK-SD-FP16-NEXT:  .LBB11_1: // %loop
; CHECK-SD-FP16-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-FP16-NEXT:    ldr q1, [x0, x8]
; CHECK-SD-FP16-NEXT:    add x8, x8, #8
; CHECK-SD-FP16-NEXT:    cmp w8, #56
; CHECK-SD-FP16-NEXT:    faddp v2.8h, v1.8h, v1.8h
; CHECK-SD-FP16-NEXT:    faddp v1.8h, v2.8h, v1.8h
; CHECK-SD-FP16-NEXT:    faddp h1, v1.2h
; CHECK-SD-FP16-NEXT:    fadd h0, h1, h0
; CHECK-SD-FP16-NEXT:    b.ne .LBB11_1
; CHECK-SD-FP16-NEXT:  // %bb.2: // %exit
; CHECK-SD-FP16-NEXT:    ret
;
; CHECK-GI-NOFP16-LABEL: fadd_reduction_v8f16_in_loop:
; CHECK-GI-NOFP16:       // %bb.0: // %entry
; CHECK-GI-NOFP16-NEXT:    mov x8, xzr
; CHECK-GI-NOFP16-NEXT:    mov w9, #0 // =0x0
; CHECK-GI-NOFP16-NEXT:  .LBB11_1: // %loop
; CHECK-GI-NOFP16-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NOFP16-NEXT:    ldr q0, [x0, x8]
; CHECK-GI-NOFP16-NEXT:    add x8, x8, #8
; CHECK-GI-NOFP16-NEXT:    cmp w8, #56
; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v1.4s, v0.4s
; CHECK-GI-NOFP16-NEXT:    fmov s1, w9
; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
; CHECK-GI-NOFP16-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NOFP16-NEXT:    faddp s0, v0.2s
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
; CHECK-GI-NOFP16-NEXT:    fadd s0, s0, s1
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    fmov w9, s0
; CHECK-GI-NOFP16-NEXT:    b.ne .LBB11_1
; CHECK-GI-NOFP16-NEXT:  // %bb.2: // %exit
; CHECK-GI-NOFP16-NEXT:    // kill: def $h0 killed $h0 killed $s0
; CHECK-GI-NOFP16-NEXT:    ret
;
; CHECK-GI-FP16-LABEL: fadd_reduction_v8f16_in_loop:
; CHECK-GI-FP16:       // %bb.0: // %entry
; CHECK-GI-FP16-NEXT:    movi d0, #0000000000000000
; CHECK-GI-FP16-NEXT:    mov x8, xzr
; CHECK-GI-FP16-NEXT:  .LBB11_1: // %loop
; CHECK-GI-FP16-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-FP16-NEXT:    ldr q1, [x0, x8]
; CHECK-GI-FP16-NEXT:    add x8, x8, #8
; CHECK-GI-FP16-NEXT:    cmp w8, #56
; CHECK-GI-FP16-NEXT:    faddp v2.8h, v1.8h, v1.8h
; CHECK-GI-FP16-NEXT:    faddp v1.8h, v2.8h, v1.8h
; CHECK-GI-FP16-NEXT:    faddp h1, v1.2h
; CHECK-GI-FP16-NEXT:    fadd h0, h1, h0
; CHECK-GI-FP16-NEXT:    b.ne .LBB11_1
; CHECK-GI-FP16-NEXT:  // %bb.2: // %exit
; CHECK-GI-FP16-NEXT:    ret
entry:
  br label %loop

loop:
  %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
  %ptr = phi ptr [ %ptr.start, %entry ], [ %ptr.next, %loop ]
  %red = phi half [ 0.000000e+00, %entry ], [ %red.next, %loop ]
  %lv = load <8 x half>, ptr %ptr, align 4
  %r = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %lv)
  %red.next = fadd fast half %r, %red
  %ec = icmp eq i32 %iv, 7
  %ptr.next = getelementptr inbounds half, ptr %ptr, i64 4
  %iv.next = add nuw nsw i32 %iv, 1
  br i1 %ec, label %exit, label %loop

exit:
  ret half %red.next
}


; Two fast <8 x half> reductions whose results are added together. The
; SelectionDAG output adds the two vectors first and reduces once; the
; GlobalISel output reduces each vector separately.
define half @fadd_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-SD-NOFP16-LABEL: fadd_reduct_reassoc_v8f16:
; CHECK-SD-NOFP16:       // %bb.0:
; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
; CHECK-SD-NOFP16-NEXT:    fadd v2.4s, v3.4s, v2.4s
; CHECK-SD-NOFP16-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v2.4s
; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[1]
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h1
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fadd s0, s2, s0
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[2]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[3]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[4]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[5]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[6]
; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fadd s0, s0, s1
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    ret
;
; CHECK-SD-FP16-LABEL: fadd_reduct_reassoc_v8f16:
; CHECK-SD-FP16:       // %bb.0:
; CHECK-SD-FP16-NEXT:    fadd v0.8h, v0.8h, v1.8h
; CHECK-SD-FP16-NEXT:    faddp v1.8h, v0.8h, v0.8h
; CHECK-SD-FP16-NEXT:    faddp v0.8h, v1.8h, v0.8h
; CHECK-SD-FP16-NEXT:    faddp h0, v0.2h
; CHECK-SD-FP16-NEXT:    ret
;
; CHECK-GI-NOFP16-LABEL: fadd_reduct_reassoc_v8f16:
; CHECK-GI-NOFP16:       // %bb.0:
; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
; CHECK-GI-NOFP16-NEXT:    fadd v0.4s, v2.4s, v0.4s
; CHECK-GI-NOFP16-NEXT:    fadd v1.4s, v3.4s, v1.4s
; CHECK-GI-NOFP16-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NOFP16-NEXT:    faddp v1.4s, v1.4s, v1.4s
; CHECK-GI-NOFP16-NEXT:    faddp s0, v0.2s
; CHECK-GI-NOFP16-NEXT:    faddp s1, v1.2s
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    fcvt h1, s1
; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
; CHECK-GI-NOFP16-NEXT:    fadd s0, s0, s1
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    ret
;
; CHECK-GI-FP16-LABEL: fadd_reduct_reassoc_v8f16:
; CHECK-GI-FP16:       // %bb.0:
; CHECK-GI-FP16-NEXT:    faddp v2.8h, v0.8h, v0.8h
; CHECK-GI-FP16-NEXT:    faddp v3.8h, v1.8h, v1.8h
; CHECK-GI-FP16-NEXT:    faddp v0.8h, v2.8h, v0.8h
; CHECK-GI-FP16-NEXT:    faddp v1.8h, v3.8h, v1.8h
; CHECK-GI-FP16-NEXT:    faddp h0, v0.2h
; CHECK-GI-FP16-NEXT:    faddp h1, v1.2h
; CHECK-GI-FP16-NEXT:    fadd h0, h0, h1
; CHECK-GI-FP16-NEXT:    ret
  %r1 = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %a)
  %r2 = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %b)
  %r = fadd fast half %r1, %r2
  ret half %r
}

; Same pattern with two <8 x float> reductions.
define float @fadd_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) {
; CHECK-SD-LABEL: fadd_reduct_reassoc_v8f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fadd v2.4s, v2.4s, v3.4s
; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-SD-NEXT:    faddp s0, v0.2s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fadd_reduct_reassoc_v8f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    fadd v1.4s, v2.4s, v3.4s
; CHECK-GI-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NEXT:    faddp v1.4s, v1.4s, v1.4s
; CHECK-GI-NEXT:    faddp s0, v0.2s
; CHECK-GI-NEXT:    faddp s1, v1.2s
; CHECK-GI-NEXT:    fadd s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %b)
  %r = fadd fast float %r1, %r2
  ret float %r
}

; Same pattern with two <4 x float> reductions.
define float @fadd_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fadd_reduct_reassoc_v4f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-SD-NEXT:    faddp s0, v0.2s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fadd_reduct_reassoc_v4f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NEXT:    faddp v1.4s, v1.4s, v1.4s
; CHECK-GI-NEXT:    faddp s0, v0.2s
; CHECK-GI-NEXT:    faddp s1, v1.2s
; CHECK-GI-NEXT:    fadd s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %r = fadd fast float %r1, %r2
  ret float %r
}

; The first reduction takes its start value from the argument %i. Both
; selectors are expected to emit two separate reductions here.
define float @fadd_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fadd_reduct_reassoc_v4f32_init:
; CHECK:       // %bb.0:
; CHECK-NEXT:    faddp v1.4s, v1.4s, v1.4s
; CHECK-NEXT:    faddp v2.4s, v2.4s, v2.4s
; CHECK-NEXT:    faddp s1, v1.2s
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    faddp s1, v2.2s
; CHECK-NEXT:    fadd s0, s0, s1
; CHECK-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %i, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %r = fadd fast float %r1, %r2
  ret float %r
}

; The two reductions have different vector widths (<4 x float> and
; <8 x float>).
define float @fadd_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) {
; CHECK-SD-LABEL: fadd_reduct_reassoc_v4v8f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fadd v1.4s, v1.4s, v2.4s
; CHECK-SD-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-SD-NEXT:    faddp s0, v0.2s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fadd_reduct_reassoc_v4v8f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fadd v1.4s, v1.4s, v2.4s
; CHECK-GI-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NEXT:    faddp v1.4s, v1.4s, v1.4s
; CHECK-GI-NEXT:    faddp s0, v0.2s
; CHECK-GI-NEXT:    faddp s1, v1.2s
; CHECK-GI-NEXT:    fadd s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %b)
  %r = fadd fast float %r1, %r2
  ret float %r
}

; Same pattern with two <4 x double> reductions.
define double @fadd_reduct_reassoc_v4f64(<4 x double> %a, <4 x double> %b) {
; CHECK-SD-LABEL: fadd_reduct_reassoc_v4f64:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fadd v2.2d, v2.2d, v3.2d
; CHECK-SD-NEXT:    fadd v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT:    fadd v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    faddp d0, v0.2d
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fadd_reduct_reassoc_v4f64:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fadd v0.2d, v0.2d, v1.2d
; CHECK-GI-NEXT:    fadd v1.2d, v2.2d, v3.2d
; CHECK-GI-NEXT:    faddp d0, v0.2d
; CHECK-GI-NEXT:    faddp d1, v1.2d
; CHECK-GI-NEXT:    fadd d0, d0, d1
; CHECK-GI-NEXT:    ret
  %r1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %a)
  %r2 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %b)
  %r = fadd fast double %r1, %r2
  ret double %r
}

; %r1 has a second use (the fmul). Both selectors are expected to keep the two
; reductions separate.
define float @fadd_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fadd_reduct_reassoc_v4f32_extrause:
; CHECK:       // %bb.0:
; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-NEXT:    faddp v1.4s, v1.4s, v1.4s
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    faddp s1, v1.2s
; CHECK-NEXT:    fadd s1, s0, s1
; CHECK-NEXT:    fmul s0, s1, s0
; CHECK-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %r = fadd fast float %r1, %r2
  %p = fmul float %r, %r1
  ret float %p
}

; Declarations of the reduction intrinsics used by the tests above.
declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>)
declare half @llvm.vector.reduce.fadd.f16.v16f16(half, <16 x half>)
declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)