; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16

; AArch64 code generation for llvm.vector.reduce.fmul calls carrying the
; 'fast' flag. The four llc invocations above cover the default selector and
; -global-isel, each with and without +fullfp16. The expected assembly is
; autogenerated; regenerate it with the script named on the first line rather
; than editing it by hand.

; <2 x float> input, start value 1.0.
define float @mul_HalfS(<2 x float> %bin.rdx)  {
; CHECK-SD-LABEL: mul_HalfS:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: mul_HalfS:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    mov s1, v0.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx)
  ret float %r
}

; <4 x half> input, start value 1.0.
define half @mul_HalfH(<4 x half> %bin.rdx)  {
; CHECK-SD-NOFP16-LABEL: mul_HalfH:
; CHECK-SD-NOFP16:       // %bb.0:
; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s1, s2, s1
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[3]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s0, s1, s0
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    ret
;
; CHECK-SD-FP16-LABEL: mul_HalfH:
; CHECK-SD-FP16:       // %bb.0:
; CHECK-SD-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-FP16-NEXT:    fmul h1, h0, v0.h[1]
; CHECK-SD-FP16-NEXT:    fmul h1, h1, v0.h[2]
; CHECK-SD-FP16-NEXT:    fmul h0, h1, v0.h[3]
; CHECK-SD-FP16-NEXT:    ret
;
; CHECK-GI-NOFP16-LABEL: mul_HalfH:
; CHECK-GI-NOFP16:       // %bb.0:
; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
; CHECK-GI-NOFP16-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    ret
;
; CHECK-GI-FP16-LABEL: mul_HalfH:
; CHECK-GI-FP16:       // %bb.0:
; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
; CHECK-GI-FP16-NEXT:    mov h3, v0.h[3]
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
; CHECK-GI-FP16-NEXT:    fmul h1, h2, h3
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
; CHECK-GI-FP16-NEXT:    ret
  %r = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx)
  ret half %r
}

; <8 x half> input, start value 1.0.
define half @mul_H(<8 x half> %bin.rdx)  {
; CHECK-SD-NOFP16-LABEL: mul_H:
; CHECK-SD-NOFP16:       // %bb.0:
; CHECK-SD-NOFP16-NEXT:    mov h1, v0.h[1]
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h0
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s1, s2, s1
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[2]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[3]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[4]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[5]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v0.h[6]
; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s1, s1, s2
; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fmul s0, s1, s0
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    ret
;
; CHECK-SD-FP16-LABEL: mul_H:
; CHECK-SD-FP16:       // %bb.0:
; CHECK-SD-FP16-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
; CHECK-SD-FP16-NEXT:    fmul h1, h0, v0.h[1]
; CHECK-SD-FP16-NEXT:    fmul h1, h1, v0.h[2]
; CHECK-SD-FP16-NEXT:    fmul h0, h1, v0.h[3]
; CHECK-SD-FP16-NEXT:    ret
;
; CHECK-GI-NOFP16-LABEL: mul_H:
; CHECK-GI-NOFP16:       // %bb.0:
; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v0.4h
; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v1.4s, v0.4s
; CHECK-GI-NOFP16-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    ret
;
; CHECK-GI-FP16-LABEL: mul_H:
; CHECK-GI-FP16:       // %bb.0:
; CHECK-GI-FP16-NEXT:    mov d1, v0.d[1]
; CHECK-GI-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
; CHECK-GI-FP16-NEXT:    mov h3, v0.h[3]
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
; CHECK-GI-FP16-NEXT:    fmul h1, h2, h3
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
; CHECK-GI-FP16-NEXT:    ret
  %r = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %bin.rdx)
  ret half %r
}

; <4 x float> input, start value 1.0.
define float @mul_S(<4 x float> %bin.rdx)  {
; CHECK-SD-LABEL: mul_S:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: mul_S:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT:    mov s1, v0.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %bin.rdx)
  ret float %r
}

; <2 x double> input, start value 1.0. All four configurations share one set
; of expected output.
define double @mul_D(<2 x double> %bin.rdx)  {
; CHECK-LABEL: mul_D:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmul d0, d0, v0.d[1]
; CHECK-NEXT:    ret
  %r = call fast double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %bin.rdx)
  ret double %r
}

; <16 x half> input, start value 1.0.
define half @mul_2H(<16 x half> %bin.rdx)  {
; CHECK-SD-NOFP16-LABEL: mul_2H:
; CHECK-SD-NOFP16:       // %bb.0:
; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
; CHECK-SD-NOFP16-NEXT:    fmul v2.4s, v3.4s, v2.4s
; CHECK-SD-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v2.4s
; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[1]
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h1
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s2, s0
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[2]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[3]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[4]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[5]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[6]
; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s1
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    ret
;
; CHECK-SD-FP16-LABEL: mul_2H:
; CHECK-SD-FP16:       // %bb.0:
; CHECK-SD-FP16-NEXT:    fmul v0.8h, v0.8h, v1.8h
; CHECK-SD-FP16-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
; CHECK-SD-FP16-NEXT:    fmul h1, h0, v0.h[1]
; CHECK-SD-FP16-NEXT:    fmul h1, h1, v0.h[2]
; CHECK-SD-FP16-NEXT:    fmul h0, h1, v0.h[3]
; CHECK-SD-FP16-NEXT:    ret
;
; CHECK-GI-NOFP16-LABEL: mul_2H:
; CHECK-GI-NOFP16:       // %bb.0:
; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v2.4s, v0.4s
; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v3.4s, v1.4s
; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-GI-NOFP16-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    ret
;
; CHECK-GI-FP16-LABEL: mul_2H:
; CHECK-GI-FP16:       // %bb.0:
; CHECK-GI-FP16-NEXT:    fmul v0.8h, v0.8h, v1.8h
; CHECK-GI-FP16-NEXT:    mov d1, v0.d[1]
; CHECK-GI-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
; CHECK-GI-FP16-NEXT:    mov h3, v0.h[3]
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
; CHECK-GI-FP16-NEXT:    fmul h1, h2, h3
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
; CHECK-GI-FP16-NEXT:    ret
  %r = call fast half @llvm.vector.reduce.fmul.f16.v16f16(half 1.0, <16 x half> %bin.rdx)
  ret half %r
}

; <8 x float> input, start value 1.0.
define float @mul_2S(<8 x float> %bin.rdx)  {
; CHECK-SD-LABEL: mul_2S:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: mul_2S:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT:    mov s1, v0.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %bin.rdx)
  ret float %r
}

; <4 x double> input, start value 1.0. All four configurations share one set
; of expected output.
define double @mul_2D(<4 x double> %bin.rdx)  {
; CHECK-LABEL: mul_2D:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmul v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    fmul d0, d0, v0.d[1]
; CHECK-NEXT:    ret
  %r = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %bin.rdx)
  ret double %r
}

; At least one test where the start value is not 1.0: here it is 42.0, and the
; expected output multiplies the reduced value by that constant, materialized
; through w8 (0x42280000).
define float @mul_S_init_42(<4 x float> %bin.rdx)  {
; CHECK-SD-LABEL: mul_S_init_42:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    mov w8, #1109917696 // =0x42280000
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmov s1, w8
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    fmul s0, s0, s1
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: mul_S_init_42:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NEXT:    mov w8, #1109917696 // =0x42280000
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT:    mov s1, v0.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    fmov s1, w8
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 42.0, <4 x float> %bin.rdx)
  ret float %r
}

; The remaining tests multiply the results of two fast reductions together.

; Two <8 x half> reductions combined with a fast fmul.
define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-SD-NOFP16-LABEL: fmul_reduct_reassoc_v8f16:
; CHECK-SD-NOFP16:       // %bb.0:
; CHECK-SD-NOFP16-NEXT:    fcvtl v2.4s, v1.4h
; CHECK-SD-NOFP16-NEXT:    fcvtl v3.4s, v0.4h
; CHECK-SD-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
; CHECK-SD-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
; CHECK-SD-NOFP16-NEXT:    fmul v2.4s, v3.4s, v2.4s
; CHECK-SD-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NOFP16-NEXT:    fcvtn v1.4h, v2.4s
; CHECK-SD-NOFP16-NEXT:    fcvtn2 v1.8h, v0.4s
; CHECK-SD-NOFP16-NEXT:    mov h0, v1.h[1]
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h1
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s2, s0
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[2]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[3]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[4]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[5]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    mov h2, v1.h[6]
; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
; CHECK-SD-NOFP16-NEXT:    fmul s0, s0, s1
; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
; CHECK-SD-NOFP16-NEXT:    ret
;
; CHECK-SD-FP16-LABEL: fmul_reduct_reassoc_v8f16:
; CHECK-SD-FP16:       // %bb.0:
; CHECK-SD-FP16-NEXT:    fmul v0.8h, v0.8h, v1.8h
; CHECK-SD-FP16-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
; CHECK-SD-FP16-NEXT:    fmul h1, h0, v0.h[1]
; CHECK-SD-FP16-NEXT:    fmul h1, h1, v0.h[2]
; CHECK-SD-FP16-NEXT:    fmul h0, h1, v0.h[3]
; CHECK-SD-FP16-NEXT:    ret
;
; CHECK-GI-NOFP16-LABEL: fmul_reduct_reassoc_v8f16:
; CHECK-GI-NOFP16:       // %bb.0:
; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v0.4h
; CHECK-GI-NOFP16-NEXT:    fcvtl2 v0.4s, v0.8h
; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v1.4h
; CHECK-GI-NOFP16-NEXT:    fcvtl2 v1.4s, v1.8h
; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v2.4s, v0.4s
; CHECK-GI-NOFP16-NEXT:    fmul v1.4s, v3.4s, v1.4s
; CHECK-GI-NOFP16-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NOFP16-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NOFP16-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[1]
; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[1]
; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s2
; CHECK-GI-NOFP16-NEXT:    fmul s1, s1, s3
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    fcvt h1, s1
; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
; CHECK-GI-NOFP16-NEXT:    fcvt s1, h1
; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
; CHECK-GI-NOFP16-NEXT:    ret
;
; CHECK-GI-FP16-LABEL: fmul_reduct_reassoc_v8f16:
; CHECK-GI-FP16:       // %bb.0:
; CHECK-GI-FP16-NEXT:    mov d2, v0.d[1]
; CHECK-GI-FP16-NEXT:    mov d3, v1.d[1]
; CHECK-GI-FP16-NEXT:    fmul v0.4h, v0.4h, v2.4h
; CHECK-GI-FP16-NEXT:    fmul v1.4h, v1.4h, v3.4h
; CHECK-GI-FP16-NEXT:    mov h2, v0.h[1]
; CHECK-GI-FP16-NEXT:    mov h3, v0.h[2]
; CHECK-GI-FP16-NEXT:    mov h4, v0.h[3]
; CHECK-GI-FP16-NEXT:    mov h5, v1.h[1]
; CHECK-GI-FP16-NEXT:    mov h6, v1.h[2]
; CHECK-GI-FP16-NEXT:    mov h7, v1.h[3]
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h2
; CHECK-GI-FP16-NEXT:    fmul h2, h3, h4
; CHECK-GI-FP16-NEXT:    fmul h1, h1, h5
; CHECK-GI-FP16-NEXT:    fmul h3, h6, h7
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h2
; CHECK-GI-FP16-NEXT:    fmul h1, h1, h3
; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
; CHECK-GI-FP16-NEXT:    ret
  %r1 = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %a)
  %r2 = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %b)
  %r = fmul fast half %r1, %r2
  ret half %r
}

; Two <8 x float> reductions combined with a fast fmul.
define float @fmul_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) {
; CHECK-SD-LABEL: fmul_reduct_reassoc_v8f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmul v2.4s, v2.4s, v3.4s
; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmul_reduct_reassoc_v8f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    fmul v1.4s, v2.4s, v3.4s
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT:    mov s2, v0.s[1]
; CHECK-GI-NEXT:    mov s3, v1.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s2
; CHECK-GI-NEXT:    fmul s1, s1, s3
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}

; Two <4 x float> reductions combined with a fast fmul.
define float @fmul_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT:    mov s2, v0.s[1]
; CHECK-GI-NEXT:    mov s3, v1.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s2
; CHECK-GI-NEXT:    fmul s1, s1, s3
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}

; As above, but the first reduction takes its start value from the argument %i
; instead of the constant 1.0.
define float @fmul_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32_init:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-SD-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
; CHECK-SD-NEXT:    fmul s1, s1, v1.s[1]
; CHECK-SD-NEXT:    fmul v2.2s, v2.2s, v3.2s
; CHECK-SD-NEXT:    fmul s0, s0, s1
; CHECK-SD-NEXT:    fmul s1, s2, v2.s[1]
; CHECK-SD-NEXT:    fmul s0, s0, s1
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32_init:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT:    mov d3, v2.d[1]
; CHECK-GI-NEXT:    mov s4, v1.s[1]
; CHECK-GI-NEXT:    fmul v2.2s, v2.2s, v3.2s
; CHECK-GI-NEXT:    fmul s1, s1, s4
; CHECK-GI-NEXT:    mov s3, v2.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    fmul s1, s2, s3
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %i, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}

; Operands of different widths: a <4 x float> and an <8 x float> reduction.
define float @fmul_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) {
; CHECK-SD-LABEL: fmul_reduct_reassoc_v4v8f32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmul v1.4s, v1.4s, v2.4s
; CHECK-SD-NEXT:    fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmul_reduct_reassoc_v4v8f32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmul v1.4s, v1.4s, v2.4s
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT:    mov s2, v0.s[1]
; CHECK-GI-NEXT:    mov s3, v1.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s2
; CHECK-GI-NEXT:    fmul s1, s1, s3
; CHECK-GI-NEXT:    fmul s0, s0, s1
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}

; Two <4 x double> reductions combined with a fast fmul.
define double @fmul_reduct_reassoc_v4f64(<4 x double> %a, <4 x double> %b) {
; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f64:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    fmul v2.2d, v2.2d, v3.2d
; CHECK-SD-NEXT:    fmul v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT:    fmul v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    fmul d0, d0, v0.d[1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f64:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    fmul v0.2d, v0.2d, v1.2d
; CHECK-GI-NEXT:    fmul v1.2d, v2.2d, v3.2d
; CHECK-GI-NEXT:    fmul d0, d0, v0.d[1]
; CHECK-GI-NEXT:    fmul d1, d1, v1.d[1]
; CHECK-GI-NEXT:    fmul d0, d0, d1
; CHECK-GI-NEXT:    ret
  %r1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a)
  %r2 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %b)
  %r = fmul fast double %r1, %r2
  ret double %r
}

; %r1 has a second use: besides the fast fmul with %r2, it feeds a final fmul
; that carries no fast-math flags.
define float @fmul_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32_extrause:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT:    fmul v0.2s, v0.2s, v2.2s
; CHECK-SD-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT:    fmul s1, s1, v1.s[1]
; CHECK-SD-NEXT:    fmul s1, s0, s1
; CHECK-SD-NEXT:    fmul s0, s1, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32_extrause:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    mov d2, v0.d[1]
; CHECK-GI-NEXT:    mov d3, v1.d[1]
; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT:    mov s2, v0.s[1]
; CHECK-GI-NEXT:    mov s3, v1.s[1]
; CHECK-GI-NEXT:    fmul s0, s0, s2
; CHECK-GI-NEXT:    fmul s1, s1, s3
; CHECK-GI-NEXT:    fmul s1, s0, s1
; CHECK-GI-NEXT:    fmul s0, s1, s0
; CHECK-GI-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %r = fmul fast float %r1, %r2
  %p = fmul float %r, %r1
  ret float %p
}

; Declarations of the reduction intrinsics used by the tests above.
declare half @llvm.vector.reduce.fmul.f16.v4f16(half, <4 x half>)
declare half @llvm.vector.reduce.fmul.f16.v8f16(half, <8 x half>)
declare half @llvm.vector.reduce.fmul.f16.v16f16(half, <16 x half>)
declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)