; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc --mtriple=aarch64-eabi < %s -global-isel=false | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc --mtriple=aarch64-eabi < %s -global-isel=true | FileCheck %s --check-prefixes=CHECK,CHECK-GI

define float @add_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: add_f32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: fadd v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: faddp v0.4s, v0.4s, v0.4s
; CHECK-SD-NEXT: faddp s0, v0.2s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_f32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: faddp v1.4s, v2.4s, v2.4s
; CHECK-GI-NEXT: faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NEXT: faddp s1, v1.2s
; CHECK-GI-NEXT: faddp s0, v0.2s
; CHECK-GI-NEXT: fadd s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %r = fadd fast float %r1, %r2
  ret float %r
}

define float @add_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: add_f32_same:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: faddp v0.4s, v0.4s, v0.4s
; CHECK-SD-NEXT: faddp s0, v0.2s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_f32_same:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NEXT: faddp v1.4s, v1.4s, v1.4s
; CHECK-GI-NEXT: faddp s0, v0.2s
; CHECK-GI-NEXT: faddp s1, v1.2s
; CHECK-GI-NEXT: fadd s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %r = fadd fast float %r1, %r2
  ret float %r
}

define float @fmul_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmul_f32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fmul_f32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: mov d3, v2.d[1]
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT: fmul v1.2s, v2.2s, v3.2s
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov s3, v1.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s2
; CHECK-GI-NEXT: fmul s1, s1, s3
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}

define float @fmul_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmul_f32_same:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fmul_f32_same:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov s3, v1.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s2
; CHECK-GI-NEXT: fmul s1, s1, s3
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}

define float @fmin_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmin_f32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fminnm v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: fminnm v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: fminnmv s0, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fmin_f32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fminnm v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: fminnmv s1, v2.4s
; CHECK-GI-NEXT: fminnmv s0, v0.4s
; CHECK-GI-NEXT: fminnm s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
  %r = call float @llvm.minnum.f32(float %r1, float %r2)
  ret float %r
}

define float @fmin_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmin_f32_same:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fminnm v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: fminnmv s0, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fmin_f32_same:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fminnmv s0, v0.4s
; CHECK-GI-NEXT: fminnmv s1, v1.4s
; CHECK-GI-NEXT: fminnm s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
  %r = call float @llvm.minnum.f32(float %r1, float %r2)
  ret float %r
}

define float @fmax_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmax_f32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: fmaxnm v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: fmaxnmv s0, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fmax_f32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: fmaxnmv s1, v2.4s
; CHECK-GI-NEXT: fmaxnmv s0, v0.4s
; CHECK-GI-NEXT: fmaxnm s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %r = call float @llvm.maxnum.f32(float %r1, float %r2)
  ret float %r
}

define float @fmax_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmax_f32_same:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmaxnm v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: fmaxnmv s0, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fmax_f32_same:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmaxnmv s0, v0.4s
; CHECK-GI-NEXT: fmaxnmv s1, v1.4s
; CHECK-GI-NEXT: fmaxnm s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %r = call float @llvm.maxnum.f32(float %r1, float %r2)
  ret float %r
}

define float @fminimum_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fminimum_f32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: fmin v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: fminv s0, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fminimum_f32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmin v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: fminv s1, v2.4s
; CHECK-GI-NEXT: fminv s0, v0.4s
; CHECK-GI-NEXT: fmin s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %a)
  %r2 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %b)
  %r = call float @llvm.minimum.f32(float %r1, float %r2)
  ret float %r
}

define float @fminimum_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fminimum_f32_same:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: fminv s0, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fminimum_f32_same:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fminv s0, v0.4s
; CHECK-GI-NEXT: fminv s1, v1.4s
; CHECK-GI-NEXT: fmin s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %b)
  %r = call float @llvm.minimum.f32(float %r1, float %r2)
  ret float %r
}

define float @fmaximum_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmaximum_f32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: fmax v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: fmaxv s0, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fmaximum_f32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmax v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: fmaxv s1, v2.4s
; CHECK-GI-NEXT: fmaxv s0, v0.4s
; CHECK-GI-NEXT: fmax s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %b)
  %r = call float @llvm.maximum.f32(float %r1, float %r2)
  ret float %r
}

define float @fmaximum_f32_same(<4 x float> %a, <4 x float> %b) {
; CHECK-SD-LABEL: fmaximum_f32_same:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: fmaxv s0, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fmaximum_f32_same:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmaxv s0, v0.4s
; CHECK-GI-NEXT: fmaxv s1, v1.4s
; CHECK-GI-NEXT: fmax s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %b)
  %r = call float @llvm.maximum.f32(float %r1, float %r2)
  ret float %r
}

; These next two tests have incorrect minnum/minimum combinations
define float @fminimumnum_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fminimumnum_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fminv s0, v0.4s
; CHECK-NEXT: fminv s1, v1.4s
; CHECK-NEXT: fminnm s0, s0, s1
; CHECK-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %b)
  %r = call float @llvm.minnum.f32(float %r1, float %r2)
  ret float %r
}

define float @fmaxnumimum_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmaxnumimum_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fmaxnmv s0, v0.4s
; CHECK-NEXT: fmaxnmv s1, v1.4s
; CHECK-NEXT: fmax s0, s0, s1
; CHECK-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %r = call float @llvm.maximum.f32(float %r1, float %r2)
  ret float %r
}


define i32 @add_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: add_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: addv s1, v2.4s
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b)
  %r = add i32 %r1, %r2
  ret i32 %r
}

define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-LABEL: add_ext_i16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: uaddlp v1.8h, v1.16b
; CHECK-SD-NEXT: uadalp v1.8h, v0.16b
; CHECK-SD-NEXT: addv h0, v1.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_ext_i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: uaddlv h1, v1.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
  %ae = zext <16 x i8> %a to <16 x i16>
  %be = zext <16 x i8> %b to <16 x i16>
  %r1 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %ae)
  %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
  %r = add i16 %r1, %r2
  ret i16 %r
}

define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
; CHECK-SD-LABEL: add_ext_v32i16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: uaddl2 v3.8h, v0.16b, v1.16b
; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT: add v0.8h, v0.8h, v3.8h
; CHECK-SD-NEXT: uadalp v0.8h, v2.16b
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_ext_v32i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: uaddlv h1, v1.16b
; CHECK-GI-NEXT: uaddlv h2, v2.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: fmov w9, s2
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
  %ae = zext <32 x i8> %a to <32 x i16>
  %be = zext <16 x i8> %b to <16 x i16>
  %r1 = call i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16> %ae)
  %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
  %r = add i16 %r1, %r2
  ret i16 %r
}

define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: mul_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: mul v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: mov w8, v0.s[1]
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: mul w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mul_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d3, v0.d[1]
; CHECK-GI-NEXT: mov d4, v1.d[1]
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v3.2s
; CHECK-GI-NEXT: mul v1.2s, v1.2s, v4.2s
; CHECK-GI-NEXT: mov d3, v2.d[1]
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT: mul v1.2s, v2.2s, v3.2s
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: fmov w10, s0
; CHECK-GI-NEXT: mov w9, v1.s[1]
; CHECK-GI-NEXT: mul w8, w10, w8
; CHECK-GI-NEXT: fmov w10, s1
; CHECK-GI-NEXT: mul w9, w10, w9
; CHECK-GI-NEXT: mul w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b)
  %r = mul i32 %r1, %r2
  ret i32 %r
}

define i32 @mul_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: mul_i32_same:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: mov w8, v0.s[1]
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: mul w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mul_i32_same:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: mul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: mov w9, v1.s[1]
; CHECK-GI-NEXT: fmov w10, s0
; CHECK-GI-NEXT: fmov w11, s1
; CHECK-GI-NEXT: mul w8, w10, w8
; CHECK-GI-NEXT: mul w9, w11, w9
; CHECK-GI-NEXT: mul w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b)
  %r = mul i32 %r1, %r2
  ret i32 %r
}

define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: and_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: lsr x9, x8, #32
; CHECK-SD-NEXT: and w0, w8, w9
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: and_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: mov d1, v2.d[1]
; CHECK-GI-NEXT: mov d3, v0.d[1]
; CHECK-GI-NEXT: and v1.8b, v2.8b, v1.8b
; CHECK-GI-NEXT: and v0.8b, v0.8b, v3.8b
; CHECK-GI-NEXT: mov w8, v1.s[1]
; CHECK-GI-NEXT: fmov w10, s1
; CHECK-GI-NEXT: mov w9, v0.s[1]
; CHECK-GI-NEXT: fmov w11, s0
; CHECK-GI-NEXT: and w8, w10, w8
; CHECK-GI-NEXT: and w8, w11, w8
; CHECK-GI-NEXT: and w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
  %r = and i32 %r1, %r2
  ret i32 %r
}

define i32 @and_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: and_i32_same:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: lsr x9, x8, #32
; CHECK-SD-NEXT: and w0, w8, w9
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: and_i32_same:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT: and v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: mov w9, v1.s[1]
; CHECK-GI-NEXT: fmov w10, s0
; CHECK-GI-NEXT: fmov w11, s1
; CHECK-GI-NEXT: and w8, w10, w8
; CHECK-GI-NEXT: and w9, w11, w9
; CHECK-GI-NEXT: and w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
  %r = and i32 %r1, %r2
  ret i32 %r
}

define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: or_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: lsr x9, x8, #32
; CHECK-SD-NEXT: orr w0, w8, w9
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: or_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: mov d1, v2.d[1]
; CHECK-GI-NEXT: mov d3, v0.d[1]
; CHECK-GI-NEXT: orr v1.8b, v2.8b, v1.8b
; CHECK-GI-NEXT: orr v0.8b, v0.8b, v3.8b
; CHECK-GI-NEXT: mov w8, v1.s[1]
; CHECK-GI-NEXT: fmov w10, s1
; CHECK-GI-NEXT: mov w9, v0.s[1]
; CHECK-GI-NEXT: fmov w11, s0
; CHECK-GI-NEXT: orr w8, w10, w8
; CHECK-GI-NEXT: orr w8, w11, w8
; CHECK-GI-NEXT: orr w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
  %r = or i32 %r1, %r2
  ret i32 %r
}

define i32 @or_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: or_i32_same:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: lsr x9, x8, #32
; CHECK-SD-NEXT: orr w0, w8, w9
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: or_i32_same:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: orr v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT: orr v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: mov w9, v1.s[1]
; CHECK-GI-NEXT: fmov w10, s0
; CHECK-GI-NEXT: fmov w11, s1
; CHECK-GI-NEXT: orr w8, w10, w8
; CHECK-GI-NEXT: orr w9, w11, w9
; CHECK-GI-NEXT: orr w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
  %r = or i32 %r1, %r2
  ret i32 %r
}

define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: xor_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: eor v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: lsr x9, x8, #32
; CHECK-SD-NEXT: eor w0, w8, w9
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: xor_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: mov d1, v2.d[1]
; CHECK-GI-NEXT: mov d3, v0.d[1]
; CHECK-GI-NEXT: eor v1.8b, v2.8b, v1.8b
; CHECK-GI-NEXT: eor v0.8b, v0.8b, v3.8b
; CHECK-GI-NEXT: mov w8, v1.s[1]
; CHECK-GI-NEXT: fmov w10, s1
; CHECK-GI-NEXT: mov w9, v0.s[1]
; CHECK-GI-NEXT: fmov w11, s0
; CHECK-GI-NEXT: eor w8, w10, w8
; CHECK-GI-NEXT: eor w8, w11, w8
; CHECK-GI-NEXT: eor w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
  %r = xor i32 %r1, %r2
  ret i32 %r
}

define i32 @xor_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: xor_i32_same:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: lsr x9, x8, #32
; CHECK-SD-NEXT: eor w0, w8, w9
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: xor_i32_same:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: eor v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT: eor v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: mov w9, v1.s[1]
; CHECK-GI-NEXT: fmov w10, s0
; CHECK-GI-NEXT: fmov w11, s1
; CHECK-GI-NEXT: eor w8, w10, w8
; CHECK-GI-NEXT: eor w9, w11, w9
; CHECK-GI-NEXT: eor w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
  %r = xor i32 %r1, %r2
  ret i32 %r
}

define i32 @umin_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: umin_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: umin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: umin v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: uminv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: umin_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: umin v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: uminv s1, v2.4s
; CHECK-GI-NEXT: uminv s0, v0.4s
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: fcsel s0, s0, s1, lo
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @umin_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: umin_i32_same:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: umin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: uminv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: umin_i32_same:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: uminv s0, v0.4s
; CHECK-GI-NEXT: uminv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: fcsel s0, s0, s1, lo
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @umax_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: umax_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: umax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: umax v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: umaxv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: umax_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: umax v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: umaxv s1, v2.4s
; CHECK-GI-NEXT: umaxv s0, v0.4s
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: fcsel s0, s0, s1, hi
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @umax_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: umax_i32_same:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: umax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: umaxv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: umax_i32_same:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: umaxv s0, v0.4s
; CHECK-GI-NEXT: umaxv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: fcsel s0, s0, s1, hi
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @smin_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: smin_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: smin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: smin v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: sminv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: smin_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: smin v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: sminv s1, v2.4s
; CHECK-GI-NEXT: sminv s0, v0.4s
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: fcsel s0, s0, s1, lt
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @smin_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: smin_i32_same:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: smin v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: sminv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: smin_i32_same:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: sminv s0, v0.4s
; CHECK-GI-NEXT: sminv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: fcsel s0, s0, s1, lt
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: smax_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: smax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: smax v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: smaxv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: smax_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: smax v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: smaxv s1, v2.4s
; CHECK-GI-NEXT: smaxv s0, v0.4s
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: fcsel s0, s0, s1, gt
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @smax_i32_same(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: smax_i32_same:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: smax v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: smaxv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: smax_i32_same:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: smaxv s0, v0.4s
; CHECK-GI-NEXT: smaxv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: fcsel s0, s0, s1, gt
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}


define float @nested_fadd_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-SD-LABEL: nested_fadd_f32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: faddp v1.4s, v1.4s, v1.4s
; CHECK-SD-NEXT: faddp v0.4s, v0.4s, v0.4s
; CHECK-SD-NEXT: faddp s1, v1.2s
; CHECK-SD-NEXT: faddp s0, v0.2s
; CHECK-SD-NEXT: fadd s1, s1, s3
; CHECK-SD-NEXT: fadd s0, s0, s2
; CHECK-SD-NEXT: fadd s0, s0, s1
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_fadd_f32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: faddp v0.4s, v0.4s, v0.4s
; CHECK-GI-NEXT: faddp v1.4s, v1.4s, v1.4s
; CHECK-GI-NEXT: faddp s0, v0.2s
; CHECK-GI-NEXT: faddp s1, v1.2s
; CHECK-GI-NEXT: fadd s0, s0, s2
; CHECK-GI-NEXT: fadd s1, s1, s3
; CHECK-GI-NEXT: fadd s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %a1 = fadd fast float %r1, %c
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %a2 = fadd fast float %r2, %d
  %r = fadd fast float %a1, %a2
  ret float %r
}

define float @nested_fadd_f32_slow(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-SD-LABEL: nested_fadd_f32_slow:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mov s4, v1.s[2]
; CHECK-SD-NEXT: mov s5, v0.s[2]
; CHECK-SD-NEXT: faddp s6, v0.2s
; CHECK-SD-NEXT: faddp s7, v1.2s
; CHECK-SD-NEXT: mov s1, v1.s[3]
; CHECK-SD-NEXT: mov s0, v0.s[3]
; CHECK-SD-NEXT: fadd s5, s6, s5
; CHECK-SD-NEXT: fadd s4, s7, s4
; CHECK-SD-NEXT: fadd s0, s5, s0
; CHECK-SD-NEXT: fadd s1, s4, s1
; CHECK-SD-NEXT: fadd s0, s0, s2
; CHECK-SD-NEXT: fadd s1, s1, s3
; CHECK-SD-NEXT: fadd s0, s0, s1
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_fadd_f32_slow:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov s4, v0.s[2]
; CHECK-GI-NEXT: faddp s5, v0.2s
; CHECK-GI-NEXT: mov s6, v1.s[2]
; CHECK-GI-NEXT: faddp s7, v1.2s
; CHECK-GI-NEXT: mov s0, v0.s[3]
; CHECK-GI-NEXT: mov s1, v1.s[3]
; CHECK-GI-NEXT: fadd s4, s5, s4
; CHECK-GI-NEXT: fadd s5, s7, s6
; CHECK-GI-NEXT: fadd s0, s4, s0
; CHECK-GI-NEXT: fadd s1, s5, s1
; CHECK-GI-NEXT: fadd s0, s0, s2
; CHECK-GI-NEXT: fadd s1, s1, s3
; CHECK-GI-NEXT: fadd s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %a1 = fadd float %r1, %c
  %r2 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %a2 = fadd float %r2, %d
  %r = fadd float %a1, %a2
  ret float %r
}

define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-SD-LABEL: nested_mul_f32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ext v4.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT: ext v5.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: fmul v1.2s, v1.2s, v4.2s
; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v5.2s
; CHECK-SD-NEXT: fmul s1, s1, v1.s[1]
; CHECK-SD-NEXT: fmul s0, s0, v0.s[1]
; CHECK-SD-NEXT: fmul s1, s1, s3
; CHECK-SD-NEXT: fmul s0, s0, s2
; CHECK-SD-NEXT: fmul s0, s0, s1
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_mul_f32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d4, v0.d[1]
; CHECK-GI-NEXT: mov d5, v1.d[1]
; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v4.2s
; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v5.2s
; CHECK-GI-NEXT: mov s4, v0.s[1]
; CHECK-GI-NEXT: mov s5, v1.s[1]
; CHECK-GI-NEXT: fmul s0, s0, s4
; CHECK-GI-NEXT: fmul s1, s1, s5
; CHECK-GI-NEXT: fmul s0, s0, s2
; CHECK-GI-NEXT: fmul s1, s1, s3
; CHECK-GI-NEXT: fmul s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %a1 = fmul fast float %r1, %c
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %a2 = fmul fast float %r2, %d
  %r = fmul fast float %a1, %a2
  ret float %r
}

define i32 @nested_add_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_add_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: addv s1, v1.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w8, s1
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: add w9, w9, w0
; CHECK-SD-NEXT: add w8, w8, w1
; CHECK-SD-NEXT: add w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_add_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: addv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w8, w0
; CHECK-GI-NEXT: add w9, w9, w1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %r2, %d
  %r = add i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_add_c1_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_add_c1_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: addv s1, v1.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w8, s1
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: add w9, w0, w9
; CHECK-SD-NEXT: add w8, w8, w1
; CHECK-SD-NEXT: add w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_add_c1_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: addv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w0, w8
; CHECK-GI-NEXT: add w9, w9, w1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %c, %r1
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %r2, %d
  %r = add i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_add_c2_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_add_c2_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: addv s1, v1.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w8, s1
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: add w9, w9, w0
; CHECK-SD-NEXT: add w8, w1, w8
; CHECK-SD-NEXT: add w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_add_c2_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: addv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w8, w0
; CHECK-GI-NEXT: add w9, w1, w9
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %a1 = add i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %a2 = add i32 %d, %r2
  %r = add i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_add_manyreduct_i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; CHECK-SD-LABEL: nested_add_manyreduct_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: add v1.4s, v1.4s, v3.4s
; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_add_manyreduct_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: addv s2, v2.4s
; CHECK-GI-NEXT: addv s1, v1.4s
; CHECK-GI-NEXT: addv s3, v3.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s2
; CHECK-GI-NEXT: fmov w10, s1
; CHECK-GI-NEXT: fmov w11, s3
; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: add w9, w10, w11
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %r3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %c)
  %a1 = add i32 %r1, %r3
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %r4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %d)
  %a2 = add i32 %r2, %r4
  %r = add i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_mul_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_mul_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT: mul v0.2s, v0.2s, v3.2s
; CHECK-SD-NEXT: mul v1.2s, v1.2s, v2.2s
; CHECK-SD-NEXT: mov w8, v0.s[1]
; CHECK-SD-NEXT: fmov w10, s0
; CHECK-SD-NEXT: mov w9, v1.s[1]
; CHECK-SD-NEXT: mul w8, w10, w8
; CHECK-SD-NEXT: fmov w10, s1
; CHECK-SD-NEXT: mul w9, w10, w9
; CHECK-SD-NEXT: mul w8, w8, w0
; CHECK-SD-NEXT: mul w9, w9, w1
; CHECK-SD-NEXT: mul w0, w8, w9
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_mul_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: mul v1.2s, v1.2s, v3.2s
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: fmov w10, s0
; CHECK-GI-NEXT: mov w9, v1.s[1]
; CHECK-GI-NEXT: mul w8, w10, w8
; CHECK-GI-NEXT: fmov w10, s1
; CHECK-GI-NEXT: mul w9, w10, w9
; CHECK-GI-NEXT: mul w8, w8, w0
; CHECK-GI-NEXT: mul w9, w9, w1
; CHECK-GI-NEXT: mul w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %a)
  %a1 = mul i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %b)
  %a2 = mul i32 %r2, %d
  %r = mul i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_and_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_and_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-SD-NEXT: and v0.8b, v0.8b, v3.8b
; CHECK-SD-NEXT: fmov x8, d1
; CHECK-SD-NEXT: fmov x9, d0
; CHECK-SD-NEXT: lsr x10, x9, #32
; CHECK-SD-NEXT: lsr x11, x8, #32
; CHECK-SD-NEXT: and w9, w9, w0
; CHECK-SD-NEXT: and w8, w8, w1
; CHECK-SD-NEXT: and w9, w9, w10
; CHECK-SD-NEXT: and w8, w8, w11
; CHECK-SD-NEXT: and w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_and_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT: and v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: mov w9, v1.s[1]
; CHECK-GI-NEXT: fmov w10, s0
; CHECK-GI-NEXT: fmov w11, s1
; CHECK-GI-NEXT: and w10, w10, w0
; CHECK-GI-NEXT: and w11, w11, w1
; CHECK-GI-NEXT: and w8, w10, w8
; CHECK-GI-NEXT: and w9, w11, w9
; CHECK-GI-NEXT: and w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
  %a1 = and i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %b)
  %a2 = and i32 %r2, %d
  %r = and i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_or_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_or_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: orr v1.8b, v1.8b, v2.8b
; CHECK-SD-NEXT: orr v0.8b, v0.8b, v3.8b
; CHECK-SD-NEXT: fmov x8, d1
; CHECK-SD-NEXT: fmov x9, d0
; CHECK-SD-NEXT: lsr x10, x9, #32
; CHECK-SD-NEXT: lsr x11, x8, #32
; CHECK-SD-NEXT: orr w9, w9, w0
; CHECK-SD-NEXT: orr w8, w8, w1
; CHECK-SD-NEXT: orr w9, w9, w10
; CHECK-SD-NEXT: orr w8, w8, w11
; CHECK-SD-NEXT: orr w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_or_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: orr v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT: orr v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: mov w9, v1.s[1]
; CHECK-GI-NEXT: fmov w10, s0
; CHECK-GI-NEXT: fmov w11, s1
; CHECK-GI-NEXT: orr w10, w10, w0
; CHECK-GI-NEXT: orr w11, w11, w1
; CHECK-GI-NEXT: orr w8, w10, w8
; CHECK-GI-NEXT: orr w9, w11, w9
; CHECK-GI-NEXT: orr w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
  %a1 = or i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %b)
  %a2 = or i32 %r2, %d
  %r = or i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_xor_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_xor_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECK-SD-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-SD-NEXT: eor v1.8b, v1.8b, v2.8b
; CHECK-SD-NEXT: eor v0.8b, v0.8b, v3.8b
; CHECK-SD-NEXT: fmov x8, d1
; CHECK-SD-NEXT: fmov x9, d0
; CHECK-SD-NEXT: lsr x10, x9, #32
; CHECK-SD-NEXT: lsr x11, x8, #32
; CHECK-SD-NEXT: eor w9, w9, w0
; CHECK-SD-NEXT: eor w8, w8, w1
; CHECK-SD-NEXT: eor w9, w9, w10
; CHECK-SD-NEXT: eor w8, w8, w11
; CHECK-SD-NEXT: eor w0, w9, w8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_xor_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d3, v1.d[1]
; CHECK-GI-NEXT: eor v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT: eor v1.8b, v1.8b, v3.8b
; CHECK-GI-NEXT: mov w8, v0.s[1]
; CHECK-GI-NEXT: mov w9, v1.s[1]
; CHECK-GI-NEXT: fmov w10, s0
; CHECK-GI-NEXT: fmov w11, s1
; CHECK-GI-NEXT: eor w10, w10, w0
; CHECK-GI-NEXT: eor w11, w11, w1
; CHECK-GI-NEXT: eor w8, w10, w8
; CHECK-GI-NEXT: eor w9, w11, w9
; CHECK-GI-NEXT: eor w0, w8, w9
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
  %a1 = xor i32 %r1, %c
  %r2 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %b)
  %a2 = xor i32 %r2, %d
  %r = xor i32 %a1, %a2
  ret i32 %r
}

define i32 @nested_smin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_smin_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sminv s0, v0.4s
; CHECK-SD-NEXT: sminv s1, v1.4s
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: fmov w8, s1
; CHECK-SD-NEXT: cmp w9, w0
; CHECK-SD-NEXT: csel w9, w9, w0, lt
; CHECK-SD-NEXT: cmp w8, w1
; CHECK-SD-NEXT: csel w8, w8, w1, lt
; CHECK-SD-NEXT: cmp w9, w8
; CHECK-SD-NEXT: csel w0, w9, w8, lt
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_smin_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: sminv s0, v0.4s
; CHECK-GI-NEXT: sminv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: cmp w8, w0
; CHECK-GI-NEXT: csel w8, w8, w0, lt
; CHECK-GI-NEXT: cmp w9, w1
; CHECK-GI-NEXT: csel w9, w9, w1, lt
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: csel w0, w8, w9, lt
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.smin.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.smin.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.smin.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

define i32 @nested_smax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_smax_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: smaxv s0, v0.4s
; CHECK-SD-NEXT: smaxv s1, v1.4s
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: fmov w8, s1
; CHECK-SD-NEXT: cmp w9, w0
; CHECK-SD-NEXT: csel w9, w9, w0, gt
; CHECK-SD-NEXT: cmp w8, w1
; CHECK-SD-NEXT: csel w8, w8, w1, gt
; CHECK-SD-NEXT: cmp w9, w8
; CHECK-SD-NEXT: csel w0, w9, w8, gt
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_smax_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: smaxv s0, v0.4s
; CHECK-GI-NEXT: smaxv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: cmp w8, w0
; CHECK-GI-NEXT: csel w8, w8, w0, gt
; CHECK-GI-NEXT: cmp w9, w1
; CHECK-GI-NEXT: csel w9, w9, w1, gt
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: csel w0, w8, w9, gt
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.smax.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.smax.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.smax.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

define i32 @nested_umin_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_umin_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: uminv s0, v0.4s
; CHECK-SD-NEXT: uminv s1, v1.4s
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: fmov w8, s1
; CHECK-SD-NEXT: cmp w9, w0
; CHECK-SD-NEXT: csel w9, w9, w0, lo
; CHECK-SD-NEXT: cmp w8, w1
; CHECK-SD-NEXT: csel w8, w8, w1, lo
; CHECK-SD-NEXT: cmp w9, w8
; CHECK-SD-NEXT: csel w0, w9, w8, lo
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_umin_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: uminv s0, v0.4s
; CHECK-GI-NEXT: uminv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: cmp w8, w0
; CHECK-GI-NEXT: csel w8, w8, w0, lo
; CHECK-GI-NEXT: cmp w9, w1
; CHECK-GI-NEXT: csel w9, w9, w1, lo
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: csel w0, w8, w9, lo
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.umin.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.umin.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.umin.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

define i32 @nested_umax_i32(<4 x i32> %a, <4 x i32> %b, i32 %c, i32 %d) {
; CHECK-SD-LABEL: nested_umax_i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: umaxv s0, v0.4s
; CHECK-SD-NEXT: umaxv s1, v1.4s
; CHECK-SD-NEXT: fmov w9, s0
; CHECK-SD-NEXT: fmov w8, s1
; CHECK-SD-NEXT: cmp w9, w0
; CHECK-SD-NEXT: csel w9, w9, w0, hi
; CHECK-SD-NEXT: cmp w8, w1
; CHECK-SD-NEXT: csel w8, w8, w1, hi
; CHECK-SD-NEXT: cmp w9, w8
; CHECK-SD-NEXT: csel w0, w9, w8, hi
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_umax_i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: umaxv s0, v0.4s
; CHECK-GI-NEXT: umaxv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: cmp w8, w0
; CHECK-GI-NEXT: csel w8, w8, w0, hi
; CHECK-GI-NEXT: cmp w9, w1
; CHECK-GI-NEXT: csel w9, w9, w1, hi
; CHECK-GI-NEXT: cmp w8, w9
; CHECK-GI-NEXT: csel w0, w8, w9, hi
; CHECK-GI-NEXT: ret
  %r1 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
  %a1 = call i32 @llvm.umax.i32(i32 %r1, i32 %c)
  %r2 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %b)
  %a2 = call i32 @llvm.umax.i32(i32 %r2, i32 %d)
  %r = call i32 @llvm.umax.i32(i32 %a1, i32 %a2)
  ret i32 %r
}

define float @nested_fmin_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-SD-LABEL: nested_fmin_float:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fminnmv s1, v1.4s
; CHECK-SD-NEXT: fminnmv s0, v0.4s
; CHECK-SD-NEXT: fminnm s1, s1, s3
; CHECK-SD-NEXT: fminnm s0, s0, s2
; CHECK-SD-NEXT: fminnm s0, s0, s1
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_fmin_float:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fminnmv s0, v0.4s
; CHECK-GI-NEXT: fminnmv s1, v1.4s
; CHECK-GI-NEXT: fminnm s0, s0, s2
; CHECK-GI-NEXT: fminnm s1, s1, s3
; CHECK-GI-NEXT: fminnm s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
  %a1 = call float @llvm.minnum.f32(float %r1, float %c)
  %r2 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
  %a2 = call float @llvm.minnum.f32(float %r2, float %d)
  %r = call float @llvm.minnum.f32(float %a1, float %a2)
  ret float %r
}

define float @nested_fmax_float(<4 x float> %a, <4 x float> %b, float %c, float %d) {
; CHECK-SD-LABEL: nested_fmax_float:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmaxnmv s1, v1.4s
; CHECK-SD-NEXT: fmaxnmv s0, v0.4s
; CHECK-SD-NEXT: fmaxnm s1, s1, s3
; CHECK-SD-NEXT: fmaxnm s0, s0, s2
; CHECK-SD-NEXT: fmaxnm s0, s0, s1
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: nested_fmax_float:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmaxnmv s0, v0.4s
; CHECK-GI-NEXT: fmaxnmv s1, v1.4s
; CHECK-GI-NEXT: fmaxnm s0, s0, s2
; CHECK-GI-NEXT: fmaxnm s1, s1, s3
; CHECK-GI-NEXT: fmaxnm s0, s0, s1
; CHECK-GI-NEXT: ret
  %r1 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  %a1 = call float @llvm.maxnum.f32(float %r1, float %c)
  %r2 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %a2 = call float @llvm.maxnum.f32(float %r2, float %d)
  %r = call float @llvm.maxnum.f32(float %a1, float %a2)
  ret float %r
}


declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fminimum.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fminimum.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmaximum.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmaximum.v4f32(<4 x float>)
declare i32 @llvm.vector.reduce.add.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)
declare i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16>)
declare i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32>)
declare float @llvm.minnum.f32(float, float)
declare float @llvm.maxnum.f32(float, float)
declare float @llvm.minimum.f32(float, float)
declare float @llvm.maximum.f32(float, float)
declare i32 @llvm.umin.i32(i32, i32)
declare i32 @llvm.umax.i32(i32, i32)
declare i32 @llvm.smin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)