; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST

; Verify that we correctly fold horizontal binops even in the presence of UNDEFs.

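; The -SLOW/-FAST prefixes split on the fast-hops attribute: without it the
; backend often prefers a shuffle + (v)addss/(v)addps sequence over the
; horizontal-add instruction, while fast-hops keeps the hadd form. As a
; hedged illustration only (this is not one of the checked tests), the basic
; pattern every test below builds on is the extract/extract/fadd/insert quad:
;
;   %e0 = extractelement <4 x float> %a, i32 0
;   %e1 = extractelement <4 x float> %a, i32 1
;   %s01 = fadd float %e0, %e1
;   %v = insertelement <4 x float> undef, float %s01, i32 0
;
; which corresponds to lane 0 of (v)haddps; UNDEF lanes in the result leave
; the backend free to fill the remaining lanes however is convenient.
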
define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test1_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test1_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext10 = extractelement <4 x float> %b, i32 2
  %vecext11 = extractelement <4 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit5, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext6 = extractelement <4 x float> %b, i32 0
  %vecext7 = extractelement <4 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %b, i32 2
  %vecext11 = extractelement <4 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test3_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test3_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %b, i32 0
  %vecext7 = extractelement <4 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  ret <4 x float> %vecinit9
}

define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
; SSE-SLOW-LABEL: test4_undef:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-SLOW-NEXT: addss %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: test4_undef:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm0, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: test4_undef:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: test4_undef:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  ret <4 x float> %vecinit
}

define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
; SSE-SLOW-LABEL: test5_undef:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: test5_undef:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: test5_undef:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: test5_undef:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %vecext = extractelement <2 x double> %a, i32 0
  %vecext1 = extractelement <2 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  ret <2 x double> %vecinit
}

define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test6_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test6_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  ret <4 x float> %vecinit5
}

define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test7_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test7_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %b, i32 0
  %vecext1 = extractelement <4 x float> %b, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 2
  %vecext2 = extractelement <4 x float> %b, i32 2
  %vecext3 = extractelement <4 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
  ret <4 x float> %vecinit5
}

define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
; SSE-SLOW-LABEL: test8_undef:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-SLOW-NEXT: addss %xmm0, %xmm1
; SSE-SLOW-NEXT: movaps %xmm0, %xmm2
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-SLOW-NEXT: addss %xmm2, %xmm0
; SSE-SLOW-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: test8_undef:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm0, %xmm0
; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,1]
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: test8_undef:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: test8_undef:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX-FAST-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %a, i32 2
  %vecext3 = extractelement <4 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 2
  ret <4 x float> %vecinit5
}

define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test9_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test9_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %a, i32 0
  %vecext1 = extractelement <4 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %b, i32 2
  %vecext3 = extractelement <4 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 3
  ret <4 x float> %vecinit5
}

define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test10_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test10_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %b, i32 2
  %vecext3 = extractelement <8 x float> %b, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 3
  ret <8 x float> %vecinit5
}

define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
; SSE-SLOW-LABEL: test11_undef:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-SLOW-NEXT: addss %xmm1, %xmm0
; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE-SLOW-NEXT: addss %xmm3, %xmm1
; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: test11_undef:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: movaps %xmm3, %xmm1
; SSE-FAST-NEXT: haddps %xmm0, %xmm0
; SSE-FAST-NEXT: haddps %xmm3, %xmm1
; SSE-FAST-NEXT: retq
;
; AVX-LABEL: test11_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %b, i32 4
  %vecext3 = extractelement <8 x float> %b, i32 5
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 6
  ret <8 x float> %vecinit5
}

define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test12_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test12_undef:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  ret <8 x float> %vecinit5
}

define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test13_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test13_undef:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add1 = fadd float %vecext, %vecext1
  %vecinit1 = insertelement <8 x float> undef, float %add1, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add2 = fadd float %vecext2, %vecext3
  %vecinit2 = insertelement <8 x float> %vecinit1, float %add2, i32 1
  %vecext4 = extractelement <8 x float> %a, i32 4
  %vecext5 = extractelement <8 x float> %a, i32 5
  %add3 = fadd float %vecext4, %vecext5
  %vecinit3 = insertelement <8 x float> %vecinit2, float %add3, i32 2
  %vecext6 = extractelement <8 x float> %a, i32 6
  %vecext7 = extractelement <8 x float> %a, i32 7
  %add4 = fadd float %vecext6, %vecext7
  %vecinit4 = insertelement <8 x float> %vecinit3, float %add4, i32 3
  ret <8 x float> %vecinit4
}

define <16 x float> @test13_v16f32_undef(<16 x float> %a, <16 x float> %b) {
; SSE-LABEL: test13_v16f32_undef:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test13_v16f32_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: test13_v16f32_undef:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX512-SLOW-LABEL: test13_v16f32_undef:
; AVX512-SLOW: # %bb.0:
; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2
; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX512-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm2
; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0
; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX512-SLOW-NEXT: retq
  %vecext = extractelement <16 x float> %a, i32 0
  %vecext1 = extractelement <16 x float> %a, i32 1
  %add1 = fadd float %vecext, %vecext1
  %vecinit1 = insertelement <16 x float> undef, float %add1, i32 0
  %vecext2 = extractelement <16 x float> %a, i32 2
  %vecext3 = extractelement <16 x float> %a, i32 3
  %add2 = fadd float %vecext2, %vecext3
  %vecinit2 = insertelement <16 x float> %vecinit1, float %add2, i32 1
  %vecext4 = extractelement <16 x float> %a, i32 4
  %vecext5 = extractelement <16 x float> %a, i32 5
  %add3 = fadd float %vecext4, %vecext5
  %vecinit3 = insertelement <16 x float> %vecinit2, float %add3, i32 2
  %vecext6 = extractelement <16 x float> %a, i32 6
  %vecext7 = extractelement <16 x float> %a, i32 7
  %add4 = fadd float %vecext6, %vecext7
  %vecinit4 = insertelement <16 x float> %vecinit3, float %add4, i32 3
  ret <16 x float> %vecinit4
}
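
; The add_pd/add_ps tests below build the pair-sum from shufflevector plus a
; vector fadd instead of scalar extract/insert chains. As a hedged sketch of
; the equivalence (not itself a checked test): in add_pd_003 the operands are
; { undef, x0 } + { x0, x1 }, so only the high result lane is meaningful and
; it equals x0 + x1, which haddpd %xmm0, %xmm0 produces in both lanes.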
define <2 x double> @add_pd_003(<2 x double> %x) {
; SSE-SLOW-LABEL: add_pd_003:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
; SSE-SLOW-NEXT: addpd %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: add_pd_003:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: add_pd_003:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: add_pd_003:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
  %add = fadd <2 x double> %l, %x
  ret <2 x double> %add
}

; Change shuffle mask - no undefs.

define <2 x double> @add_pd_003_2(<2 x double> %x) {
; SSE-SLOW-LABEL: add_pd_003_2:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
; SSE-SLOW-NEXT: addpd %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: add_pd_003_2:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: add_pd_003_2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: add_pd_003_2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %add = fadd <2 x double> %l, %x
  ret <2 x double> %add
}

define <2 x double> @add_pd_010(<2 x double> %x) {
; SSE-LABEL: add_pd_010:
; SSE: # %bb.0:
; SSE-NEXT: haddpd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-SLOW-LABEL: add_pd_010:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: add_pd_010:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
  %add = fadd <2 x double> %l, %x
  %shuffle2 = shufflevector <2 x double> %add, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  ret <2 x double> %shuffle2
}

define <4 x float> @add_ps_007(<4 x float> %x) {
; SSE-LABEL: add_ps_007:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: add_ps_007:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = fadd <4 x float> %l, %r
  ret <4 x float> %add
}

define <4 x float> @add_ps_030(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_030:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSE-SLOW-NEXT: addps %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: add_ps_030:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm0, %xmm0
; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: add_ps_030:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: add_ps_030:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX-FAST-NEXT: retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = fadd <4 x float> %l, %r
  %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
  ret <4 x float> %shuffle2
}

define <4 x float> @add_ps_007_2(<4 x float> %x) {
; SSE-LABEL: add_ps_007_2:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: add_ps_007_2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = fadd <4 x float> %l, %r
  ret <4 x float> %add
}

define <4 x float> @add_ps_008(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_008:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
; SSE-SLOW-NEXT: addps %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: add_ps_008:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm0, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: add_ps_008:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: add_ps_008:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = fadd <4 x float> %l, %x
  ret <4 x float> %add
}

define <4 x float> @add_ps_016(<4 x float> %0, <4 x float> %1) {
; SSE-LABEL: add_ps_016:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,3]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: add_ps_016:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm1, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,3]
; AVX-NEXT: retq
  %3 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> <i32 0, i32 6>
  %4 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> <i32 1, i32 7>
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
  %7 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
  %8 = fadd <4 x float> %7, %1
  %9 = shufflevector <4 x float> %6, <4 x float> %8, <4 x i32> <i32 6, i32 1, i32 2, i32 undef>
  ret <4 x float> %9
}

define <4 x float> @add_ps_017(<4 x float> %x) {
; SSE-SLOW-LABEL: add_ps_017:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,2,2]
; SSE-SLOW-NEXT: addps %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: add_ps_017:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm0, %xmm0
; SSE-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: add_ps_017:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: add_ps_017:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = fadd <4 x float> %l, %x
  %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  ret <4 x float> %shuffle2
}

define <4 x float> @add_ps_018(<4 x float> %x) {
; SSE-LABEL: add_ps_018:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: add_ps_018:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: add_ps_018:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX1-FAST-NEXT: retq
;
; AVX512-LABEL: add_ps_018:
; AVX512: # %bb.0:
; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vbroadcastss %xmm0, %xmm0
; AVX512-NEXT: retq
  %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = fadd <4 x float> %l, %r
  %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x float> %shuffle2
}

define <4 x double> @add_pd_011(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: add_pd_011:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movapd %xmm2, %xmm1
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-SLOW-NEXT: movapd %xmm0, %xmm3
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
; SSE-SLOW-NEXT: addpd %xmm3, %xmm0
; SSE-SLOW-NEXT: addpd %xmm2, %xmm1
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: add_pd_011:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: movapd %xmm2, %xmm1
; SSE-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE-FAST-NEXT: haddpd %xmm2, %xmm1
; SSE-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: add_pd_011:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: add_pd_011:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: retq
;
; AVX512-LABEL: add_pd_011:
; AVX512: # %bb.0:
; AVX512-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX512-NEXT: retq
  %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 0, i32 undef, i32 4, i32 undef>
  %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 1, i32 undef, i32 5, i32 undef>
  %5 = fadd <4 x double> %3, %4
  %6 = shufflevector <4 x double> %5, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
  ret <4 x double> %6
}

define <4 x float> @v8f32_inputs_v4f32_output_0101(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: v8f32_inputs_v4f32_output_0101:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: v8f32_inputs_v4f32_output_0101:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %a0 = extractelement <8 x float> %a, i32 0
  %a1 = extractelement <8 x float> %a, i32 1
  %b0 = extractelement <8 x float> %b, i32 0
  %b1 = extractelement <8 x float> %b, i32 1
  %add0 = fadd float %a0, %a1
  %add2 = fadd float %b0, %b1
  %r0 = insertelement <4 x float> undef, float %add0, i32 0
  %r = insertelement <4 x float> %r0, float %add2, i32 2
  ret <4 x float> %r
}

define <4 x float> @v8f32_input0_v4f32_output_0123(<8 x float> %a, <4 x float> %b) {
; SSE-LABEL: v8f32_input0_v4f32_output_0123:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: v8f32_input0_v4f32_output_0123:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %a0 = extractelement <8 x float> %a, i32 0
  %a1 = extractelement <8 x float> %a, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  %add0 = fadd float %a0, %a1
  %add3 = fadd float %b2, %b3
  %r0 = insertelement <4 x float> undef, float %add0, i32 0
  %r = insertelement <4 x float> %r0, float %add3, i32 3
  ret <4 x float> %r
}

define <4 x float> @v8f32_input1_v4f32_output_2301(<4 x float> %a, <8 x float> %b) {
; SSE-LABEL: v8f32_input1_v4f32_output_2301:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: v8f32_input1_v4f32_output_2301:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %b0 = extractelement <8 x float> %b, i32 0
  %b1 = extractelement <8 x float> %b, i32 1
  %add1 = fadd float %a2, %a3
  %add2 = fadd float %b0, %b1
  %r1 = insertelement <4 x float> undef, float %add1, i32 1
  %r = insertelement <4 x float> %r1, float %add2, i32 2
  ret <4 x float> %r
}

define <4 x float> @v8f32_inputs_v4f32_output_2323(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: v8f32_inputs_v4f32_output_2323:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: v8f32_inputs_v4f32_output_2323:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %a2 = extractelement <8 x float> %a, i32 2
  %a3 = extractelement <8 x float> %a, i32 3
  %b2 = extractelement <8 x float> %b, i32 2
  %b3 = extractelement <8 x float> %b, i32 3
  %add1 = fadd float %a2, %a3
  %add3 = fadd float %b2, %b3
  %r1 = insertelement <4 x float> undef, float %add1, i32 1
  %r = insertelement <4 x float> %r1, float %add3, i32 3
  ret <4 x float> %r
}

define <4 x float> @v16f32_inputs_v4f32_output_0123(<16 x float> %a, <16 x float> %b) {
; SSE-LABEL: v16f32_inputs_v4f32_output_0123:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX512-LABEL: v16f32_inputs_v4f32_output_0123:
; AVX512: # %bb.0:
; AVX512-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %a0 = extractelement <16 x float> %a, i32 0
  %a1 = extractelement <16 x float> %a, i32 1
  %b2 = extractelement <16 x float> %b, i32 2
  %b3 = extractelement <16 x float> %b, i32 3
  %add0 = fadd float %a0, %a1
  %add3 = fadd float %b2, %b3
  %r0 = insertelement <4 x float> undef, float %add0, i32 0
  %r = insertelement <4 x float> %r0, float %add3, i32 3
  ret <4 x float> %r
}

define <8 x float> @v16f32_inputs_v8f32_output_4567(<16 x float> %a, <16 x float> %b) {
; SSE-LABEL: v16f32_inputs_v8f32_output_4567:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm5, %xmm1
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vhaddps %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: retq
;
; AVX512-LABEL: v16f32_inputs_v8f32_output_4567:
; AVX512: # %bb.0:
; AVX512-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
  %a4 = extractelement <16 x float> %a, i32 4
  %a5 = extractelement <16 x float> %a, i32 5
  %b6 = extractelement <16 x float> %b, i32 6
  %b7 = extractelement <16 x float> %b, i32 7
  %add4 = fadd float %a4, %a5
  %add7 = fadd float %b6, %b7
  %r4 = insertelement <8 x float> undef, float %add4, i32 4
  %r = insertelement <8 x float> %r4, float %add7, i32 7
  ret <8 x float> %r
}

define <8 x float> @PR40243(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: PR40243:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: PR40243:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a4 = extractelement <8 x float> %a, i32 4
  %a5 = extractelement <8 x float> %a, i32 5
  %add4 = fadd float %a4, %a5
  %b6 = extractelement <8 x float> %b, i32 6
  %b7 = extractelement <8 x float> %b, i32 7
  %add7 = fadd float %b6, %b7
  %r4 = insertelement <8 x float> undef, float %add4, i32 4
  %r = insertelement <8 x float> %r4, float %add7, i32 7
  ret <8 x float> %r
}

define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR44694:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE-SLOW-NEXT: haddpd %xmm3, %xmm2
; SSE-SLOW-NEXT: addpd %xmm1, %xmm0
; SSE-SLOW-NEXT: movapd %xmm2, %xmm1
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR44694:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: movapd %xmm1, %xmm0
; SSE-FAST-NEXT: haddpd %xmm3, %xmm2
; SSE-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE-FAST-NEXT: movapd %xmm2, %xmm1
; SSE-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: PR44694:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT: vhaddpd %ymm0, %ymm1, %ymm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: PR44694:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT: vhaddpd %ymm0, %ymm1, %ymm0
; AVX1-FAST-NEXT: retq
;
; AVX512-LABEL: PR44694:
; AVX512: # %bb.0:
; AVX512-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT: retq
  %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %5 = fadd <4 x double> %3, %4
  ret <4 x double> %5
}

define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind {
; SSE-SLOW-LABEL: PR45747_1:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2]
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-SLOW-NEXT: addps %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR45747_1:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm0, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR45747_1:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR45747_1:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %t0 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
  %t1 = fadd <4 x float> %t0, %a
  %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x float> %shuffle
}

define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind {
; SSE-SLOW-LABEL: PR45747_2:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-SLOW-NEXT: addps %xmm1, %xmm0
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR45747_2:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm1, %xmm1
; SSE-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR45747_2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,1,1]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR45747_2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %t0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 3, i32 undef>
  %t1 = fadd <4 x float> %t0, %b
  %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  ret <4 x float> %shuffle
}

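; The PR34724 tests are named for which result lanes are defined ('u' marks
; an undef lane). As a hedged sketch (not itself a checked test), the _u123
; variant assembles { undef, a2+a3, b0+b1, b2+b3 }, which is a single
; (v)haddps with lane 0 left undef.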
define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) {
; SSE-LABEL: PR34724_add_v4f32_u123:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR34724_add_v4f32_u123:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %3 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 4>
  %4 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 3, i32 5>
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
  %7 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %8 = fadd <4 x float> %7, %1
  %9 = shufflevector <4 x float> %6, <4 x float> %8, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
  ret <4 x float> %9
}

define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f32_0u23:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: movaps %xmm0, %xmm2
; SSE-SLOW-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE-SLOW-NEXT: addps %xmm2, %xmm0
; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSE-SLOW-NEXT: addps %xmm1, %xmm2
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR34724_add_v4f32_0u23:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddps %xmm1, %xmm0
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f32_0u23:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,1],xmm1[0,3]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,2]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR34724_add_v4f32_0u23:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %3 = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %4 = fadd <4 x float> %3, %0
  %5 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %6 = fadd <4 x float> %5, %1
  %7 = shufflevector <4 x float> %4, <4 x float> %6, <4 x i32> <i32 0, i32 undef, i32 4, i32 undef>
  %8 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %9 = fadd <4 x float> %8, %1
  %10 = shufflevector <4 x float> %7, <4 x float> %9, <4 x i32> <i32 0, i32 undef, i32 2, i32 7>
  ret <4 x float> %10
}

define <4 x float> @PR34724_add_v4f32_01u3(<4 x float> %0, <4 x float> %1) {
; SSE-LABEL: PR34724_add_v4f32_01u3:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR34724_add_v4f32_01u3:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %3 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %4 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %7 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %8 = fadd <4 x float> %7, %1
  %9 = shufflevector <4 x float> %6, <4 x float> %8, <4 x i32> <i32 0, i32 1, i32 undef, i32 7>
  ret <4 x float> %9
}

define <4 x float> @PR34724_add_v4f32_012u(<4 x float> %0, <4 x float> %1) {
; SSE-LABEL: PR34724_add_v4f32_012u:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR34724_add_v4f32_012u:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %3 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %4 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %5 = fadd <2 x float> %3, %4
  %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %7 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %8 = fadd <4 x float> %7, %1
  %9 = shufflevector <4 x float> %6, <4 x float> %8, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  ret <4 x float> %9
}

define <4 x double> @PR34724_add_v4f64_u123(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f64_u123:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: haddpd %xmm2, %xmm1
; SSE-SLOW-NEXT: movapd %xmm3, %xmm2
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSE-SLOW-NEXT: addsd %xmm3, %xmm2
; SSE-SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR34724_add_v4f64_u123:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: movapd %xmm1, %xmm0
; SSE-FAST-NEXT: haddpd %xmm3, %xmm2
; SSE-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE-FAST-NEXT: movapd %xmm2, %xmm1
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f64_u123:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR34724_add_v4f64_u123:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX-FAST-NEXT: vhaddpd %ymm0, %ymm1, %ymm0
; AVX-FAST-NEXT: retq
  %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 2, i32 4>
  %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 3, i32 5>
  %5 = fadd <2 x double> %3, %4
  %6 = extractelement <2 x double> %5, i32 0
  %7 = insertelement <4 x double> undef, double %6, i32 1
  %8 = extractelement <2 x double> %5, i32 1
  %9 = insertelement <4 x double> %7, double %8, i32 2
  %10 = extractelement <4 x double> %1, i32 2
  %11 = extractelement <4 x double> %1, i32 3
  %12 = fadd double %10, %11
  %13 = insertelement <4 x double> %9, double %12, i32 3
  ret <4 x double> %13
}

define <4 x double> @PR34724_add_v4f64_0u23(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f64_0u23:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: haddpd %xmm2, %xmm0
; SSE-SLOW-NEXT: movapd %xmm3, %xmm2
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSE-SLOW-NEXT: addsd %xmm3, %xmm2
; SSE-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR34724_add_v4f64_0u23:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: movapd %xmm2, %xmm1
; SSE-FAST-NEXT: haddpd %xmm2, %xmm0
; SSE-FAST-NEXT: haddpd %xmm3, %xmm1
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f64_0u23:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR34724_add_v4f64_0u23:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX-FAST-NEXT: retq
  %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 0, i32 4>
  %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 1, i32 5>
  %5 = fadd <2 x double> %3, %4
  %6 = extractelement <2 x double> %5, i32 0
  %7 = insertelement <4 x double> undef, double %6, i32 0
  %8 = extractelement <2 x double> %5, i32 1
  %9 = insertelement <4 x double> %7, double %8, i32 2
  %10 = extractelement <4 x double> %1, i32 2
  %11 = extractelement <4 x double> %1, i32 3
  %12 = fadd double %10, %11
  %13 = insertelement <4 x double> %9, double %12, i32 3
  ret <4 x double> %13
}

define <4 x double> @PR34724_add_v4f64_01u3(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f64_01u3:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: haddpd %xmm1, %xmm0
; SSE-SLOW-NEXT: movapd %xmm3, %xmm1
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSE-SLOW-NEXT: addsd %xmm3, %xmm1
; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR34724_add_v4f64_01u3:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE-FAST-NEXT: haddpd %xmm3, %xmm3
; SSE-FAST-NEXT: movapd %xmm3, %xmm1
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f64_01u3:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: PR34724_add_v4f64_01u3:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: retq
;
; AVX512-FAST-LABEL: PR34724_add_v4f64_01u3:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX512-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
; AVX512-FAST-NEXT: retq
  %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 1, i32 3>
  %5 = fadd <2 x double> %3, %4
  %6 = extractelement <2 x double> %5, i32 0
  %7 = insertelement <4 x double> undef, double %6, i32 0
  %8 = extractelement <2 x double> %5, i32 1
  %9 = insertelement <4 x double> %7, double %8, i32 1
  %10 = extractelement <4 x double> %1, i32 2
  %11 = extractelement <4 x double> %1, i32 3
  %12 = fadd double %10, %11
  %13 = insertelement <4 x double> %9, double %12, i32 3
  ret <4 x double> %13
}

define <4 x double> @PR34724_add_v4f64_012u(<4 x double> %0, <4 x double> %1) {
; SSE-SLOW-LABEL: PR34724_add_v4f64_012u:
; SSE-SLOW: # %bb.0:
; SSE-SLOW-NEXT: haddpd %xmm1, %xmm0
; SSE-SLOW-NEXT: movapd %xmm2, %xmm1
; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-SLOW-NEXT: addsd %xmm2, %xmm1
; SSE-SLOW-NEXT: retq
;
; SSE-FAST-LABEL: PR34724_add_v4f64_012u:
; SSE-FAST: # %bb.0:
; SSE-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE-FAST-NEXT: haddpd %xmm2, %xmm2
; SSE-FAST-NEXT: movapd %xmm2, %xmm1
; SSE-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR34724_add_v4f64_012u:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR34724_add_v4f64_012u:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX-FAST-NEXT: retq
  %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 1, i32 3>
  %5 = fadd <2 x double> %3, %4
  %6 = extractelement <2 x double> %5, i32 0
  %7 = insertelement <4 x double> undef, double %6, i32 0
  %8 = extractelement <2 x double> %5, i32 1
  %9 = insertelement <4 x double> %7, double %8, i32 1
  %10 = extractelement <4 x double> %1, i32 0
  %11 = extractelement <4 x double> %1, i32 1
  %12 = fadd double %10, %11
  %13 = insertelement <4 x double> %9, double %12, i32 2
  ret <4 x double> %13
}