; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
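
; Editorial summary (not autogenerated): this file checks formation of the
; SSE3/AVX horizontal add/sub instructions (haddpd/haddps/hsubpd/hsubps and
; their VEX forms) from two kinds of IR: shuffle+fadd/fsub vector patterns
; and extract+extract+binop scalar patterns, with default tuning (*-SLOW)
; and with the fast-hops attribute (*-FAST).
;
; The canonical vector pattern splits the concatenation of the two sources
; into even and odd elements and adds them, exactly as in @haddpd1 below:
;   %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
;   %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
;   %r = fadd <2 x double> %a, %b   ; -> haddpd %xmm1, %xmm0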

define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddpd2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
  %b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @haddpd3(<2 x double> %x) {
; SSE3-SLOW-LABEL: haddpd3:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addpd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddpd3:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: haddpd3:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: haddpd3:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps3(<4 x float> %x) {
; SSE3-LABEL: haddps3:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps3:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps4(<4 x float> %x) {
; SSE3-LABEL: haddps4:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps4:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps5(<4 x float> %x) {
; SSE3-LABEL: haddps5:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps5:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps6(<4 x float> %x) {
; SSE3-SLOW-LABEL: haddps6:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddps6:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: haddps6:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: haddps6:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps7(<4 x float> %x) {
; SSE3-LABEL: haddps7:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps7:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: hsubpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  %r = fsub <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @hsubpd2(<2 x double> %x) {
; SSE3-SLOW-LABEL: hsubpd2:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subpd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hsubpd2:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hsubpd2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hsubpd2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %r = fsub <2 x double> %a, %b
  ret <2 x double> %r
}

define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: hsubps1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps2(<4 x float> %x) {
; SSE3-LABEL: hsubps2:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps2:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps3(<4 x float> %x) {
; SSE3-LABEL: hsubps3:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps3:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps4(<4 x float> %x) {
; SSE3-SLOW-LABEL: hsubps4:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hsubps4:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hsubps4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hsubps4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}
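
; Editorial note: the 256-bit horizontal ops operate within each 128-bit
; lane, so the shuffle masks in the following tests are lane-local. In
; @vhaddps1, for example, the even-element shuffle uses the mask
; <0,2,8,10,4,6,12,14>: 0,2 and 8,10 pick the low lanes of %x and %y, while
; 4,6 and 12,14 pick the high lanes.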

define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm2, %xmm0
; SSE3-NEXT: haddps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm2, %xmm0
; SSE3-NEXT: haddps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
  %b = shufflevector <8 x float> %y, <8 x float> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhaddps3(<8 x float> %x) {
; SSE3-LABEL: vhaddps3:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: haddps %xmm1, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps3:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhsubps1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm2, %xmm0
; SSE3-NEXT: hsubps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubps1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %r = fsub <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhsubps3(<8 x float> %x) {
; SSE3-LABEL: vhsubps3:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: hsubps %xmm1, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubps3:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = fsub <8 x float> %a, %b
  ret <8 x float> %r
}

define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhaddpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm2, %xmm0
; SSE3-NEXT: haddpd %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %r = fadd <4 x double> %a, %b
  ret <4 x double> %r
}

define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhsubpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubpd %xmm2, %xmm0
; SSE3-NEXT: hsubpd %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %r = fsub <4 x double> %a, %b
  ret <4 x double> %r
}

define <2 x float> @haddps_v2f32(<4 x float> %v0) {
; SSE3-LABEL: haddps_v2f32:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps_v2f32:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %v0.0 = extractelement <4 x float> %v0, i32 0
  %v0.1 = extractelement <4 x float> %v0, i32 1
  %v0.2 = extractelement <4 x float> %v0, i32 2
  %v0.3 = extractelement <4 x float> %v0, i32 3
  %op0 = fadd float %v0.0, %v0.1
  %op1 = fadd float %v0.2, %v0.3
  %res0 = insertelement <2 x float> undef, float %op0, i32 0
  %res1 = insertelement <2 x float> %res0, float %op1, i32 1
  ret <2 x float> %res1
}

; 128-bit vectors, float/double, fadd/fsub
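
; Editorial note: the tests below exercise the scalar form of the same idea:
; extract two adjacent elements, combine them with a scalar fadd/fsub, and
; form a horizontal op when profitable (always with fast-hops), e.g. in
; @extract_extract01_v4f32_fadd_f32:
;   %x0 = extractelement <4 x float> %x, i32 0
;   %x1 = extractelement <4 x float> %x, i32 1
;   %x01 = fadd float %x0, %x1   ; fast-hops: haddps %xmm0, %xmm0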

define float @extract_extract01_v4f32_fadd_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v4f32_fadd_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fadd_f32_commute(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v4f32_fadd_f32_commute(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v2f64_fadd_f64(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v2f64_fadd_f64_commute(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract01_v4f32_fsub_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v4f32_fsub_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: subss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fsub_f32_commute(<4 x float> %x) {
; SSE3-LABEL: extract_extract01_v4f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v4f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v4f32_fsub_f32_commute(<4 x float> %x) {
; SSE3-LABEL: extract_extract23_v4f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT: subss %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract23_v4f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v2f64_fsub_f64(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v2f64_fsub_f64_commute(<2 x double> %x) {
; SSE3-LABEL: extract_extract01_v2f64_fsub_f64_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v2f64_fsub_f64_commute:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; 256-bit vectors, float/double, fadd/fsub
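
; Editorial note: for 256-bit sources, the two adjacent elements still live
; in one 128-bit half, so codegen reduces to the xmm patterns above: the
; high half is extracted first when needed (vextractf128), and vzeroupper is
; emitted before returning because a ymm register was used.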

define float @extract_extract01_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract67_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 6
  %x1 = extractelement <8 x float> %x, i32 7
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract67_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 6
  %x1 = extractelement <8 x float> %x, i32 7
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movapd %xmm1, %xmm0
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 2
  %x1 = extractelement <4 x double> %x, i32 3
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v4f64_fadd_f64_commute(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movapd %xmm1, %xmm0
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 2
  %x1 = extractelement <4 x double> %x, i32 3
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract01_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: subss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract45_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSE3-FAST-NEXT: hsubps %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 4
  %x1 = extractelement <8 x float> %x, i32 5
  %x01 = fsub float %x0, %x1
  ret float %x01
}

; Negative test...or get hoppy and negate?
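; (fsub does not commute, so hsub cannot be used for the reversed operand
; order below without also negating the result.)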

define float @extract_extract01_v8f32_fsub_f32_commute(<8 x float> %x) {
; SSE3-LABEL: extract_extract01_v8f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v8f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v4f64_fsub_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

; Negative test...or get hoppy and negate?

define double @extract_extract01_v4f64_fsub_f64_commute(<4 x double> %x) {
; SSE3-LABEL: extract_extract01_v4f64_fsub_f64_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v4f64_fsub_f64_commute:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; 512-bit vectors, float/double, fadd/fsub

define float @extract_extract01_v16f32_fadd_f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v16f32_fadd_f32_commute(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v8f64_fadd_f64(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v8f64_fadd_f64_commute(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract01_v16f32_fsub_f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v16f32_fsub_f32_commute(<16 x float> %x) {
; SSE3-LABEL: extract_extract01_v16f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v16f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v8f64_fsub_f64(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v8f64_fsub_f64_commute(<8 x double> %x) {
; SSE3-LABEL: extract_extract01_v8f64_fsub_f64_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v8f64_fsub_f64_commute:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; Check output when 1 or both extracts have extra uses.

define float @extract_extract01_v4f32_fadd_f32_uses1(<4 x float> %x, ptr %p) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movss %xmm0, (%rdi)
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movss %xmm0, (%rdi)
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovss %xmm0, (%rdi)
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vmovss %xmm0, (%rdi)
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  store float %x0, ptr %p
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fadd_f32_uses2(<4 x float> %x, ptr %p) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: movss %xmm1, (%rdi)
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: movss %xmm1, (%rdi)
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vmovss %xmm1, (%rdi)
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractps $1, %xmm0, (%rdi)
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  store float %x1, ptr %p
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fadd_f32_uses3(<4 x float> %x, ptr %p1, ptr %p2) {
; SSE3-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
; SSE3: # %bb.0:
; SSE3-NEXT: movss %xmm0, (%rdi)
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: movss %xmm1, (%rsi)
; SSE3-NEXT: addss %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
; AVX: # %bb.0:
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vmovss %xmm1, (%rsi)
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  store float %x0, ptr %p1
  %x1 = extractelement <4 x float> %x, i32 1
  store float %x1, ptr %p2
  %x01 = fadd float %x0, %x1
  ret float %x01
}

; Repeat tests from general reductions to verify output for hoppy targets:
; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971

declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
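
; With the 'fast' reduction flag, fast-hops targets lower the reduction as a
; chain of horizontal adds: hadd the two halves together, hadd again until a
; scalar remains, then add the start value, as checked below.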

define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v8f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: addps %xmm2, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm2
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm2, %xmm1
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: fadd_reduce_v8f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm2
; SSE3-FAST-NEXT: haddps %xmm2, %xmm2
; SSE3-FAST-NEXT: haddps %xmm2, %xmm2
; SSE3-FAST-NEXT: addss %xmm2, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: fadd_reduce_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: fadd_reduce_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %r
}

define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
; SSE3-SLOW-LABEL: fadd_reduce_v4f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: addpd %xmm2, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm2
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm2
; SSE3-SLOW-NEXT: addsd %xmm2, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: fadd_reduce_v4f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm2
; SSE3-FAST-NEXT: haddpd %xmm2, %xmm2
; SSE3-FAST-NEXT: addsd %xmm2, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: fadd_reduce_v4f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: fadd_reduce_v4f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %r
}

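; PR39936: https://bugs.llvm.org/show_bug.cgi?id=39936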
define float @PR39936_v8f32(<8 x float>) {
; SSE3-SLOW-LABEL: PR39936_v8f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: haddps %xmm1, %xmm0
; SSE3-SLOW-NEXT: haddps %xmm0, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: PR39936_v8f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: PR39936_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR39936_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %2 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = fadd <8 x float> %2, %3
  %5 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %7 = fadd <8 x float> %5, %6
  %8 = shufflevector <8 x float> %7, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %9 = fadd <8 x float> %7, %8
  %10 = extractelement <8 x float> %9, i32 0
  ret float %10
}

define float @hadd32_4(<4 x float> %x225) {
; SSE3-SLOW-LABEL: hadd32_4:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hadd32_4:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hadd32_4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %x227 = fadd <4 x float> %x225, %x226
  %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <4 x float> %x227, %x228
  %x230 = extractelement <4 x float> %x229, i32 0
  ret float %x230
}
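
; The wider hadd32_8 and hadd32_16 variants below only reduce the low
; four elements, so they compile to the same code as hadd32_4 plus a
; vzeroupper on AVX targets, where the source arrives in ymm/zmm
; registers.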

define float @hadd32_8(<8 x float> %x225) {
; SSE3-SLOW-LABEL: hadd32_8:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hadd32_8:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_8:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hadd32_8:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x227 = fadd <8 x float> %x225, %x226
  %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <8 x float> %x227, %x228
  %x230 = extractelement <8 x float> %x229, i32 0
  ret float %x230
}

define float @hadd32_16(<16 x float> %x225) {
; SSE3-SLOW-LABEL: hadd32_16:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hadd32_16:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hadd32_16:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hadd32_16:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x227 = fadd <16 x float> %x225, %x226
  %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <16 x float> %x227, %x228
  %x230 = extractelement <16 x float> %x229, i32 0
  ret float %x230
}

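; With optsize (and with profile-guided size optimization below), the
; final in-lane step is emitted as haddps even on targets without
; fast-hops, since one horizontal add is smaller than the equivalent
; shuffle+add pair.
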
define float @hadd32_4_optsize(<4 x float> %x225) optsize {
; SSE3-LABEL: hadd32_4_optsize:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm1, %xmm0
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_4_optsize:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %x227 = fadd <4 x float> %x225, %x226
  %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <4 x float> %x227, %x228
  %x230 = extractelement <4 x float> %x229, i32 0
  ret float %x230
}

define float @hadd32_8_optsize(<8 x float> %x225) optsize {
; SSE3-LABEL: hadd32_8_optsize:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm1, %xmm0
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_8_optsize:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x227 = fadd <8 x float> %x225, %x226
  %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <8 x float> %x227, %x228
  %x230 = extractelement <8 x float> %x229, i32 0
  ret float %x230
}

define float @hadd32_16_optsize(<16 x float> %x225) optsize {
; SSE3-LABEL: hadd32_16_optsize:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm1, %xmm0
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_16_optsize:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x227 = fadd <16 x float> %x225, %x226
  %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <16 x float> %x227, %x228
  %x230 = extractelement <16 x float> %x229, i32 0
  ret float %x230
}

define float @hadd32_4_pgso(<4 x float> %x225) !prof !14 {
; SSE3-LABEL: hadd32_4_pgso:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm1, %xmm0
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_4_pgso:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %x227 = fadd <4 x float> %x225, %x226
  %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <4 x float> %x227, %x228
  %x230 = extractelement <4 x float> %x229, i32 0
  ret float %x230
}

define float @hadd32_8_pgso(<8 x float> %x225) !prof !14 {
; SSE3-LABEL: hadd32_8_pgso:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm1, %xmm0
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_8_pgso:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x227 = fadd <8 x float> %x225, %x226
  %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <8 x float> %x227, %x228
  %x230 = extractelement <8 x float> %x229, i32 0
  ret float %x230
}

define float @hadd32_16_pgso(<16 x float> %x225) !prof !14 {
; SSE3-LABEL: hadd32_16_pgso:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: addps %xmm1, %xmm0
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hadd32_16_pgso:
; AVX: # %bb.0:
; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
  %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x227 = fadd <16 x float> %x225, %x226
  %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x229 = fadd <16 x float> %x227, %x228
  %x230 = extractelement <16 x float> %x229, i32 0
  ret float %x230
}

define float @partial_reduction_fadd_v8f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: partial_reduction_fadd_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x0213 = fadd <8 x float> %x, %x23
  %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x0123 = fadd nsz reassoc <8 x float> %x0213, %x13
  %r = extractelement <8 x float> %x0123, i32 0
  ret float %r
}
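
; The final fadd above carries 'reassoc nsz', which is what permits the
; two-step sequence to be matched to back-to-back horizontal adds on
; fast-hops targets.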

; Negative test - only the flags on the final math op in the
; sequence determine whether we can transform to horizontal ops.
; Here the final fadd has only 'ninf nnan' (no 'reassoc'/'nsz'), so the
; sequence is not recognized as a reduction even though the first fadd
; is 'fast'.

define float @partial_reduction_fadd_v8f32_wrong_flags(<8 x float> %x) {
; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x0213 = fadd fast <8 x float> %x, %x23
  %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x0123 = fadd ninf nnan <8 x float> %x0213, %x13
  %r = extractelement <8 x float> %x0123, i32 0
  ret float %r
}
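
; The v16f32 case below also reduces only the low four elements, so it
; lowers exactly like partial_reduction_fadd_v8f32.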

define float @partial_reduction_fadd_v16f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: partial_reduction_fadd_v16f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: partial_reduction_fadd_v16f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movaps %xmm0, %xmm1
; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: partial_reduction_fadd_v16f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: partial_reduction_fadd_v16f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x23 = shufflevector <16 x float> %x, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x0213 = fadd <16 x float> %x, %x23
  %x13 = shufflevector <16 x float> %x0213, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %x0123 = fadd reassoc nsz <16 x float> %x0213, %x13
  %r = extractelement <16 x float> %x0123, i32 0
  ret float %r
}

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0}