; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
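
; The functions below build each horizontal result one scalar at a time
; (extractelement, add/sub, insertelement) and check that the backend folds
; the whole sequence into a single horizontal instruction when the element
; order matches. For reference (SSE3 semantics, AT&T operand order):
;   haddps %xmm1, %xmm0
;   -> xmm0 = [ xmm0[0]+xmm0[1], xmm0[2]+xmm0[3], xmm1[0]+xmm1[1], xmm1[2]+xmm1[3] ]
; so with A = <1.0, 2.0, 3.0, 4.0> and B = <5.0, 6.0, 7.0, 8.0>,
; hadd_ps_test1 must return <3.0, 7.0, 11.0, 15.0>.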

define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_ps_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_ps_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_ps_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_ps_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}
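
; Integer horizontal adds require SSSE3 (phaddd); with only SSE3 available the
; same pattern is scalarized through GPRs, as the SSE3 check lines below show.
; For reference:
;   phaddd %xmm1, %xmm0
;   -> xmm0 = [ xmm0[0]+xmm0[1], xmm0[2]+xmm0[3], xmm1[0]+xmm1[1], xmm1[2]+xmm1[3] ]
; phadd_d_test2 also matches: integer add is commutative, so summing a pair in
; reverse order (e.g. B[3]+B[2]) selects the same instruction.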

define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %eax, %edi
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    movd %ecx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phadd_d_test1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phadd_d_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phadd_d_test2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phadd_d_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 3
  %vecext7 = extractelement <4 x i32> %B, i32 2
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 1
  %vecext11 = extractelement <4 x i32> %B, i32 0
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    subl %ecx, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %ecx
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    subl %edi, %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phsub_d_test1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsub_d_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    subl %ecx, %eax
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movd %ecx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phsub_d_test2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsub_d_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 2
  %vecext7 = extractelement <4 x i32> %B, i32 3
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 0
  %vecext11 = extractelement <4 x i32> %B, i32 1
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
  ret <4 x i32> %vecinit13
}
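
; The double-precision forms fold one pair per source register:
;   haddpd %xmm1, %xmm0  ->  xmm0 = [ xmm0[0]+xmm0[1], xmm1[0]+xmm1[1] ]
;   hsubpd %xmm1, %xmm0  ->  xmm0 = [ xmm0[0]-xmm0[1], xmm1[0]-xmm1[1] ]
; hadd_pd_test2 sums each pair in reverse order and still matches because
; fadd is commutative.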

define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_pd_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_pd_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 1
  %vecext1 = extractelement <2 x double> %A, i32 0
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 1
  %vecext3 = extractelement <2 x double> %B, i32 0
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_pd_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_pd_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 0
  %vecext3 = extractelement <2 x double> %A, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}
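
; On 256-bit vectors the AVX horizontal ops stay within each 128-bit lane:
;   vhaddpd %ymm1, %ymm0, %ymm0
;   -> ymm0 = [ ymm0[0]+ymm0[1], ymm1[0]+ymm1[1], ymm0[2]+ymm0[3], ymm1[2]+ymm1[3] ]
; The two tests below ask for [ a0+a1, a2+a3, b0+b1, b2+b3 ] instead, so a
; cross-lane shuffle is needed as well: vperm2f128 on AVX1, vpermpd on AVX2.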

define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhadd_pd_test:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: avx_vhadd_pd_test:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx_vhadd_pd_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhsub_pd_test:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: avx_vhsub_pd_test:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vhsubpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx_vhsub_pd_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}
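
; Same story for the wide integer tests: AVX2's vphaddd/vphaddw are lane-wise,
; but the IR below wants all of %A's sums before all of %B's, so AVX2 needs a
; vpermq fixup and AVX1 splits the work into two 128-bit vphaddd ops.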

define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-LABEL: avx2_vphadd_d_test:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm4, %eax
; SSE3-NEXT:    addl %ecx, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm4, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    addl %edx, %ecx
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %edx, %edi
; SSE3-NEXT:    movd %xmm2, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %r8d, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %r9d
; SSE3-NEXT:    addl %r8d, %r9d
; SSE3-NEXT:    movd %xmm3, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %r10d
; SSE3-NEXT:    addl %r8d, %r10d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %r11d
; SSE3-NEXT:    addl %r8d, %r11d
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    movd %r11d, %xmm1
; SSE3-NEXT:    movd %r10d, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd %r9d, %xmm3
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_vphadd_d_test:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_vphadd_d_test:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_vphadd_d_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %A, i32 0
  %vecext1 = extractelement <8 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %A, i32 2
  %vecext3 = extractelement <8 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %A, i32 4
  %vecext7 = extractelement <8 x i32> %A, i32 5
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %A, i32 6
  %vecext11 = extractelement <8 x i32> %A, i32 7
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %B, i32 0
  %vecext15 = extractelement <8 x i32> %B, i32 1
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %B, i32 2
  %vecext19 = extractelement <8 x i32> %B, i32 3
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %B, i32 4
  %vecext23 = extractelement <8 x i32> %B, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %B, i32 6
  %vecext27 = extractelement <8 x i32> %B, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}
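
; Without SSSE3 the i16 variant needs sixteen scalar sums at once, which is
; why the SSE3 code below uses every callee-saved GPR and spills two sums to
; the stack before reassembling the vector with punpck shuffles.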

define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) nounwind {
; SSE3-LABEL: avx2_vphadd_w_test:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pextrw $1, %xmm0, %eax
; SSE3-NEXT:    addl %ecx, %eax
; SSE3-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm0, %edx
; SSE3-NEXT:    pextrw $3, %xmm0, %eax
; SSE3-NEXT:    addl %edx, %eax
; SSE3-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $4, %xmm0, %edx
; SSE3-NEXT:    pextrw $5, %xmm0, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pextrw $6, %xmm0, %edx
; SSE3-NEXT:    pextrw $7, %xmm0, %r8d
; SSE3-NEXT:    addl %edx, %r8d
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pextrw $1, %xmm1, %r10d
; SSE3-NEXT:    addl %edx, %r10d
; SSE3-NEXT:    pextrw $2, %xmm1, %edx
; SSE3-NEXT:    pextrw $3, %xmm1, %ebx
; SSE3-NEXT:    addl %edx, %ebx
; SSE3-NEXT:    pextrw $4, %xmm1, %edx
; SSE3-NEXT:    pextrw $5, %xmm1, %r14d
; SSE3-NEXT:    addl %edx, %r14d
; SSE3-NEXT:    pextrw $6, %xmm1, %edx
; SSE3-NEXT:    pextrw $7, %xmm1, %r12d
; SSE3-NEXT:    addl %edx, %r12d
; SSE3-NEXT:    movd %xmm2, %edi
; SSE3-NEXT:    pextrw $1, %xmm2, %edx
; SSE3-NEXT:    addl %edi, %edx
; SSE3-NEXT:    pextrw $2, %xmm2, %r9d
; SSE3-NEXT:    pextrw $3, %xmm2, %edi
; SSE3-NEXT:    addl %r9d, %edi
; SSE3-NEXT:    pextrw $4, %xmm2, %r11d
; SSE3-NEXT:    pextrw $5, %xmm2, %r9d
; SSE3-NEXT:    addl %r11d, %r9d
; SSE3-NEXT:    pextrw $6, %xmm2, %ebp
; SSE3-NEXT:    pextrw $7, %xmm2, %r11d
; SSE3-NEXT:    addl %ebp, %r11d
; SSE3-NEXT:    movd %xmm3, %r15d
; SSE3-NEXT:    pextrw $1, %xmm3, %ebp
; SSE3-NEXT:    addl %r15d, %ebp
; SSE3-NEXT:    pextrw $2, %xmm3, %r13d
; SSE3-NEXT:    pextrw $3, %xmm3, %r15d
; SSE3-NEXT:    addl %r13d, %r15d
; SSE3-NEXT:    pextrw $4, %xmm3, %r13d
; SSE3-NEXT:    pextrw $5, %xmm3, %ecx
; SSE3-NEXT:    addl %r13d, %ecx
; SSE3-NEXT:    pextrw $6, %xmm3, %r13d
; SSE3-NEXT:    pextrw $7, %xmm3, %eax
; SSE3-NEXT:    addl %r13d, %eax
; SSE3-NEXT:    movd %r12d, %xmm4
; SSE3-NEXT:    movd %r14d, %xmm2
; SSE3-NEXT:    movd %ebx, %xmm5
; SSE3-NEXT:    movd %r10d, %xmm3
; SSE3-NEXT:    movd %r8d, %xmm6
; SSE3-NEXT:    movd %esi, %xmm7
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm8 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd %eax, %xmm9
; SSE3-NEXT:    movd %ecx, %xmm10
; SSE3-NEXT:    movd %r15d, %xmm11
; SSE3-NEXT:    movd %ebp, %xmm12
; SSE3-NEXT:    movd %r11d, %xmm13
; SSE3-NEXT:    movd %r9d, %xmm14
; SSE3-NEXT:    movd %edi, %xmm15
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm12[0]
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_vphadd_w_test:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_vphadd_w_test:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_vphadd_w_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 4
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 5
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 6
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 7
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 8
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 9
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 10
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 11
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}

; Verify that we don't select horizontal subs in the following functions.
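; Horizontal subtraction is not commutative, so the operand order within each
; pair matters. not_a_hsub_1 computes B[1]-B[0] and B[3]-B[2], the reverse of
; what phsubd produces; not_a_hsub_2 and not_a_hsub_3 reverse one pair in the
; same way for float and double.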

define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: not_a_hsub_1:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    subl %edx, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    movd %xmm1, %esi
; SSE-NEXT:    subl %esi, %edx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE-NEXT:    movd %xmm0, %esi
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT:    movd %xmm0, %edi
; SSE-NEXT:    subl %edi, %esi
; SSE-NEXT:    movd %esi, %xmm0
; SSE-NEXT:    movd %edx, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movd %ecx, %xmm2
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
; AVX-NEXT:    vpextrd $3, %xmm0, %edx
; AVX-NEXT:    subl %edx, %ecx
; AVX-NEXT:    vpextrd $1, %xmm1, %edx
; AVX-NEXT:    vmovd %xmm1, %esi
; AVX-NEXT:    subl %esi, %edx
; AVX-NEXT:    vpextrd $3, %xmm1, %esi
; AVX-NEXT:    vpextrd $2, %xmm1, %edi
; AVX-NEXT:    subl %edi, %esi
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $3, %esi, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 1
  %vecext7 = extractelement <4 x i32> %B, i32 0
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 3
  %vecext11 = extractelement <4 x i32> %B, i32 2
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: not_a_hsub_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3]
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    subss %xmm3, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3]
; SSE-NEXT:    movaps %xmm1, %xmm3
; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT:    subss %xmm3, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX-NEXT:    vshufpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 3
  %vecext7 = extractelement <4 x float> %B, i32 2
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: not_a_hsub_3:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT:    subsd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm0, %xmm2
; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE-NEXT:    subsd %xmm0, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vsubsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 1
  %vecext3 = extractelement <2 x double> %A, i32 0
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

; Test AVX horizontal add/sub of packed single/double precision
; floating point values from 256-bit vectors.
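; Unlike avx_vhadd_pd_test above, these patterns request exactly the lane-wise
; element order that vhaddps/vhsubps/vhaddpd/vhsubpd produce, so a single
; 256-bit instruction suffices and no extra shuffle is emitted.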

define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhadd_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhadd_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %add12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %add16 = fadd float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %add16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %add20 = fadd float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %add20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %add24 = fadd float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %add24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %add28 = fadd float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
  ret <8 x float> %vecinit29
}

define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhsub_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm2, %xmm0
; SSE-NEXT:    hsubps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhsub_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %sub12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %sub16 = fsub float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %sub16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %sub20 = fsub float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %sub20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %sub24 = fsub float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %sub24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %sub28 = fsub float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
  ret <8 x float> %vecinit29
}

define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hadd_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm2, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_hadd_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hsub_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm2, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_hsub_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

; Test AVX2 horizontal add of packed integer values from 256-bit vectors.
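; As in the floating-point tests directly above, avx2_hadd_d and avx2_hadd_w
; interleave %a and %b within each 128-bit lane, which is exactly what the
; lane-wise vphaddd/vphaddw produce, so AVX2 needs no permute here (compare
; avx2_vphadd_d_test).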

define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-LABEL: avx2_hadd_d:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; SSE3-NEXT:    movd %xmm4, %eax
; SSE3-NEXT:    addl %ecx, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE3-NEXT:    movd %xmm4, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    addl %edx, %ecx
; SSE3-NEXT:    movd %xmm2, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %edx, %edi
; SSE3-NEXT:    movd %xmm1, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %r8d, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %r9d
; SSE3-NEXT:    addl %r8d, %r9d
; SSE3-NEXT:    movd %xmm3, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
; SSE3-NEXT:    movd %xmm0, %r10d
; SSE3-NEXT:    addl %r8d, %r10d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE3-NEXT:    movd %xmm0, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
; SSE3-NEXT:    movd %xmm0, %r11d
; SSE3-NEXT:    addl %r8d, %r11d
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    movd %r11d, %xmm1
; SSE3-NEXT:    movd %r10d, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd %r9d, %xmm3
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_hadd_d:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm2, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_hadd_d:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_hadd_d:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %b, i32 0
  %vecext7 = extractelement <8 x i32> %b, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %b, i32 2
  %vecext11 = extractelement <8 x i32> %b, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %a, i32 4
  %vecext15 = extractelement <8 x i32> %a, i32 5
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %a, i32 6
  %vecext19 = extractelement <8 x i32> %a, i32 7
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %b, i32 4
  %vecext23 = extractelement <8 x i32> %b, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %b, i32 6
  %vecext27 = extractelement <8 x i32> %b, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) nounwind {
; SSE3-LABEL: avx2_hadd_w:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pextrw $1, %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pextrw $2, %xmm0, %eax
; SSE3-NEXT:    pextrw $3, %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    pextrw $4, %xmm0, %eax
; SSE3-NEXT:    pextrw $5, %xmm0, %r9d
; SSE3-NEXT:    addl %eax, %r9d
; SSE3-NEXT:    pextrw $6, %xmm0, %eax
; SSE3-NEXT:    pextrw $7, %xmm0, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    movd %xmm1, %ecx
; SSE3-NEXT:    pextrw $1, %xmm1, %eax
; SSE3-NEXT:    addl %ecx, %eax
; SSE3-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm1, %edi
; SSE3-NEXT:    pextrw $3, %xmm1, %eax
; SSE3-NEXT:    addl %edi, %eax
; SSE3-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $4, %xmm1, %r8d
; SSE3-NEXT:    pextrw $5, %xmm1, %edi
; SSE3-NEXT:    addl %r8d, %edi
; SSE3-NEXT:    pextrw $6, %xmm1, %r11d
; SSE3-NEXT:    pextrw $7, %xmm1, %r8d
; SSE3-NEXT:    addl %r11d, %r8d
; SSE3-NEXT:    movd %xmm2, %r11d
; SSE3-NEXT:    pextrw $1, %xmm2, %ebp
; SSE3-NEXT:    addl %r11d, %ebp
; SSE3-NEXT:    pextrw $2, %xmm2, %r11d
; SSE3-NEXT:    pextrw $3, %xmm2, %r14d
; SSE3-NEXT:    addl %r11d, %r14d
; SSE3-NEXT:    pextrw $4, %xmm2, %r11d
; SSE3-NEXT:    pextrw $5, %xmm2, %r15d
; SSE3-NEXT:    addl %r11d, %r15d
; SSE3-NEXT:    pextrw $6, %xmm2, %r11d
; SSE3-NEXT:    pextrw $7, %xmm2, %r12d
; SSE3-NEXT:    addl %r11d, %r12d
; SSE3-NEXT:    movd %xmm3, %ebx
; SSE3-NEXT:    pextrw $1, %xmm3, %r11d
; SSE3-NEXT:    addl %ebx, %r11d
; SSE3-NEXT:    pextrw $2, %xmm3, %r13d
; SSE3-NEXT:    pextrw $3, %xmm3, %ebx
; SSE3-NEXT:    addl %r13d, %ebx
; SSE3-NEXT:    pextrw $4, %xmm3, %r13d
; SSE3-NEXT:    pextrw $5, %xmm3, %ecx
; SSE3-NEXT:    addl %r13d, %ecx
; SSE3-NEXT:    pextrw $6, %xmm3, %r13d
; SSE3-NEXT:    pextrw $7, %xmm3, %eax
; SSE3-NEXT:    addl %r13d, %eax
; SSE3-NEXT:    movd %r12d, %xmm4
; SSE3-NEXT:    movd %r15d, %xmm2
; SSE3-NEXT:    movd %r14d, %xmm5
; SSE3-NEXT:    movd %ebp, %xmm3
; SSE3-NEXT:    movd %r10d, %xmm6
; SSE3-NEXT:    movd %r9d, %xmm7
; SSE3-NEXT:    movd %esi, %xmm8
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    movd %eax, %xmm9
; SSE3-NEXT:    movd %ecx, %xmm10
; SSE3-NEXT:    movd %ebx, %xmm11
; SSE3-NEXT:    movd %r11d, %xmm12
; SSE3-NEXT:    movd %r8d, %xmm13
; SSE3-NEXT:    movd %edi, %xmm14
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm15 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm12[0]
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_hadd_w:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm2, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_hadd_w:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_hadd_w:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 8
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 9
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 10
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 11
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 4
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 5
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 6
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 7
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}