; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE_SLOW,SSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE_FAST,SSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE_SLOW,SSSE3,SSSE3_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE_FAST,SSSE3,SSSE3_FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_FAST

; The next 8 tests check for matching the horizontal op and eliminating the shuffle.
; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111

define <4 x float> @hadd_v4f32(<4 x float> %a) {
; SSE-LABEL: hadd_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %hop = fadd <2 x float> %a02, %a13
  %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  ret <4 x float> %shuf
}

define <8 x float> @hadd_v8f32a(<8 x float> %a) {
; SSE_SLOW-LABEL: hadd_v8f32a:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movaps %xmm0, %xmm2
; SSE_SLOW-NEXT:    haddps %xmm1, %xmm2
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSE_SLOW-NEXT:    movaps %xmm2, %xmm1
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hadd_v8f32a:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    movaps %xmm0, %xmm2
; SSE_FAST-NEXT:    haddps %xmm1, %xmm2
; SSE_FAST-NEXT:    haddps %xmm0, %xmm0
; SSE_FAST-NEXT:    movaps %xmm2, %xmm1
; SSE_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v8f32a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v8f32a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1_FAST-NEXT:    vhaddps %ymm0, %ymm1, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hadd_v8f32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = fadd <4 x float> %a0, %a1
  %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x float> %shuf
}

define <8 x float> @hadd_v8f32b(<8 x float> %a) {
; SSE-LABEL: hadd_v8f32b:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    haddps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_v8f32b:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = fadd <8 x float> %a0, %a1
  %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x float> %shuf
}

define <4 x float> @hsub_v4f32(<4 x float> %a) {
; SSE-LABEL: hsub_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %hop = fsub <2 x float> %a02, %a13
  %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x float> %shuf
}

define <8 x float> @hsub_v8f32a(<8 x float> %a) {
; SSE_SLOW-LABEL: hsub_v8f32a:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movaps %xmm0, %xmm2
; SSE_SLOW-NEXT:    hsubps %xmm1, %xmm2
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSE_SLOW-NEXT:    movaps %xmm2, %xmm1
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hsub_v8f32a:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    movaps %xmm0, %xmm2
; SSE_FAST-NEXT:    hsubps %xmm1, %xmm2
; SSE_FAST-NEXT:    hsubps %xmm0, %xmm0
; SSE_FAST-NEXT:    movaps %xmm2, %xmm1
; SSE_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hsub_v8f32a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hsub_v8f32a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1_FAST-NEXT:    vhsubps %ymm0, %ymm1, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hsub_v8f32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = fsub <4 x float> %a0, %a1
  %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x float> %shuf
}

define <8 x float> @hsub_v8f32b(<8 x float> %a) {
; SSE-LABEL: hsub_v8f32b:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm0, %xmm0
; SSE-NEXT:    hsubps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_v8f32b:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = fsub <8 x float> %a0, %a1
  %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x float> %shuf
}

define <2 x double> @hadd_v2f64(<2 x double> %a) {
; SSE_SLOW-LABEL: hadd_v2f64:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE_SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hadd_v2f64:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v2f64:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v2f64:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hadd_v2f64:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hadd_v2f64:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT:    retq
  %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %hop = fadd <2 x double> %a0, %a1
  %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %shuf
}

define <2 x double> @hadd_v2f64_scalar_splat(<2 x double> %a) {
; SSE_SLOW-LABEL: hadd_v2f64_scalar_splat:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE_SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hadd_v2f64_scalar_splat:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v2f64_scalar_splat:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v2f64_scalar_splat:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hadd_v2f64_scalar_splat:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hadd_v2f64_scalar_splat:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT:    retq
  %a0 = extractelement <2 x double> %a, i32 0
  %a1 = extractelement <2 x double> %a, i32 1
  %hop = fadd double %a0, %a1
  %ins = insertelement <2 x double> undef, double %hop, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %shuf
}

define <4 x double> @hadd_v4f64_scalar_splat(<4 x double> %a) {
; SSE_SLOW-LABEL: hadd_v4f64_scalar_splat:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movapd %xmm0, %xmm2
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE_SLOW-NEXT:    addsd %xmm0, %xmm2
; SSE_SLOW-NEXT:    movapd %xmm1, %xmm3
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE_SLOW-NEXT:    addsd %xmm1, %xmm3
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm3[0,0]
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hadd_v4f64_scalar_splat:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE_FAST-NEXT:    haddpd %xmm1, %xmm1
; SSE_FAST-NEXT:    retq
;
; AVX-LABEL: hadd_v4f64_scalar_splat:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = extractelement <4 x double> %a, i32 0
  %a1 = extractelement <4 x double> %a, i32 1
  %hop0 = fadd double %a0, %a1
  %a2 = extractelement <4 x double> %a, i32 2
  %a3 = extractelement <4 x double> %a, i32 3
  %hop1 = fadd double %a2, %a3
  %ins = insertelement <4 x double> undef, double %hop0, i32 0
  %ins2 = insertelement <4 x double> %ins, double %hop1, i32 2
  %shuf = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuf
}

define <4 x double> @hadd_v4f64_scalar_broadcast(<4 x double> %a) {
; SSE_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE_SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hadd_v4f64_scalar_broadcast:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE_FAST-NEXT:    movapd %xmm0, %xmm1
; SSE_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v4f64_scalar_broadcast:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hadd_v4f64_scalar_broadcast:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2_FAST-NEXT:    retq
  %a0 = extractelement <4 x double> %a, i32 0
  %a1 = extractelement <4 x double> %a, i32 1
  %hop0 = fadd double %a0, %a1
  %a2 = extractelement <4 x double> %a, i32 2
  %a3 = extractelement <4 x double> %a, i32 3
  %hop1 = fadd double %a2, %a3
  %ins = insertelement <4 x double> undef, double %hop0, i32 0
  %ins2 = insertelement <4 x double> %ins, double %hop1, i32 2
  %shuf = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x double> %shuf
}

define <4 x double> @hadd_v4f64(<4 x double> %a) {
; SSE_SLOW-LABEL: hadd_v4f64:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movapd %xmm0, %xmm2
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE_SLOW-NEXT:    addsd %xmm0, %xmm2
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSE_SLOW-NEXT:    movapd %xmm1, %xmm2
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE_SLOW-NEXT:    addsd %xmm1, %xmm2
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm2[0,0]
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hadd_v4f64:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE_FAST-NEXT:    haddpd %xmm1, %xmm1
; SSE_FAST-NEXT:    retq
;
; AVX-LABEL: hadd_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
  %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
  %hop = fadd <4 x double> %a0, %a1
  %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuf
}

define <2 x double> @hsub_v2f64(<2 x double> %a) {
; SSE_SLOW-LABEL: hsub_v2f64:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE_SLOW-NEXT:    subsd %xmm1, %xmm0
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hsub_v2f64:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    hsubpd %xmm0, %xmm0
; SSE_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hsub_v2f64:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hsub_v2f64:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hsub_v2f64:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hsub_v2f64:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT:    retq
  %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %hop = fsub <2 x double> %a0, %a1
  %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
  ret <2 x double> %shuf
}

define <4 x double> @hsub_v4f64(<4 x double> %a) {
; SSE_SLOW-LABEL: hsub_v4f64:
; SSE_SLOW:       # %bb.0:
; SSE_SLOW-NEXT:    movapd %xmm0, %xmm2
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE_SLOW-NEXT:    subsd %xmm2, %xmm0
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE_SLOW-NEXT:    movapd %xmm1, %xmm2
; SSE_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE_SLOW-NEXT:    subsd %xmm2, %xmm1
; SSE_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSE_SLOW-NEXT:    retq
;
; SSE_FAST-LABEL: hsub_v4f64:
; SSE_FAST:       # %bb.0:
; SSE_FAST-NEXT:    hsubpd %xmm0, %xmm0
; SSE_FAST-NEXT:    hsubpd %xmm1, %xmm1
; SSE_FAST-NEXT:    retq
;
; AVX-LABEL: hsub_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
  %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
  %hop = fsub <4 x double> %a0, %a1
  %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuf
}

define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
; SSE3-LABEL: hadd_v4i32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
; SSE3-NEXT:    paddd %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hadd_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %hop = add <4 x i32> %a02, %a13
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 1>
  ret <4 x i32> %shuf
}

define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
; SSE3-LABEL: hadd_v8i32a:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm0, %xmm2
; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE3-NEXT:    paddd %xmm0, %xmm2
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSE3-NEXT:    movdqa %xmm2, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3_SLOW-LABEL: hadd_v8i32a:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    phaddd %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3_SLOW-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v8i32a:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_FAST-NEXT:    phaddd %xmm1, %xmm2
; SSSE3_FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v8i32a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v8i32a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm1
; AVX1_FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hadd_v8i32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = add <4 x i32> %a0, %a1
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x i32> %shuf
}

define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
; SSE3-LABEL: hadd_v8i32b:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,1,3]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE3-NEXT:    paddd %xmm2, %xmm0
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,1,3]
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,0,2]
; SSE3-NEXT:    paddd %xmm2, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hadd_v8i32b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    phaddd %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_v8i32b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_v8i32b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = add <8 x i32> %a0, %a1
  %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x i32> %shuf
}

define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
; SSE3-LABEL: hsub_v4i32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,3,1,3]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
; SSE3-NEXT:    psubd %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hsub_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %hop = sub <4 x i32> %a02, %a13
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 0, i32 undef>
  ret <4 x i32> %shuf
}

define <8 x i32> @hsub_v8i32a(<8 x i32> %a) {
; SSE3-LABEL: hsub_v8i32a:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm0, %xmm2
; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE3-NEXT:    psubd %xmm0, %xmm2
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSE3-NEXT:    movdqa %xmm2, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3_SLOW-LABEL: hsub_v8i32a:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    phsubd %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3_SLOW-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hsub_v8i32a:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_FAST-NEXT:    phsubd %xmm1, %xmm2
; SSSE3_FAST-NEXT:    phsubd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hsub_v8i32a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hsub_v8i32a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_FAST-NEXT:    vphsubd %xmm1, %xmm0, %xmm1
; AVX1_FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hsub_v8i32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = sub <4 x i32> %a0, %a1
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x i32> %shuf
}

define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
; SSE3-LABEL: hsub_v8i32b:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,1,3]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE3-NEXT:    psubd %xmm2, %xmm0
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,1,3]
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,0,2]
; SSE3-NEXT:    psubd %xmm2, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hsub_v8i32b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    phsubd %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_v8i32b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_v8i32b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = sub <8 x i32> %a0, %a1
  %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x i32> %shuf
}

define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
; SSE3-LABEL: hadd_v8i16:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
; SSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE3-NEXT:    paddw %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hadd_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = add <8 x i16> %a0246, %a1357
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i16> %shuf
}

define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
; SSE3-LABEL: hadd_v16i16a:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa %xmm1, %xmm3
; SSE3-NEXT:    pslld $16, %xmm3
; SSE3-NEXT:    psrad $16, %xmm3
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pslld $16, %xmm2
; SSE3-NEXT:    psrad $16, %xmm2
; SSE3-NEXT:    packssdw %xmm3, %xmm2
; SSE3-NEXT:    psrad $16, %xmm1
; SSE3-NEXT:    psrad $16, %xmm0
; SSE3-NEXT:    packssdw %xmm1, %xmm0
; SSE3-NEXT:    paddw %xmm0, %xmm2
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSE3-NEXT:    movdqa %xmm2, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3_SLOW-LABEL: hadd_v16i16a:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    phaddw %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3_SLOW-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v16i16a:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_FAST-NEXT:    phaddw %xmm1, %xmm2
; SSSE3_FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3_FAST-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v16i16a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v16i16a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_FAST-NEXT:    vphaddw %xmm1, %xmm0, %xmm1
; AVX1_FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hadd_v16i16a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %hop = add <8 x i16> %a0, %a1
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuf
}

define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
; SSE3-LABEL: hadd_v16i16b:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[3,1,1,3,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7]
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
; SSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4]
; SSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; SSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5]
; SSE3-NEXT:    paddw %xmm2, %xmm0
; SSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[3,1,1,3,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7]
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
; SSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4]
; SSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,4]
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
; SSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,4,5]
; SSE3-NEXT:    paddw %xmm2, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hadd_v16i16b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    phaddw %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_v16i16b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_v16i16b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = add <16 x i16> %a0, %a1
  %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
  ret <16 x i16> %shuf
}

define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
; SSE3-LABEL: hsub_v8i16:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[1,1,3,3,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
; SSE3-NEXT:    psubw %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hsub_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = sub <8 x i16> %a0246, %a1357
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 1, i32 undef, i32 3>
  ret <8 x i16> %shuf
}

define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
; SSE3-LABEL: hsub_v16i16a:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa %xmm1, %xmm3
; SSE3-NEXT:    pslld $16, %xmm3
; SSE3-NEXT:    psrad $16, %xmm3
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    pslld $16, %xmm2
; SSE3-NEXT:    psrad $16, %xmm2
; SSE3-NEXT:    packssdw %xmm3, %xmm2
; SSE3-NEXT:    psrad $16, %xmm1
; SSE3-NEXT:    psrad $16, %xmm0
; SSE3-NEXT:    packssdw %xmm1, %xmm0
; SSE3-NEXT:    psubw %xmm0, %xmm2
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSE3-NEXT:    movdqa %xmm2, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3_SLOW-LABEL: hsub_v16i16a:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    phsubw %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3_SLOW-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hsub_v16i16a:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_FAST-NEXT:    phsubw %xmm1, %xmm2
; SSSE3_FAST-NEXT:    phsubw %xmm0, %xmm0
; SSSE3_FAST-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hsub_v16i16a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hsub_v16i16a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_FAST-NEXT:    vphsubw %xmm1, %xmm0, %xmm1
; AVX1_FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hsub_v16i16a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %hop = sub <8 x i16> %a0, %a1
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuf
}

define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
; SSE3-LABEL: hsub_v16i16b:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[3,1,1,3,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7]
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
; SSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4]
; SSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4]
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; SSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5]
; SSE3-NEXT:    psubw %xmm2, %xmm0
; SSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[3,1,1,3,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7]
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
; SSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4]
; SSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,4]
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
; SSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
; SSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,4,5]
; SSE3-NEXT:    psubw %xmm2, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hsub_v16i16b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubw %xmm0, %xmm0
; SSSE3-NEXT:    phsubw %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_v16i16b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_v16i16b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphsubw %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = sub <16 x i16> %a0, %a1
  %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
  ret <16 x i16> %shuf
}

define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) {
; SSE-LABEL: broadcast_haddps_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: broadcast_haddps_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: broadcast_haddps_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a0)
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %2
}

declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)

define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: PR34724_1:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR34724_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 2, i32 4>
  %t1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 3, i32 5>
  %t2 = fadd <2 x float> %t0, %t1
  %vecinit9 = shufflevector <2 x float> %t2, <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
  %t3 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %t4 = fadd <4 x float> %t3, %b
  %vecinit13 = shufflevector <4 x float> %vecinit9, <4 x float> %t4, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
  ret <4 x float> %vecinit13
}

define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: PR34724_2:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR34724_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 4, i32 undef, i32 undef>
  %t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 5, i32 undef, i32 undef>
  %t2 = fadd <4 x float> %t0, %t1
  %vecinit9 = shufflevector <4 x float> %t2, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
  %t3 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %t4 = fadd <4 x float> %t3, %b
  %vecinit13 = shufflevector <4 x float> %vecinit9, <4 x float> %t4, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
  ret <4 x float> %vecinit13
}

;
; fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
; --> SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))).
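;
; e.g. in hadd_4f32_v8f32_shuffle below, the pre-shuffled halves are
; <a2,a3,a2,a3> and <a6,a7,a6,a7>, so the result <a2+a3,a2+a3,a6+a7,a6+a7>
; can be formed by one haddps of the original (unshuffled) halves followed
; by a movshdup lane duplicate, which is what the SSE checks show.
;
; Note: the 'hsub'-named shuffle tests in this section and the next use
; (f)add rather than (f)sub, so their autogenerated checks match hadd
; instructions.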
;

define <4 x float> @hadd_4f32_v8f32_shuffle(<8 x float> %a0) {
; SSE-LABEL: hadd_4f32_v8f32_shuffle:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_4f32_v8f32_shuffle:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %shuf256 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
  %lo = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %hadd0 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %hadd1 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hadd = fadd <4 x float> %hadd0, %hadd1
  ret <4 x float> %hadd
}

define <4 x float> @hsub_4f32_v8f32_shuffle(<8 x float> %a0) {
; SSE-LABEL: hsub_4f32_v8f32_shuffle:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_4f32_v8f32_shuffle:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %shuf256 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
  %lo = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %hsub0 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %hsub1 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hsub = fadd <4 x float> %hsub0, %hsub1
  ret <4 x float> %hsub
}

define <4 x i32> @hadd_4i32_v8i32_shuffle(<8 x i32> %a0) {
; SSE3-LABEL: hadd_4i32_v8i32_shuffle:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm0, %xmm2
; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[2,2]
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
; SSE3-NEXT:    paddd %xmm2, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hadd_4i32_v8i32_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_4i32_v8i32_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_4i32_v8i32_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %shuf256 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
  %lo = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %hadd0 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %hadd1 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hadd = add <4 x i32> %hadd0, %hadd1
  ret <4 x i32> %hadd
}

define <4 x i32> @hsub_4i32_v8i32_shuffle(<8 x i32> %a0) {
; SSE3-LABEL: hsub_4i32_v8i32_shuffle:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm0, %xmm2
; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[2,2]
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
; SSE3-NEXT:    paddd %xmm2, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hsub_4i32_v8i32_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_4i32_v8i32_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_4i32_v8i32_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %shuf256 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
  %lo = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %hsub0 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %hsub1 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hsub = add <4 x i32> %hsub0, %hsub1
  ret <4 x i32> %hsub
}

;
; fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) --> SHUFFLE(HOP(X,Y)).
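;
; e.g. in hadd_4f64_v4f64_shuffle below, the IR computes
; <x0+x1, x2+x3, y0+y1, y2+y3> from interleaved shuffles of %a0 and %a1;
; vhaddpd %ymm1, %ymm0 already produces <x0+x1, y0+y1, x2+x3, y2+y3>, so a
; single lane permute (vpermpd ymm0[0,2,1,3] on AVX2) replaces both input
; shuffles.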
;

define <4 x double> @hadd_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) {
; SSE-LABEL: hadd_4f64_v4f64_shuffle:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_4f64_v4f64_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vhaddpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_4f64_v4f64_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %shuf1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %hadd0 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %hadd1 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %hadd = fadd <4 x double> %hadd0, %hadd1
  ret <4 x double> %hadd
}

define <4 x double> @hsub_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) {
; SSE-LABEL: hsub_4f64_v4f64_shuffle:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hsub_4f64_v4f64_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vhsubpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_4f64_v4f64_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %shuf1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %hadd0 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %hadd1 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %hadd = fsub <4 x double> %hadd0, %hadd1
  ret <4 x double> %hadd
}

define <8 x float> @hadd_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_8f32_v8f32_shuffle:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    haddps %xmm3, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_8f32_v8f32_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vhaddps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_8f32_v8f32_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %hadd0 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %hadd1 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %hadd = fadd <8 x float> %hadd0, %hadd1
  ret <8 x float> %hadd
}

define <8 x float> @hsub_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hsub_8f32_v8f32_shuffle:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    haddps %xmm3, %xmm2
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hsub_8f32_v8f32_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vhaddps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_8f32_v8f32_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %hsub0 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %hsub1 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %hsub = fadd <8 x float> %hsub0, %hsub1
  ret <8 x float> %hsub
}

define <8 x i32> @hadd_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
; SSE3-LABEL: hadd_8i32_v8i32_shuffle:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm2, %xmm4
; SSE3-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
; SSE3-NEXT:    movaps %xmm0, %xmm5
; SSE3-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2]
; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE3-NEXT:    paddd %xmm2, %xmm4
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE3-NEXT:    paddd %xmm5, %xmm0
; SSE3-NEXT:    movdqa %xmm4, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hadd_8i32_v8i32_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_8i32_v8i32_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_8i32_v8i32_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %shuf1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %hadd0 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %hadd1 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %hadd = add <8 x i32> %hadd0, %hadd1
  ret <8 x i32> %hadd
}

define <8 x i32> @hsub_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
; SSE3-LABEL: hsub_8i32_v8i32_shuffle:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm2, %xmm4
; SSE3-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
; SSE3-NEXT:    movaps %xmm0, %xmm5
; SSE3-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2]
; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE3-NEXT:    psubd %xmm2, %xmm4
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE3-NEXT:    psubd %xmm0, %xmm5
; SSE3-NEXT:    movdqa %xmm5, %xmm0
; SSE3-NEXT:    movdqa %xmm4, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hsub_8i32_v8i32_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    phsubd %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_8i32_v8i32_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphsubd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_8i32_v8i32_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %shuf1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %hadd0 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %hadd1 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %hadd = sub <8 x i32> %hadd0, %hadd1
  ret <8 x i32> %hadd
}

define <16 x i16> @hadd_16i16_16i16_shuffle(<16 x i16> %a0, <16 x i16> %a1) {
; SSE3-LABEL: hadd_16i16_16i16_shuffle:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa %xmm3, %xmm5
; SSE3-NEXT:    pslld $16, %xmm5
; SSE3-NEXT:    psrad $16, %xmm5
; SSE3-NEXT:    movdqa %xmm2, %xmm4
; SSE3-NEXT:    pslld $16, %xmm4
; SSE3-NEXT:    psrad $16, %xmm4
; SSE3-NEXT:    packssdw %xmm5, %xmm4
; SSE3-NEXT:    movdqa %xmm1, %xmm5
; SSE3-NEXT:    pslld $16, %xmm5
; SSE3-NEXT:    psrad $16, %xmm5
; SSE3-NEXT:    movdqa %xmm0, %xmm6
; SSE3-NEXT:    pslld $16, %xmm6
; SSE3-NEXT:    psrad $16, %xmm6
; SSE3-NEXT:    packssdw %xmm5, %xmm6
; SSE3-NEXT:    psrad $16, %xmm3
; SSE3-NEXT:    psrad $16, %xmm2
; SSE3-NEXT:    packssdw %xmm3, %xmm2
; SSE3-NEXT:    paddw %xmm2, %xmm4
; SSE3-NEXT:    psrad $16, %xmm1
; SSE3-NEXT:    psrad $16, %xmm0
; SSE3-NEXT:    packssdw %xmm1, %xmm0
; SSE3-NEXT:    paddw %xmm6, %xmm0
; SSE3-NEXT:    movdqa %xmm4, %xmm1
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: hadd_16i16_16i16_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_16i16_16i16_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_16i16_16i16_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %shuf1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %hadd0 = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
  %hadd1 = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
  %hadd = add <16 x i16> %hadd0, %hadd1
  ret <16 x i16> %hadd
}