; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-SHUF
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-SHUF

; Each function below builds an integer add or sub out of two shufflevectors
; and records the x86-64 code llc emits for it with SSSE3, AVX and AVX2,
; both with and without the fast-hops attribute, plus two AVX2 runs that
; enable the fast-variable-*-shuffle attributes.

; Even-indexed i16 elements of x:y plus the odd-indexed ones; expected to be
; a single horizontal add of the two inputs.
define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %rhs = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %sum = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %sum
}

; Same adjacent pairs as phaddw1, but the two addends are swapped in
; alternating lanes and the second shuffle takes its sources as (y, x).
define <8 x i16> @phaddw2(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14>
  %rhs = shufflevector <8 x i16> %y, <8 x i16> %x, <8 x i32> <i32 8, i32 11, i32 12, i32 15, i32 0, i32 3, i32 4, i32 7>
  %sum = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %sum
}

; i32 counterpart of phaddw1: even elements of x:y plus odd elements.
define <4 x i32> @phaddd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %rhs = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %sum = add <4 x i32> %lhs, %rhs
  ret <4 x i32> %sum
}

; i32 counterpart of phaddw2: addends swapped in alternating lanes, second
; shuffle sourced from (y, x).
define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %rhs = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
  %sum = add <4 x i32> %lhs, %rhs
  ret <4 x i32> %sum
}

; Single input with an undef second shuffle operand: lane 0 has an undef
; mask element and lanes 2-3 index into the undef operand, so only lane 1
; (x[2] + x[3]) is defined.
define <4 x i32> @phaddd3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %rhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %sum = add <4 x i32> %lhs, %rhs
  ret <4 x i32> %sum
}

; Single input, only the two low result lanes are defined
; (x[0] + x[1], x[2] + x[3]).
define <4 x i32> @phaddd4(<4 x i32> %x) {
; SSSE3-LABEL: phaddd4:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd4:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %sum = add <4 x i32> %lhs, %rhs
  ret <4 x i32> %sum
}

; As phaddd4, but lane 1 has its addends swapped (x[3] + x[2]).
define <4 x i32> @phaddd5(<4 x i32> %x) {
; SSSE3-LABEL: phaddd5:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd5:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
  %rhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
  %sum = add <4 x i32> %lhs, %rhs
  ret <4 x i32> %sum
}

; Only lane 0 is defined (x[0] + x[1]). The recorded output is a horizontal
; add only when fast-hops is enabled; the other runs use a shuffle followed
; by a plain vector add.
define <4 x i32> @phaddd6(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd6:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd6:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd6:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd6:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd6:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %sum = add <4 x i32> %lhs, %rhs
  ret <4 x i32> %sum
}

; Only lane 1 is defined (x[3] + x[2]); every run records a horizontal add.
define <4 x i32> @phaddd7(<4 x i32> %x) {
; SSSE3-LABEL: phaddd7:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd7:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  %rhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  %sum = add <4 x i32> %lhs, %rhs
  ret <4 x i32> %sum
}

; Even-indexed i16 elements of x:y minus the odd-indexed ones; expected to
; be a single horizontal subtract.
define <8 x i16> @phsubw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubw1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %rhs = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %diff = sub <8 x i16> %lhs, %rhs
  ret <8 x i16> %diff
}

; i32 counterpart of phsubw1: even elements minus odd elements.
define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %rhs = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %diff = sub <4 x i32> %lhs, %rhs
  ret <4 x i32> %diff
}

; Subtract version of phaddd3: single input, only lane 1 (x[2] - x[3]) is
; defined.
define <4 x i32> @phsubd2(<4 x i32> %x) {
; SSSE3-LABEL: phsubd2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %rhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %diff = sub <4 x i32> %lhs, %rhs
  ret <4 x i32> %diff
}

; Subtract version of phaddd4: only the two low result lanes are defined.
define <4 x i32> @phsubd3(<4 x i32> %x) {
; SSSE3-LABEL: phsubd3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %rhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %diff = sub <4 x i32> %lhs, %rhs
  ret <4 x i32> %diff
}

; Subtract version of phaddd6: only lane 0 (x[0] - x[1]) is defined. A
; horizontal subtract is recorded only for the fast-hops runs.
define <4 x i32> @phsubd4(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phsubd4:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT:    psubd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phsubd4:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phsubd4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phsubd4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phsubd4:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %rhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %diff = sub <4 x i32> %lhs, %rhs
  ret <4 x i32> %diff
}

; Odd-indexed i16 elements minus even-indexed ones, i.e. the operands of
; phsubw1 reversed. No horizontal subtract is recorded here; the output is
; explicit shuffles/packs followed by a plain vector subtract.
define <8 x i16> @phsubw1_reverse(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1_reverse:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm1, %xmm3
; SSSE3-NEXT:    psrad $16, %xmm3
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    psrad $16, %xmm2
; SSSE3-NEXT:    packssdw %xmm3, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    pshufb %xmm3, %xmm1
; SSSE3-NEXT:    pshufb %xmm3, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    psubw %xmm0, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubw1_reverse:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $16, %xmm1, %xmm2
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm3
; AVX-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubw %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %rhs = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %diff = sub <8 x i16> %lhs, %rhs
  ret <8 x i16> %diff
}

; i32 counterpart of phsubw1_reverse: odd elements minus even elements,
; recorded as two shuffles and a plain vector subtract.
define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1_reverse:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-NEXT:    psubd %xmm0, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd1_reverse:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %rhs = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %diff = sub <4 x i32> %lhs, %rhs
  ret <4 x i32> %diff
}

; Single input; the pairwise sums land in the two high lanes and the two low
; lanes are undef.
define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd_single_source1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %rhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %sum = add <4 x i32> %lhs, %rhs
  ret <4 x i32> %sum
}

; phaddd_single_source1 followed by a shuffle that moves sum lanes 3 and 2
; into result lanes 0 and 1. With fast-hops the recorded output is a
; horizontal add plus one shuffle; otherwise two shuffles plus a plain add.
define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source2:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd_single_source2:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd_single_source2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd_single_source2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd_single_source2:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX2-SHUF-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %rhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %sum = add <4 x i32> %lhs, %rhs
  %perm = shufflevector <4 x i32> %sum, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
  ret <4 x i32> %perm
}

; Single input, only lane 2 is defined (x[0] + x[1]); every run records a
; horizontal add.
define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd_single_source3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %rhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %sum = add <4 x i32> %lhs, %rhs
  ret <4 x i32> %sum
}

; One shuffle only: x[2] is placed in lane 3 and added to the unshuffled
; input, so lane 3 holds x[2] + x[3]. A horizontal add is recorded only for
; the fast-hops runs.
define <4 x i32> @phaddd_single_source4(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source4:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd_single_source4:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd_single_source4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd_single_source4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd_single_source4:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX2-SHUF-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %sum = add <4 x i32> %lhs, %x
  ret <4 x i32> %sum
}

; phaddd_single_source4 followed by a shuffle that moves sum lane 3 into
; result lane 0.
define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source5:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd_single_source5:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd_single_source5:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd_single_source5:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd_single_source5:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-SHUF-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %sum = add <4 x i32> %lhs, %x
  %perm = shufflevector <4 x i32> %sum, <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %perm
}

; phaddd_single_source3 followed by a shuffle that moves sum lane 2 into
; result lane 1; every run records a horizontal add plus one shuffle.
define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source6:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd_single_source6:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT:    retq
  %lhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %rhs = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %sum = add <4 x i32> %lhs, %rhs
  %perm = shufflevector <4 x i32> %sum, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x i32> %perm
}

; i16 version of phaddd_single_source1: the four pairwise sums land in the
; high four lanes, the low four lanes are undef.
define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw_single_source1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %rhs = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %sum = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %sum
}

; phaddw_single_source1 followed by a shuffle selecting sum lanes 5, 4, 3, 2
; into the low four result lanes. With fast-hops the recorded output is a
; horizontal add plus one shuffle; otherwise two shuffles plus a plain add.
define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
; SSSE3-SLOW-LABEL: phaddw_single_source2:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
; SSSE3-SLOW-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddw_single_source2:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddw_single_source2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
; AVX-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddw_single_source2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddw_single_source2:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SHUF-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SHUF-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %lhs = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %rhs = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %sum = add <8 x i16> %lhs, %rhs
  %perm = shufflevector <8 x i16> %sum, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %perm
}

; Single input, only lanes 4 and 5 are defined (x[0] + x[1], x[2] + x[3]);
; every run records a horizontal add.
define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw_single_source3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %lhs = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 undef, i32 undef>
  %rhs = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 undef, i32 undef>
  %sum = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %sum
}

; One shuffle only: x[6] is placed in lane 7 and added to the unshuffled
; input, so lane 7 holds x[6] + x[7]. Without fast-hops the recorded output
; is a 16-bit left shift of each dword followed by a plain add.
define <8 x i16> @phaddw_single_source4(<8 x i16> %x) {
; SSSE3-SLOW-LABEL: phaddw_single_source4:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    pslld $16, %xmm1
; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddw_single_source4:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddw_single_source4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpslld $16, %xmm0, %xmm1
; AVX-SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddw_single_source4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddw_single_source4:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpslld $16, %xmm0, %xmm1
; AVX2-SHUF-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    retq
  %lhs = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6>
  %sum = add <8 x i16> %lhs, %x
  ret <8 x i16> %sum
}

; Only sum lane 4 is defined (x[0] + x[1]); a trailing shuffle moves it to
; result lane 1. Every run records a horizontal add plus a byte shift.
define <8 x i16> @phaddw_single_source6(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source6:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw_single_source6:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT:    retq
  %lhs = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef>
  %rhs = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
  %sum = add <8 x i16> %lhs, %rhs
  %perm = shufflevector <8 x i16> %sum, <8 x i16> undef, <8 x i32> <i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %perm
}

; PR39921 + PR39936
; Three-step add reduction of a <8 x i32> argument down to the scalar in
; lane 0: two even/odd pairwise-add steps, then lane 1 added onto lane 0.
; The last step is recorded as a horizontal add only for the fast-hops runs.
define i32 @PR39936_v8i32(<8 x i32>) {
; SSSE3-SLOW-LABEL: PR39936_v8i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    movd %xmm1, %eax
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR39936_v8i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    movd %xmm0, %eax
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR39936_v8i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR39936_v8i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-SLOW-LABEL: PR39936_v8i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: PR39936_v8i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: PR39936_v8i32:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SHUF-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    vmovd %xmm0, %eax
; AVX2-SHUF-NEXT:    vzeroupper
; AVX2-SHUF-NEXT:    retq
  %evens0 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %odds0 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %step0 = add <8 x i32> %evens0, %odds0
  %evens1 = shufflevector <8 x i32> %step0, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %odds1 = shufflevector <8 x i32> %step0, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %step1 = add <8 x i32> %evens1, %odds1
  %lane1 = shufflevector <8 x i32> %step1, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %step2 = add <8 x i32> %lane1, %step1
  %total = extractelement <8 x i32> %step2, i32 0
  ret i32 %total
}