; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
;
; Verify that the DAG combiner correctly folds bitwise operations across
; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
; basic and always-safe patterns. Also test that the DAG combiner will combine
; target-specific shuffle instructions where reasonable.
target triple = "x86_64-unknown-unknown"

declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)

; Chains of pshufd/pshuflw/pshufhw intrinsics that undo each other should fold
; away completely (the expected output below is just retq).
define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
; CHECK-LABEL: combine_pshufd1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
  ret <4 x i32> %c
}

define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
; CHECK-LABEL: combine_pshufd2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
  ret <4 x i32> %d
}

define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
; CHECK-LABEL: combine_pshufd3:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
  ret <4 x i32> %d
}

; The surrounding pshufd pair cancels, leaving a single pshufhw.
define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd4:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufd4:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
  ret <4 x i32> %d
}

; The surrounding pshufd pair cancels, leaving a single pshuflw.
define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd5:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufd5:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
  ret <4 x i32> %d
}

; The two pshufd intrinsics combine into a single splat of element 0
; (broadcast on AVX2).
define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd6:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_pshufd6:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_pshufd6:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
; AVX2-NEXT: retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
  ret <4 x i32> %c
}

define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
; CHECK-LABEL: combine_pshuflw1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
  ret <8 x i16> %c
}

define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
; CHECK-LABEL: combine_pshuflw2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}

; The outer pshuflw pair cancels, leaving only the pshufhw.
define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
; SSE-LABEL: combine_pshuflw3:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshuflw3:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT: retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}

; The outer pshufhw pair cancels, leaving only the pshuflw.
define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
; SSE-LABEL: combine_pshufhw1:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufhw1:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT: retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
  %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}

; Two identical swizzles feeding a bitwise op should be hoisted past the op,
; leaving a single bitwise op followed by a single shuffle.
define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test1:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test1:
; AVX: # %bb.0:
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test2:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test2:
; AVX: # %bb.0:
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test3:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test3:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

; Same as tests 1-3 but the swizzled operands come from the second shuffle input.
define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test4:
; SSE: # %bb.0:
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test4:
; AVX: # %bb.0:
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test5:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test5:
; AVX: # %bb.0:
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test6:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test6:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}


; Verify that DAGCombiner moves the shuffle after the xor/and/or even if the
; shuffles are not performing swizzle operations.

define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test1b:
; SSE2: # %bb.0:
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test1b:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test1b:
; SSE41: # %bb.0:
; SSE41-NEXT: andps %xmm1, %xmm0
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test1b:
; AVX: # %bb.0:
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test2b:
; SSE2: # %bb.0:
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test2b:
; SSSE3: # %bb.0:
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test2b:
; SSE41: # %bb.0:
; SSE41-NEXT: orps %xmm1, %xmm0
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test2b:
; AVX: # %bb.0:
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

; For xor, the lanes taken from %c cancel to zero, so this folds to a masking
; 'and' with a constant pool value on pre-SSE4.1 targets.
define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3b:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm0
; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3b:
; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm0
; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test3b:
; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm0
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test3b:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test4b:
; SSE2: # %bb.0:
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test4b:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test4b:
; SSE41: # %bb.0:
; SSE41-NEXT: andps %xmm1, %xmm0
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test4b:
; AVX: # %bb.0:
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test5b:
; SSE2: # %bb.0:
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test5b:
; SSSE3: # %bb.0:
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test5b:
; SSE41: # %bb.0:
; SSE41-NEXT: orps %xmm1, %xmm0
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test5b:
; AVX: # %bb.0:
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6b:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm0
; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6b:
; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm0
; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test6b:
; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm0
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test6b:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test1c:
; SSE: # %bb.0:
; SSE-NEXT: andps %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test1c:
; AVX: # %bb.0:
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test2c:
; SSE: # %bb.0:
; SSE-NEXT: orps %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test2c:
; AVX: # %bb.0:
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3c:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm0
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3c:
; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm0
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test3c:
; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm0
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test3c:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test4c:
; SSE: # %bb.0:
; SSE-NEXT: andps %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test4c:
; AVX: # %bb.0:
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}

define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test5c:
; SSE: # %bb.0:
; SSE-NEXT: orps %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test5c:
; AVX: # %bb.0:
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}

define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6c:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm0
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6c:
; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm0
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test6c:
; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm0
; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test6c:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
; AVX-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}

; Nested shuffles where the outer shuffle has undef/second-operand lanes should
; combine into a single shuffle of one input vector.
define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test1:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test1:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test2:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test2:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test3:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test3:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test4:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_nested_undef_test4:
; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test5:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test5:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test6:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test6:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test7:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test7:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test8:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test8:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test9:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test9:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,2]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test10:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test10:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test11:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test11:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,1]
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test12:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_nested_undef_test12:
; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test12:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
; AVX2-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
  ret <4 x i32> %2
}

; The following pair of shuffles is folded into vector %A.
define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: combine_nested_undef_test13:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
  ret <4 x i32> %2
}

; The following pair of shuffles is folded into vector %B.
define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test14:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test14:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
  ret <4 x i32> %2
}


; Verify that we don't optimize the following cases. We expect more than one shuffle.
;
; FIXME: Many of these already don't make sense, and the rest should stop
; making sense with the new vector shuffle lowering. Revisit at least testing for
; it.

; Negative case: the combined mask needs live lanes from both %A and %B, so a
; single shuffle cannot express it; each target still emits a shuffle per input
; plus a merge (shufps pair, or pshufd/pshufd + blend on SSE4.1/AVX).
define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test15:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test15:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test15:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test15:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test15:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Result mixes lanes of both inputs (A[2],B[1],A[0],B[3]), so an unpack or
; blend of two per-input shuffles remains.
define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test16:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Result is A[3],A[1],B[0],A[1]: one lane of %B survives, so a blend plus a
; shuffle is still required.
define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test17:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test17:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test17:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test17:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}
945 946define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) { 947; SSE-LABEL: combine_nested_undef_test18: 948; SSE: # %bb.0: 949; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3] 950; SSE-NEXT: retq 951; 952; AVX-LABEL: combine_nested_undef_test18: 953; AVX: # %bb.0: 954; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1,0,3] 955; AVX-NEXT: retq 956 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 957 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3> 958 ret <4 x i32> %2 959} 960 961define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) { 962; SSE2-LABEL: combine_nested_undef_test19: 963; SSE2: # %bb.0: 964; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 965; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] 966; SSE2-NEXT: retq 967; 968; SSSE3-LABEL: combine_nested_undef_test19: 969; SSSE3: # %bb.0: 970; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 971; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] 972; SSSE3-NEXT: retq 973; 974; SSE41-LABEL: combine_nested_undef_test19: 975; SSE41: # %bb.0: 976; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 977; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] 978; SSE41-NEXT: retq 979; 980; AVX-LABEL: combine_nested_undef_test19: 981; AVX: # %bb.0: 982; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 983; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,0] 984; AVX-NEXT: retq 985 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6> 986 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0> 987 ret <4 x i32> %2 988} 989 990define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) { 991; SSE2-LABEL: combine_nested_undef_test20: 992; SSE2: # %bb.0: 993; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 994; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] 995; 
SSE2-NEXT: movaps %xmm1, %xmm0 996; SSE2-NEXT: retq 997; 998; SSSE3-LABEL: combine_nested_undef_test20: 999; SSSE3: # %bb.0: 1000; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 1001; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] 1002; SSSE3-NEXT: movaps %xmm1, %xmm0 1003; SSSE3-NEXT: retq 1004; 1005; SSE41-LABEL: combine_nested_undef_test20: 1006; SSE41: # %bb.0: 1007; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 1008; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] 1009; SSE41-NEXT: retq 1010; 1011; AVX-LABEL: combine_nested_undef_test20: 1012; AVX: # %bb.0: 1013; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1014; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,3,0] 1015; AVX-NEXT: retq 1016 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4> 1017 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 1018 ret <4 x i32> %2 1019} 1020 1021define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) { 1022; SSE2-LABEL: combine_nested_undef_test21: 1023; SSE2: # %bb.0: 1024; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1025; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3] 1026; SSE2-NEXT: retq 1027; 1028; SSSE3-LABEL: combine_nested_undef_test21: 1029; SSSE3: # %bb.0: 1030; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1031; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3] 1032; SSSE3-NEXT: retq 1033; 1034; SSE41-LABEL: combine_nested_undef_test21: 1035; SSE41: # %bb.0: 1036; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1037; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1038; SSE41-NEXT: retq 1039; 1040; AVX1-LABEL: combine_nested_undef_test21: 1041; AVX1: # %bb.0: 1042; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1043; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] 1044; AVX1-NEXT: retq 1045; 1046; AVX2-LABEL: combine_nested_undef_test21: 1047; AVX2: # 
%bb.0: 1048; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1049; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1050; AVX2-NEXT: retq 1051 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> 1052 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> 1053 ret <4 x i32> %2 1054} 1055 1056 1057; Test that we correctly combine shuffles according to rule 1058; shuffle(shuffle(x, y), undef) -> shuffle(y, undef) 1059 1060define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) { 1061; SSE-LABEL: combine_nested_undef_test22: 1062; SSE: # %bb.0: 1063; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3] 1064; SSE-NEXT: retq 1065; 1066; AVX-LABEL: combine_nested_undef_test22: 1067; AVX: # %bb.0: 1068; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1,1,3] 1069; AVX-NEXT: retq 1070 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1071 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3> 1072 ret <4 x i32> %2 1073} 1074 1075define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) { 1076; SSE-LABEL: combine_nested_undef_test23: 1077; SSE: # %bb.0: 1078; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] 1079; SSE-NEXT: retq 1080; 1081; AVX-LABEL: combine_nested_undef_test23: 1082; AVX: # %bb.0: 1083; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1,0,3] 1084; AVX-NEXT: retq 1085 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1086 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> 1087 ret <4 x i32> %2 1088} 1089 1090define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) { 1091; SSE-LABEL: combine_nested_undef_test24: 1092; SSE: # %bb.0: 1093; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] 1094; SSE-NEXT: retq 1095; 1096; AVX-LABEL: combine_nested_undef_test24: 1097; AVX: # %bb.0: 1098; AVX-NEXT: vshufps {{.*#+}} xmm0 = 
xmm1[0,3,2,3] 1099; AVX-NEXT: retq 1100 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1101 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4> 1102 ret <4 x i32> %2 1103} 1104 1105define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) { 1106; SSE-LABEL: combine_nested_undef_test25: 1107; SSE: # %bb.0: 1108; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1109; SSE-NEXT: retq 1110; 1111; AVX1-LABEL: combine_nested_undef_test25: 1112; AVX1: # %bb.0: 1113; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] 1114; AVX1-NEXT: retq 1115; 1116; AVX2-LABEL: combine_nested_undef_test25: 1117; AVX2: # %bb.0: 1118; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1119; AVX2-NEXT: retq 1120 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4> 1121 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1> 1122 ret <4 x i32> %2 1123} 1124 1125define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) { 1126; SSE-LABEL: combine_nested_undef_test26: 1127; SSE: # %bb.0: 1128; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1129; SSE-NEXT: retq 1130; 1131; AVX-LABEL: combine_nested_undef_test26: 1132; AVX: # %bb.0: 1133; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] 1134; AVX-NEXT: retq 1135 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7> 1136 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3> 1137 ret <4 x i32> %2 1138} 1139 1140define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) { 1141; SSE-LABEL: combine_nested_undef_test27: 1142; SSE: # %bb.0: 1143; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1144; SSE-NEXT: retq 1145; 1146; AVX1-LABEL: combine_nested_undef_test27: 1147; AVX1: # %bb.0: 1148; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] 1149; AVX1-NEXT: retq 1150; 1151; AVX2-LABEL: combine_nested_undef_test27: 
1152; AVX2: # %bb.0: 1153; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1154; AVX2-NEXT: retq 1155 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4> 1156 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2> 1157 ret <4 x i32> %2 1158} 1159 1160define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) { 1161; SSE-LABEL: combine_nested_undef_test28: 1162; SSE: # %bb.0: 1163; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] 1164; SSE-NEXT: retq 1165; 1166; AVX-LABEL: combine_nested_undef_test28: 1167; AVX: # %bb.0: 1168; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,0] 1169; AVX-NEXT: retq 1170 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 1171 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2> 1172 ret <4 x i32> %2 1173} 1174 1175define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) { 1176; SSE-LABEL: combine_test1: 1177; SSE: # %bb.0: 1178; SSE-NEXT: movaps %xmm1, %xmm0 1179; SSE-NEXT: retq 1180; 1181; AVX-LABEL: combine_test1: 1182; AVX: # %bb.0: 1183; AVX-NEXT: vmovaps %xmm1, %xmm0 1184; AVX-NEXT: retq 1185 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1186 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1187 ret <4 x float> %2 1188} 1189 1190define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) { 1191; SSE2-LABEL: combine_test2: 1192; SSE2: # %bb.0: 1193; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1194; SSE2-NEXT: movaps %xmm1, %xmm0 1195; SSE2-NEXT: retq 1196; 1197; SSSE3-LABEL: combine_test2: 1198; SSSE3: # %bb.0: 1199; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1200; SSSE3-NEXT: movaps %xmm1, %xmm0 1201; SSSE3-NEXT: retq 1202; 1203; SSE41-LABEL: combine_test2: 1204; SSE41: # %bb.0: 1205; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1206; SSE41-NEXT: retq 1207; 
1208; AVX-LABEL: combine_test2: 1209; AVX: # %bb.0: 1210; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1211; AVX-NEXT: retq 1212 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1213 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1214 ret <4 x float> %2 1215} 1216 1217define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) { 1218; SSE-LABEL: combine_test3: 1219; SSE: # %bb.0: 1220; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1221; SSE-NEXT: retq 1222; 1223; AVX-LABEL: combine_test3: 1224; AVX: # %bb.0: 1225; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1226; AVX-NEXT: retq 1227 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1228 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1229 ret <4 x float> %2 1230} 1231 1232define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) { 1233; SSE-LABEL: combine_test4: 1234; SSE: # %bb.0: 1235; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1236; SSE-NEXT: retq 1237; 1238; AVX-LABEL: combine_test4: 1239; AVX: # %bb.0: 1240; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1241; AVX-NEXT: retq 1242 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1243 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1244 ret <4 x float> %2 1245} 1246 1247define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) { 1248; SSE2-LABEL: combine_test5: 1249; SSE2: # %bb.0: 1250; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1251; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1252; SSE2-NEXT: retq 1253; 1254; SSSE3-LABEL: combine_test5: 1255; SSSE3: # %bb.0: 1256; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1257; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1258; SSSE3-NEXT: retq 1259; 1260; SSE41-LABEL: combine_test5: 
1261; SSE41: # %bb.0: 1262; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1263; SSE41-NEXT: retq 1264; 1265; AVX-LABEL: combine_test5: 1266; AVX: # %bb.0: 1267; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1268; AVX-NEXT: retq 1269 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1270 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1271 ret <4 x float> %2 1272} 1273 1274define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) { 1275; SSE-LABEL: combine_test6: 1276; SSE: # %bb.0: 1277; SSE-NEXT: movaps %xmm1, %xmm0 1278; SSE-NEXT: retq 1279; 1280; AVX-LABEL: combine_test6: 1281; AVX: # %bb.0: 1282; AVX-NEXT: vmovaps %xmm1, %xmm0 1283; AVX-NEXT: retq 1284 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1285 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1286 ret <4 x i32> %2 1287} 1288 1289define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) { 1290; SSE2-LABEL: combine_test7: 1291; SSE2: # %bb.0: 1292; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1293; SSE2-NEXT: movaps %xmm1, %xmm0 1294; SSE2-NEXT: retq 1295; 1296; SSSE3-LABEL: combine_test7: 1297; SSSE3: # %bb.0: 1298; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1299; SSSE3-NEXT: movaps %xmm1, %xmm0 1300; SSSE3-NEXT: retq 1301; 1302; SSE41-LABEL: combine_test7: 1303; SSE41: # %bb.0: 1304; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1305; SSE41-NEXT: retq 1306; 1307; AVX-LABEL: combine_test7: 1308; AVX: # %bb.0: 1309; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1310; AVX-NEXT: retq 1311 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1312 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1313 ret <4 x i32> %2 1314} 1315 1316define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) { 1317; SSE-LABEL: 
combine_test8: 1318; SSE: # %bb.0: 1319; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1320; SSE-NEXT: retq 1321; 1322; AVX-LABEL: combine_test8: 1323; AVX: # %bb.0: 1324; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1325; AVX-NEXT: retq 1326 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1327 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1328 ret <4 x i32> %2 1329} 1330 1331define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) { 1332; SSE-LABEL: combine_test9: 1333; SSE: # %bb.0: 1334; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1335; SSE-NEXT: movaps %xmm1, %xmm0 1336; SSE-NEXT: retq 1337; 1338; AVX-LABEL: combine_test9: 1339; AVX: # %bb.0: 1340; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1341; AVX-NEXT: retq 1342 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1343 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1344 ret <4 x i32> %2 1345} 1346 1347define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) { 1348; SSE2-LABEL: combine_test10: 1349; SSE2: # %bb.0: 1350; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1351; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1352; SSE2-NEXT: retq 1353; 1354; SSSE3-LABEL: combine_test10: 1355; SSSE3: # %bb.0: 1356; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1357; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1358; SSSE3-NEXT: retq 1359; 1360; SSE41-LABEL: combine_test10: 1361; SSE41: # %bb.0: 1362; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1363; SSE41-NEXT: retq 1364; 1365; AVX-LABEL: combine_test10: 1366; AVX: # %bb.0: 1367; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1368; AVX-NEXT: retq 1369 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1370 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 
7> 1371 ret <4 x i32> %2 1372} 1373 1374define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) { 1375; CHECK-LABEL: combine_test11: 1376; CHECK: # %bb.0: 1377; CHECK-NEXT: retq 1378 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1379 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1380 ret <4 x float> %2 1381} 1382 1383define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) { 1384; SSE2-LABEL: combine_test12: 1385; SSE2: # %bb.0: 1386; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1387; SSE2-NEXT: movaps %xmm1, %xmm0 1388; SSE2-NEXT: retq 1389; 1390; SSSE3-LABEL: combine_test12: 1391; SSSE3: # %bb.0: 1392; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1393; SSSE3-NEXT: movaps %xmm1, %xmm0 1394; SSSE3-NEXT: retq 1395; 1396; SSE41-LABEL: combine_test12: 1397; SSE41: # %bb.0: 1398; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1399; SSE41-NEXT: retq 1400; 1401; AVX-LABEL: combine_test12: 1402; AVX: # %bb.0: 1403; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1404; AVX-NEXT: retq 1405 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1406 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1407 ret <4 x float> %2 1408} 1409 1410define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) { 1411; SSE-LABEL: combine_test13: 1412; SSE: # %bb.0: 1413; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1414; SSE-NEXT: retq 1415; 1416; AVX-LABEL: combine_test13: 1417; AVX: # %bb.0: 1418; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1419; AVX-NEXT: retq 1420 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1421 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 1422 ret <4 x float> %2 1423} 1424 1425define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) { 
1426; SSE-LABEL: combine_test14: 1427; SSE: # %bb.0: 1428; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1429; SSE-NEXT: retq 1430; 1431; AVX-LABEL: combine_test14: 1432; AVX: # %bb.0: 1433; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1434; AVX-NEXT: retq 1435 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> 1436 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1437 ret <4 x float> %2 1438} 1439 1440define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) { 1441; SSE2-LABEL: combine_test15: 1442; SSE2: # %bb.0: 1443; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1444; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1445; SSE2-NEXT: retq 1446; 1447; SSSE3-LABEL: combine_test15: 1448; SSSE3: # %bb.0: 1449; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1450; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1451; SSSE3-NEXT: retq 1452; 1453; SSE41-LABEL: combine_test15: 1454; SSE41: # %bb.0: 1455; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1456; SSE41-NEXT: retq 1457; 1458; AVX-LABEL: combine_test15: 1459; AVX: # %bb.0: 1460; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1461; AVX-NEXT: retq 1462 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1463 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1464 ret <4 x float> %2 1465} 1466 1467define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) { 1468; CHECK-LABEL: combine_test16: 1469; CHECK: # %bb.0: 1470; CHECK-NEXT: retq 1471 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1472 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1473 ret <4 x i32> %2 1474} 1475 1476define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) { 1477; SSE2-LABEL: combine_test17: 1478; SSE2: # %bb.0: 1479; SSE2-NEXT: 
movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1480; SSE2-NEXT: movaps %xmm1, %xmm0 1481; SSE2-NEXT: retq 1482; 1483; SSSE3-LABEL: combine_test17: 1484; SSSE3: # %bb.0: 1485; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1486; SSSE3-NEXT: movaps %xmm1, %xmm0 1487; SSSE3-NEXT: retq 1488; 1489; SSE41-LABEL: combine_test17: 1490; SSE41: # %bb.0: 1491; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1492; SSE41-NEXT: retq 1493; 1494; AVX-LABEL: combine_test17: 1495; AVX: # %bb.0: 1496; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1497; AVX-NEXT: retq 1498 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1499 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1500 ret <4 x i32> %2 1501} 1502 1503define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) { 1504; SSE-LABEL: combine_test18: 1505; SSE: # %bb.0: 1506; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1507; SSE-NEXT: retq 1508; 1509; AVX-LABEL: combine_test18: 1510; AVX: # %bb.0: 1511; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1512; AVX-NEXT: retq 1513 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1514 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 1515 ret <4 x i32> %2 1516} 1517 1518define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) { 1519; SSE-LABEL: combine_test19: 1520; SSE: # %bb.0: 1521; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1522; SSE-NEXT: retq 1523; 1524; AVX-LABEL: combine_test19: 1525; AVX: # %bb.0: 1526; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1527; AVX-NEXT: retq 1528 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> 1529 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1530 ret <4 x i32> %2 1531} 1532 1533define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) { 1534; SSE2-LABEL: combine_test20: 1535; 
SSE2: # %bb.0: 1536; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1537; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1538; SSE2-NEXT: retq 1539; 1540; SSSE3-LABEL: combine_test20: 1541; SSSE3: # %bb.0: 1542; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1543; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1544; SSSE3-NEXT: retq 1545; 1546; SSE41-LABEL: combine_test20: 1547; SSE41: # %bb.0: 1548; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1549; SSE41-NEXT: retq 1550; 1551; AVX-LABEL: combine_test20: 1552; AVX: # %bb.0: 1553; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1554; AVX-NEXT: retq 1555 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1556 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1557 ret <4 x i32> %2 1558} 1559 1560define <4 x i32> @combine_test21(<8 x i32> %a, ptr %ptr) { 1561; SSE-LABEL: combine_test21: 1562; SSE: # %bb.0: 1563; SSE-NEXT: movaps %xmm0, %xmm2 1564; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] 1565; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1566; SSE-NEXT: movaps %xmm2, (%rdi) 1567; SSE-NEXT: retq 1568; 1569; AVX1-LABEL: combine_test21: 1570; AVX1: # %bb.0: 1571; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1572; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] 1573; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1574; AVX1-NEXT: vmovaps %xmm2, (%rdi) 1575; AVX1-NEXT: vzeroupper 1576; AVX1-NEXT: retq 1577; 1578; AVX2-LABEL: combine_test21: 1579; AVX2: # %bb.0: 1580; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,3,2,3] 1581; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] 1582; AVX2-NEXT: vmovaps %xmm0, (%rdi) 1583; AVX2-NEXT: vmovaps %xmm1, %xmm0 1584; AVX2-NEXT: vzeroupper 1585; AVX2-NEXT: retq 1586 %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1587 %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 
  store <4 x i32> %1, ptr %ptr, align 16
  ret <4 x i32> %2
}

; Concatenating two loaded <2 x float> values into the low half of an
; <8 x float> (upper half undef) lowers to a scalar-load pair: movsd fills the
; low 8 bytes, movhps fills the high 8 bytes of one xmm register.
define <8 x float> @combine_test22(ptr %a, ptr %b) {
; SSE-LABEL: combine_test22:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test22:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; AVX-NEXT:    retq
; Current AVX2 lowering of this is still awful, not adding a test case.
  %1 = load <2 x float>, ptr %a, align 8
  %2 = load <2 x float>, ptr %b, align 8
  %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x float> %3
}

; PR22359
; The two extracted <2 x float> halves are stored to adjacent addresses, so the
; pair of 8-byte stores merges into a single unaligned 16-byte store of xmm0.
define void @combine_test23(<8 x float> %v, ptr %ptr) {
; SSE-LABEL: combine_test23:
; SSE:       # %bb.0:
; SSE-NEXT:    movups %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test23:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups %xmm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %idx2 = getelementptr inbounds <2 x float>, ptr %ptr, i64 1
  %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1>
  %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3>
  store <2 x float> %shuffle0, ptr %ptr, align 8
  store <2 x float> %shuffle1, ptr %idx2, align 8
  ret void
}

; Check some negative cases.
; FIXME: Do any of these really make sense? Are they redundant with the above tests?
1633 1634define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) { 1635; SSE-LABEL: combine_test1b: 1636; SSE: # %bb.0: 1637; SSE-NEXT: movaps %xmm1, %xmm0 1638; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 1639; SSE-NEXT: retq 1640; 1641; AVX-LABEL: combine_test1b: 1642; AVX: # %bb.0: 1643; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1,2,0] 1644; AVX-NEXT: retq 1645 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1646 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0> 1647 ret <4 x float> %2 1648} 1649 1650define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) { 1651; SSE2-LABEL: combine_test2b: 1652; SSE2: # %bb.0: 1653; SSE2-NEXT: movaps %xmm1, %xmm0 1654; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1655; SSE2-NEXT: retq 1656; 1657; SSSE3-LABEL: combine_test2b: 1658; SSSE3: # %bb.0: 1659; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] 1660; SSSE3-NEXT: retq 1661; 1662; SSE41-LABEL: combine_test2b: 1663; SSE41: # %bb.0: 1664; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] 1665; SSE41-NEXT: retq 1666; 1667; AVX-LABEL: combine_test2b: 1668; AVX: # %bb.0: 1669; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] 1670; AVX-NEXT: retq 1671 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1672 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5> 1673 ret <4 x float> %2 1674} 1675 1676define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) { 1677; SSE2-LABEL: combine_test3b: 1678; SSE2: # %bb.0: 1679; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1680; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 1681; SSE2-NEXT: retq 1682; 1683; SSSE3-LABEL: combine_test3b: 1684; SSSE3: # %bb.0: 1685; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1686; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 1687; SSSE3-NEXT: retq 1688; 1689; SSE41-LABEL: 
combine_test3b: 1690; SSE41: # %bb.0: 1691; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1692; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 1693; SSE41-NEXT: retq 1694; 1695; AVX-LABEL: combine_test3b: 1696; AVX: # %bb.0: 1697; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1698; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 1699; AVX-NEXT: retq 1700 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3> 1701 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7> 1702 ret <4 x float> %2 1703} 1704 1705define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) { 1706; SSE-LABEL: combine_test4b: 1707; SSE: # %bb.0: 1708; SSE-NEXT: movaps %xmm1, %xmm0 1709; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] 1710; SSE-NEXT: retq 1711; 1712; AVX-LABEL: combine_test4b: 1713; AVX: # %bb.0: 1714; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1,2,3] 1715; AVX-NEXT: retq 1716 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1717 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7> 1718 ret <4 x float> %2 1719} 1720 1721 1722; Verify that we correctly fold shuffles even when we use illegal vector types. 
1723 1724define <4 x i8> @combine_test1c(ptr %a, ptr %b) { 1725; SSE2-LABEL: combine_test1c: 1726; SSE2: # %bb.0: 1727; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1728; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1729; SSE2-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1730; SSE2-NEXT: andps %xmm0, %xmm2 1731; SSE2-NEXT: andnps %xmm1, %xmm0 1732; SSE2-NEXT: orps %xmm2, %xmm0 1733; SSE2-NEXT: retq 1734; 1735; SSSE3-LABEL: combine_test1c: 1736; SSSE3: # %bb.0: 1737; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1738; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1739; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1740; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] 1741; SSSE3-NEXT: retq 1742; 1743; SSE41-LABEL: combine_test1c: 1744; SSE41: # %bb.0: 1745; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1746; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 1747; SSE41-NEXT: movss {{.*#+}} xmm0 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0] 1748; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1749; SSE41-NEXT: movdqa %xmm1, %xmm0 1750; SSE41-NEXT: retq 1751; 1752; AVX-LABEL: combine_test1c: 1753; AVX: # %bb.0: 1754; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1755; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1756; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0] 1757; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1758; AVX-NEXT: retq 1759 %A = load <4 x i8>, ptr %a 1760 %B = load <4 x i8>, ptr %b 1761 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1762 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1763 ret <4 x i8> %2 1764} 1765 1766define <4 x i8> @combine_test2c(ptr %a, ptr %b) { 1767; SSE-LABEL: combine_test2c: 1768; 
SSE: # %bb.0: 1769; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1770; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1771; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1772; SSE-NEXT: retq 1773; 1774; AVX-LABEL: combine_test2c: 1775; AVX: # %bb.0: 1776; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1777; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1778; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1779; AVX-NEXT: retq 1780 %A = load <4 x i8>, ptr %a 1781 %B = load <4 x i8>, ptr %b 1782 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5> 1783 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1784 ret <4 x i8> %2 1785} 1786 1787define <4 x i8> @combine_test3c(ptr %a, ptr %b) { 1788; SSE-LABEL: combine_test3c: 1789; SSE: # %bb.0: 1790; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1791; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1792; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1793; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1794; SSE-NEXT: retq 1795; 1796; AVX-LABEL: combine_test3c: 1797; AVX: # %bb.0: 1798; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1799; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1800; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1801; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1802; AVX-NEXT: retq 1803 %A = load <4 x i8>, ptr %a 1804 %B = load <4 x i8>, ptr %b 1805 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1806 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1807 ret <4 x i8> %2 1808} 1809 1810define <4 x i8> @combine_test4c(ptr %a, ptr %b) { 1811; SSE2-LABEL: combine_test4c: 1812; SSE2: # %bb.0: 1813; 
SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1814; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1815; SSE2-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1816; SSE2-NEXT: andps %xmm0, %xmm2 1817; SSE2-NEXT: andnps %xmm1, %xmm0 1818; SSE2-NEXT: orps %xmm2, %xmm0 1819; SSE2-NEXT: retq 1820; 1821; SSSE3-LABEL: combine_test4c: 1822; SSSE3: # %bb.0: 1823; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1824; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1825; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1826; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u] 1827; SSSE3-NEXT: retq 1828; 1829; SSE41-LABEL: combine_test4c: 1830; SSE41: # %bb.0: 1831; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1832; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 1833; SSE41-NEXT: movss {{.*#+}} xmm0 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0] 1834; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1835; SSE41-NEXT: movdqa %xmm1, %xmm0 1836; SSE41-NEXT: retq 1837; 1838; AVX-LABEL: combine_test4c: 1839; AVX: # %bb.0: 1840; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1841; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1842; AVX-NEXT: vmovd {{.*#+}} xmm2 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0] 1843; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1844; AVX-NEXT: retq 1845 %A = load <4 x i8>, ptr %a 1846 %B = load <4 x i8>, ptr %b 1847 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1848 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1849 ret <4 x i8> %2 1850} 1851 1852 1853; The following test cases are generated from this C++ code 1854; 1855;__m128 blend_01(__m128 a, __m128 b) 1856;{ 1857; __m128 s = a; 1858; s = _mm_blend_ps( s, b, 1<<0 ); 1859; s = _mm_blend_ps( s, b, 
1<<1 ); 1860; return s; 1861;} 1862; 1863;__m128 blend_02(__m128 a, __m128 b) 1864;{ 1865; __m128 s = a; 1866; s = _mm_blend_ps( s, b, 1<<0 ); 1867; s = _mm_blend_ps( s, b, 1<<2 ); 1868; return s; 1869;} 1870; 1871;__m128 blend_123(__m128 a, __m128 b) 1872;{ 1873; __m128 s = a; 1874; s = _mm_blend_ps( s, b, 1<<1 ); 1875; s = _mm_blend_ps( s, b, 1<<2 ); 1876; s = _mm_blend_ps( s, b, 1<<3 ); 1877; return s; 1878;} 1879 1880; Ideally, we should collapse the following shuffles into a single one. 1881 1882define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) { 1883; SSE2-LABEL: combine_blend_01: 1884; SSE2: # %bb.0: 1885; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1886; SSE2-NEXT: retq 1887; 1888; SSSE3-LABEL: combine_blend_01: 1889; SSSE3: # %bb.0: 1890; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1891; SSSE3-NEXT: retq 1892; 1893; SSE41-LABEL: combine_blend_01: 1894; SSE41: # %bb.0: 1895; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1896; SSE41-NEXT: retq 1897; 1898; AVX-LABEL: combine_blend_01: 1899; AVX: # %bb.0: 1900; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1901; AVX-NEXT: retq 1902 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3> 1903 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1904 ret <4 x float> %shuffle6 1905} 1906 1907define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) { 1908; SSE2-LABEL: combine_blend_02: 1909; SSE2: # %bb.0: 1910; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] 1911; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] 1912; SSE2-NEXT: movaps %xmm1, %xmm0 1913; SSE2-NEXT: retq 1914; 1915; SSSE3-LABEL: combine_blend_02: 1916; SSSE3: # %bb.0: 1917; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] 1918; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] 1919; SSSE3-NEXT: movaps %xmm1, %xmm0 1920; SSSE3-NEXT: retq 1921; 1922; SSE41-LABEL: combine_blend_02: 1923; 
SSE41: # %bb.0: 1924; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 1925; SSE41-NEXT: retq 1926; 1927; AVX-LABEL: combine_blend_02: 1928; AVX: # %bb.0: 1929; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 1930; AVX-NEXT: retq 1931 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3> 1932 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1933 ret <4 x float> %shuffle6 1934} 1935 1936define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) { 1937; SSE2-LABEL: combine_blend_123: 1938; SSE2: # %bb.0: 1939; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1940; SSE2-NEXT: movaps %xmm1, %xmm0 1941; SSE2-NEXT: retq 1942; 1943; SSSE3-LABEL: combine_blend_123: 1944; SSSE3: # %bb.0: 1945; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1946; SSSE3-NEXT: movaps %xmm1, %xmm0 1947; SSSE3-NEXT: retq 1948; 1949; SSE41-LABEL: combine_blend_123: 1950; SSE41: # %bb.0: 1951; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1952; SSE41-NEXT: retq 1953; 1954; AVX-LABEL: combine_blend_123: 1955; AVX: # %bb.0: 1956; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1957; AVX-NEXT: retq 1958 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef> 1959 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef> 1960 %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1961 ret <4 x float> %shuffle12 1962} 1963 1964define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) { 1965; SSE-LABEL: combine_test_movhl_1: 1966; SSE: # %bb.0: 1967; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1968; SSE-NEXT: movaps %xmm1, %xmm0 1969; SSE-NEXT: retq 1970; 1971; AVX-LABEL: combine_test_movhl_1: 1972; AVX: # %bb.0: 1973; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 
1974; AVX-NEXT: retq 1975 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3> 1976 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3> 1977 ret <4 x i32> %2 1978} 1979 1980define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) { 1981; SSE-LABEL: combine_test_movhl_2: 1982; SSE: # %bb.0: 1983; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1984; SSE-NEXT: movaps %xmm1, %xmm0 1985; SSE-NEXT: retq 1986; 1987; AVX-LABEL: combine_test_movhl_2: 1988; AVX: # %bb.0: 1989; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1990; AVX-NEXT: retq 1991 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6> 1992 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2> 1993 ret <4 x i32> %2 1994} 1995 1996define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) { 1997; SSE-LABEL: combine_test_movhl_3: 1998; SSE: # %bb.0: 1999; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2000; SSE-NEXT: movaps %xmm1, %xmm0 2001; SSE-NEXT: retq 2002; 2003; AVX-LABEL: combine_test_movhl_3: 2004; AVX: # %bb.0: 2005; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2006; AVX-NEXT: retq 2007 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2> 2008 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2> 2009 ret <4 x i32> %2 2010} 2011 2012define <16 x i8> @combine_and_or_shuffle(<16 x i8> %x, <16 x i8> %y) { 2013; SSE2-LABEL: combine_and_or_shuffle: 2014; SSE2: # %bb.0: 2015; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 2016; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2017; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] 2018; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,2,4,5,6,7] 2019; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,5,7,7] 2020; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 2021; SSE2-NEXT: pxor %xmm3, %xmm3 2022; 
SSE2-NEXT: movdqa %xmm1, %xmm0 2023; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] 2024; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] 2025; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,1,3] 2026; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,0,0,65535,65535] 2027; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] 2028; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] 2029; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,2,1,4,5,6,7] 2030; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] 2031; SSE2-NEXT: pand %xmm0, %xmm1 2032; SSE2-NEXT: pandn %xmm4, %xmm0 2033; SSE2-NEXT: por %xmm1, %xmm0 2034; SSE2-NEXT: packuswb %xmm0, %xmm0 2035; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 2036; SSE2-NEXT: por %xmm2, %xmm0 2037; SSE2-NEXT: retq 2038; 2039; SSSE3-LABEL: combine_and_or_shuffle: 2040; SSSE3: # %bb.0: 2041; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[u],zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero 2042; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[7,u,0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero 2043; SSSE3-NEXT: por %xmm1, %xmm0 2044; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2045; SSSE3-NEXT: retq 2046; 2047; SSE41-LABEL: combine_and_or_shuffle: 2048; SSE41: # %bb.0: 2049; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[u],zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero 2050; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[7,u,0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero 2051; SSE41-NEXT: por %xmm1, %xmm0 2052; 
SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2053; SSE41-NEXT: retq 2054; 2055; AVX-LABEL: combine_and_or_shuffle: 2056; AVX: # %bb.0: 2057; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u],zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero 2058; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[7,u,0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero 2059; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2060; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2061; AVX-NEXT: retq 2062 %1 = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 15, i32 16, i32 1, i32 16, i32 14, i32 16, i32 2, i32 16, i32 13, i32 16, i32 3, i32 16, i32 16> 2063 %2 = shufflevector <16 x i8> %y, <16 x i8> zeroinitializer, <16 x i32> <i32 7, i32 16, i32 0, i32 16, i32 8, i32 16, i32 1, i32 16, i32 9, i32 16, i32 10, i32 16, i32 7, i32 16, i32 7, i32 16> 2064 %3 = or <16 x i8> %1, %2 2065 %4 = and <16 x i8> %3, <i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> 2066 ret <16 x i8> %4 2067} 2068 2069; Verify that we fold shuffles according to rule: 2070; (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2) 2071 2072define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) { 2073; SSE2-LABEL: combine_undef_input_test1: 2074; SSE2: # %bb.0: 2075; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2076; SSE2-NEXT: retq 2077; 2078; SSSE3-LABEL: combine_undef_input_test1: 2079; SSSE3: # %bb.0: 2080; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2081; SSSE3-NEXT: retq 2082; 2083; SSE41-LABEL: combine_undef_input_test1: 2084; SSE41: # %bb.0: 2085; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2086; SSE41-NEXT: retq 2087; 2088; AVX-LABEL: combine_undef_input_test1: 2089; AVX: # %bb.0: 2090; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2091; AVX-NEXT: retq 2092 %1 = shufflevector 
<4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2093 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2> 2094 ret <4 x float> %2 2095} 2096 2097define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) { 2098; SSE-LABEL: combine_undef_input_test2: 2099; SSE: # %bb.0: 2100; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2101; SSE-NEXT: retq 2102; 2103; AVX-LABEL: combine_undef_input_test2: 2104; AVX: # %bb.0: 2105; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2106; AVX-NEXT: retq 2107 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2108 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 2109 ret <4 x float> %2 2110} 2111 2112define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) { 2113; SSE-LABEL: combine_undef_input_test3: 2114; SSE: # %bb.0: 2115; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2116; SSE-NEXT: retq 2117; 2118; AVX-LABEL: combine_undef_input_test3: 2119; AVX: # %bb.0: 2120; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2121; AVX-NEXT: retq 2122 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2123 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 2124 ret <4 x float> %2 2125} 2126 2127define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) { 2128; SSE-LABEL: combine_undef_input_test4: 2129; SSE: # %bb.0: 2130; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2131; SSE-NEXT: retq 2132; 2133; AVX-LABEL: combine_undef_input_test4: 2134; AVX: # %bb.0: 2135; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2136; AVX-NEXT: retq 2137 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2138 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 2139 ret <4 x float> %2 
2140} 2141 2142define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) { 2143; SSE2-LABEL: combine_undef_input_test5: 2144; SSE2: # %bb.0: 2145; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2146; SSE2-NEXT: retq 2147; 2148; SSSE3-LABEL: combine_undef_input_test5: 2149; SSSE3: # %bb.0: 2150; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2151; SSSE3-NEXT: retq 2152; 2153; SSE41-LABEL: combine_undef_input_test5: 2154; SSE41: # %bb.0: 2155; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2156; SSE41-NEXT: retq 2157; 2158; AVX-LABEL: combine_undef_input_test5: 2159; AVX: # %bb.0: 2160; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2161; AVX-NEXT: retq 2162 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2163 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7> 2164 ret <4 x float> %2 2165} 2166 2167 2168; Verify that we fold shuffles according to rule: 2169; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) 2170 2171define <4 x float> @combine_undef_input_test6(<4 x float> %a) { 2172; CHECK-LABEL: combine_undef_input_test6: 2173; CHECK: # %bb.0: 2174; CHECK-NEXT: retq 2175 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2176 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2> 2177 ret <4 x float> %2 2178} 2179 2180define <4 x float> @combine_undef_input_test7(<4 x float> %a) { 2181; SSE2-LABEL: combine_undef_input_test7: 2182; SSE2: # %bb.0: 2183; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2184; SSE2-NEXT: retq 2185; 2186; SSSE3-LABEL: combine_undef_input_test7: 2187; SSSE3: # %bb.0: 2188; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2189; SSSE3-NEXT: retq 2190; 2191; SSE41-LABEL: combine_undef_input_test7: 2192; SSE41: # %bb.0: 2193; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2194; SSE41-NEXT: retq 2195; 2196; AVX-LABEL: 
combine_undef_input_test7: 2197; AVX: # %bb.0: 2198; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2199; AVX-NEXT: retq 2200 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2201 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 2202 ret <4 x float> %2 2203} 2204 2205define <4 x float> @combine_undef_input_test8(<4 x float> %a) { 2206; SSE2-LABEL: combine_undef_input_test8: 2207; SSE2: # %bb.0: 2208; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2209; SSE2-NEXT: retq 2210; 2211; SSSE3-LABEL: combine_undef_input_test8: 2212; SSSE3: # %bb.0: 2213; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2214; SSSE3-NEXT: retq 2215; 2216; SSE41-LABEL: combine_undef_input_test8: 2217; SSE41: # %bb.0: 2218; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2219; SSE41-NEXT: retq 2220; 2221; AVX-LABEL: combine_undef_input_test8: 2222; AVX: # %bb.0: 2223; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2224; AVX-NEXT: retq 2225 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2226 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 2227 ret <4 x float> %2 2228} 2229 2230define <4 x float> @combine_undef_input_test9(<4 x float> %a) { 2231; SSE-LABEL: combine_undef_input_test9: 2232; SSE: # %bb.0: 2233; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 2234; SSE-NEXT: retq 2235; 2236; AVX-LABEL: combine_undef_input_test9: 2237; AVX: # %bb.0: 2238; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,1] 2239; AVX-NEXT: retq 2240 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2241 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 2242 ret <4 x float> %2 2243} 2244 2245define <4 x float> @combine_undef_input_test10(<4 x float> %a) { 2246; CHECK-LABEL: combine_undef_input_test10: 2247; CHECK: # %bb.0: 2248; CHECK-NEXT: retq 2249 %1 = shufflevector <4 x float> %a, <4 x 
float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2250 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7> 2251 ret <4 x float> %2 2252} 2253 2254define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) { 2255; SSE2-LABEL: combine_undef_input_test11: 2256; SSE2: # %bb.0: 2257; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2258; SSE2-NEXT: retq 2259; 2260; SSSE3-LABEL: combine_undef_input_test11: 2261; SSSE3: # %bb.0: 2262; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2263; SSSE3-NEXT: retq 2264; 2265; SSE41-LABEL: combine_undef_input_test11: 2266; SSE41: # %bb.0: 2267; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2268; SSE41-NEXT: retq 2269; 2270; AVX-LABEL: combine_undef_input_test11: 2271; AVX: # %bb.0: 2272; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2273; AVX-NEXT: retq 2274 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2275 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6> 2276 ret <4 x float> %2 2277} 2278 2279define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) { 2280; SSE-LABEL: combine_undef_input_test12: 2281; SSE: # %bb.0: 2282; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2283; SSE-NEXT: retq 2284; 2285; AVX-LABEL: combine_undef_input_test12: 2286; AVX: # %bb.0: 2287; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2288; AVX-NEXT: retq 2289 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2290 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> 2291 ret <4 x float> %2 2292} 2293 2294define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) { 2295; SSE-LABEL: combine_undef_input_test13: 2296; SSE: # %bb.0: 2297; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2298; SSE-NEXT: retq 2299; 2300; AVX-LABEL: combine_undef_input_test13: 2301; AVX: 
# %bb.0: 2302; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2303; AVX-NEXT: retq 2304 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2305 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5> 2306 ret <4 x float> %2 2307} 2308 2309define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) { 2310; SSE-LABEL: combine_undef_input_test14: 2311; SSE: # %bb.0: 2312; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2313; SSE-NEXT: retq 2314; 2315; AVX-LABEL: combine_undef_input_test14: 2316; AVX: # %bb.0: 2317; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2318; AVX-NEXT: retq 2319 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2320 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 2321 ret <4 x float> %2 2322} 2323 2324define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) { 2325; SSE2-LABEL: combine_undef_input_test15: 2326; SSE2: # %bb.0: 2327; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2328; SSE2-NEXT: retq 2329; 2330; SSSE3-LABEL: combine_undef_input_test15: 2331; SSSE3: # %bb.0: 2332; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2333; SSSE3-NEXT: retq 2334; 2335; SSE41-LABEL: combine_undef_input_test15: 2336; SSE41: # %bb.0: 2337; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2338; SSE41-NEXT: retq 2339; 2340; AVX-LABEL: combine_undef_input_test15: 2341; AVX: # %bb.0: 2342; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2343; AVX-NEXT: retq 2344 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2345 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2346 ret <4 x float> %2 2347} 2348 2349 2350; Verify that shuffles are canonicalized according to rules: 2351; shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) 2352; 
2353; This allows to trigger the following combine rule: 2354; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) 2355; 2356; As a result, all the shuffle pairs in each function below should be 2357; combined into a single legal shuffle operation. 2358 2359define <4 x float> @combine_undef_input_test16(<4 x float> %a) { 2360; CHECK-LABEL: combine_undef_input_test16: 2361; CHECK: # %bb.0: 2362; CHECK-NEXT: retq 2363 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2364 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3> 2365 ret <4 x float> %2 2366} 2367 2368define <4 x float> @combine_undef_input_test17(<4 x float> %a) { 2369; SSE2-LABEL: combine_undef_input_test17: 2370; SSE2: # %bb.0: 2371; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2372; SSE2-NEXT: retq 2373; 2374; SSSE3-LABEL: combine_undef_input_test17: 2375; SSSE3: # %bb.0: 2376; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2377; SSSE3-NEXT: retq 2378; 2379; SSE41-LABEL: combine_undef_input_test17: 2380; SSE41: # %bb.0: 2381; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2382; SSE41-NEXT: retq 2383; 2384; AVX-LABEL: combine_undef_input_test17: 2385; AVX: # %bb.0: 2386; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2387; AVX-NEXT: retq 2388 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2389 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> 2390 ret <4 x float> %2 2391} 2392 2393define <4 x float> @combine_undef_input_test18(<4 x float> %a) { 2394; SSE2-LABEL: combine_undef_input_test18: 2395; SSE2: # %bb.0: 2396; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2397; SSE2-NEXT: retq 2398; 2399; SSSE3-LABEL: combine_undef_input_test18: 2400; SSSE3: # %bb.0: 2401; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2402; SSSE3-NEXT: retq 2403; 2404; SSE41-LABEL: combine_undef_input_test18: 2405; SSE41: # %bb.0: 2406; SSE41-NEXT: movddup 
{{.*#+}} xmm0 = xmm0[0,0] 2407; SSE41-NEXT: retq 2408; 2409; AVX-LABEL: combine_undef_input_test18: 2410; AVX: # %bb.0: 2411; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2412; AVX-NEXT: retq 2413 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2414 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5> 2415 ret <4 x float> %2 2416} 2417 2418define <4 x float> @combine_undef_input_test19(<4 x float> %a) { 2419; SSE-LABEL: combine_undef_input_test19: 2420; SSE: # %bb.0: 2421; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 2422; SSE-NEXT: retq 2423; 2424; AVX-LABEL: combine_undef_input_test19: 2425; AVX: # %bb.0: 2426; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,1] 2427; AVX-NEXT: retq 2428 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2429 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 2430 ret <4 x float> %2 2431} 2432 2433define <4 x float> @combine_undef_input_test20(<4 x float> %a) { 2434; CHECK-LABEL: combine_undef_input_test20: 2435; CHECK: # %bb.0: 2436; CHECK-NEXT: retq 2437 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2438 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2439 ret <4 x float> %2 2440} 2441 2442; These tests are designed to test the ability to combine away unnecessary 2443; operations feeding into a shuffle. The AVX cases are the important ones as 2444; they leverage operations which cannot be done naturally on the entire vector 2445; and thus are decomposed into multiple smaller operations. 
2446 2447define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) { 2448; SSE-LABEL: combine_unneeded_subvector1: 2449; SSE: # %bb.0: 2450; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0] 2451; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2452; SSE-NEXT: movdqa %xmm0, %xmm1 2453; SSE-NEXT: retq 2454; 2455; AVX1-LABEL: combine_unneeded_subvector1: 2456; AVX1: # %bb.0: 2457; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2458; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2459; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2460; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2461; AVX1-NEXT: retq 2462; 2463; AVX2-SLOW-LABEL: combine_unneeded_subvector1: 2464; AVX2-SLOW: # %bb.0: 2465; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2466; AVX2-SLOW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2467; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 2468; AVX2-SLOW-NEXT: retq 2469; 2470; AVX2-FAST-ALL-LABEL: combine_unneeded_subvector1: 2471; AVX2-FAST-ALL: # %bb.0: 2472; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2473; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] 2474; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] 2475; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 2476; AVX2-FAST-ALL-NEXT: retq 2477; 2478; AVX2-FAST-PERLANE-LABEL: combine_unneeded_subvector1: 2479; AVX2-FAST-PERLANE: # %bb.0: 2480; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2481; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2482; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 2483; AVX2-FAST-PERLANE-NEXT: retq 2484 %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 2485 %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4> 2486 ret <8 x i32> %c 2487} 2488 2489define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> 
%b) { 2490; SSE-LABEL: combine_unneeded_subvector2: 2491; SSE: # %bb.0: 2492; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0] 2493; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0] 2494; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2495; SSE-NEXT: retq 2496; 2497; AVX1-LABEL: combine_unneeded_subvector2: 2498; AVX1: # %bb.0: 2499; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2500; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2501; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2502; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 2503; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2504; AVX1-NEXT: retq 2505; 2506; AVX2-LABEL: combine_unneeded_subvector2: 2507; AVX2: # %bb.0: 2508; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2509; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 2510; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2511; AVX2-NEXT: retq 2512 %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 2513 %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12> 2514 ret <8 x i32> %d 2515} 2516 2517define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) { 2518; SSE2-LABEL: combine_insertps1: 2519; SSE2: # %bb.0: 2520; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0] 2521; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] 2522; SSE2-NEXT: movaps %xmm1, %xmm0 2523; SSE2-NEXT: retq 2524; 2525; SSSE3-LABEL: combine_insertps1: 2526; SSSE3: # %bb.0: 2527; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0] 2528; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] 2529; SSSE3-NEXT: movaps %xmm1, %xmm0 2530; SSSE3-NEXT: retq 2531; 2532; SSE41-LABEL: combine_insertps1: 2533; SSE41: # %bb.0: 2534; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3] 2535; SSE41-NEXT: retq 2536; 2537; AVX-LABEL: combine_insertps1: 2538; AVX: # %bb.0: 2539; AVX-NEXT: vinsertps {{.*#+}} xmm0 
= xmm1[2],xmm0[1,2,3] 2540; AVX-NEXT: retq 2541 2542 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4> 2543 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3> 2544 ret <4 x float> %d 2545} 2546 2547define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) { 2548; SSE2-LABEL: combine_insertps2: 2549; SSE2: # %bb.0: 2550; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0] 2551; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 2552; SSE2-NEXT: movaps %xmm1, %xmm0 2553; SSE2-NEXT: retq 2554; 2555; SSSE3-LABEL: combine_insertps2: 2556; SSSE3: # %bb.0: 2557; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0] 2558; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 2559; SSSE3-NEXT: movaps %xmm1, %xmm0 2560; SSSE3-NEXT: retq 2561; 2562; SSE41-LABEL: combine_insertps2: 2563; SSE41: # %bb.0: 2564; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3] 2565; SSE41-NEXT: retq 2566; 2567; AVX-LABEL: combine_insertps2: 2568; AVX: # %bb.0: 2569; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3] 2570; AVX-NEXT: retq 2571 2572 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7> 2573 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2574 ret <4 x float> %d 2575} 2576 2577define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) { 2578; SSE2-LABEL: combine_insertps3: 2579; SSE2: # %bb.0: 2580; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 2581; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 2582; SSE2-NEXT: retq 2583; 2584; SSSE3-LABEL: combine_insertps3: 2585; SSSE3: # %bb.0: 2586; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 2587; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 2588; SSSE3-NEXT: retq 2589; 2590; SSE41-LABEL: combine_insertps3: 2591; SSE41: # %bb.0: 2592; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 2593; 
SSE41-NEXT: retq 2594; 2595; AVX-LABEL: combine_insertps3: 2596; AVX: # %bb.0: 2597; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 2598; AVX-NEXT: retq 2599 2600 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5> 2601 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3> 2602 ret <4 x float> %d 2603} 2604 2605define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) { 2606; SSE2-LABEL: combine_insertps4: 2607; SSE2: # %bb.0: 2608; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 2609; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 2610; SSE2-NEXT: retq 2611; 2612; SSSE3-LABEL: combine_insertps4: 2613; SSSE3: # %bb.0: 2614; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 2615; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 2616; SSSE3-NEXT: retq 2617; 2618; SSE41-LABEL: combine_insertps4: 2619; SSE41: # %bb.0: 2620; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 2621; SSE41-NEXT: retq 2622; 2623; AVX-LABEL: combine_insertps4: 2624; AVX: # %bb.0: 2625; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 2626; AVX-NEXT: retq 2627 2628 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5> 2629 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5> 2630 ret <4 x float> %d 2631} 2632 2633define void @combine_scalar_load_with_blend_with_zero(ptr %a0, ptr %a1) { 2634; SSE-LABEL: combine_scalar_load_with_blend_with_zero: 2635; SSE: # %bb.0: 2636; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 2637; SSE-NEXT: movaps %xmm0, (%rsi) 2638; SSE-NEXT: retq 2639; 2640; AVX-LABEL: combine_scalar_load_with_blend_with_zero: 2641; AVX: # %bb.0: 2642; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 2643; AVX-NEXT: vmovaps %xmm0, (%rsi) 2644; AVX-NEXT: retq 2645 %1 = load double, ptr %a0, align 8 2646 %2 = insertelement <2 x double> undef, double %1, i32 0 2647 %3 = insertelement <2 x 
double> %2, double 0.000000e+00, i32 1 2648 %4 = bitcast <2 x double> %3 to <4 x float> 2649 %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3> 2650 store <4 x float> %5, ptr %a1, align 16 2651 ret void 2652} 2653 2654; PR30371 2655define <4 x float> @combine_constant_insertion_v4f32(float %f) { 2656; SSE2-LABEL: combine_constant_insertion_v4f32: 2657; SSE2: # %bb.0: 2658; SSE2-NEXT: movaps {{.*#+}} xmm1 = [u,4.0E+0,5.0E+0,3.0E+0] 2659; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2660; SSE2-NEXT: movaps %xmm1, %xmm0 2661; SSE2-NEXT: retq 2662; 2663; SSSE3-LABEL: combine_constant_insertion_v4f32: 2664; SSSE3: # %bb.0: 2665; SSSE3-NEXT: movaps {{.*#+}} xmm1 = [u,4.0E+0,5.0E+0,3.0E+0] 2666; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2667; SSSE3-NEXT: movaps %xmm1, %xmm0 2668; SSSE3-NEXT: retq 2669; 2670; SSE41-LABEL: combine_constant_insertion_v4f32: 2671; SSE41: # %bb.0: 2672; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] 2673; SSE41-NEXT: retq 2674; 2675; AVX-LABEL: combine_constant_insertion_v4f32: 2676; AVX: # %bb.0: 2677; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] 2678; AVX-NEXT: retq 2679 %a0 = insertelement <4 x float> undef, float %f, i32 0 2680 %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 3.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 2681 ret <4 x float> %ret 2682} 2683 2684define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) { 2685; SSE2-LABEL: combine_constant_insertion_v4i32: 2686; SSE2: # %bb.0: 2687; SSE2-NEXT: movd %edi, %xmm1 2688; SSE2-NEXT: movaps {{.*#+}} xmm0 = [u,4,5,30] 2689; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 2690; SSE2-NEXT: retq 2691; 2692; SSSE3-LABEL: combine_constant_insertion_v4i32: 2693; SSSE3: # %bb.0: 2694; SSSE3-NEXT: movd %edi, %xmm1 2695; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [u,4,5,30] 2696; SSSE3-NEXT: movss {{.*#+}} xmm0 = 
xmm1[0],xmm0[1,2,3] 2697; SSSE3-NEXT: retq 2698; 2699; SSE41-LABEL: combine_constant_insertion_v4i32: 2700; SSE41: # %bb.0: 2701; SSE41-NEXT: pmovsxbd {{.*#+}} xmm0 = [0,4,5,30] 2702; SSE41-NEXT: pinsrd $0, %edi, %xmm0 2703; SSE41-NEXT: retq 2704; 2705; AVX-LABEL: combine_constant_insertion_v4i32: 2706; AVX: # %bb.0: 2707; AVX-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,4,5,30] 2708; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0 2709; AVX-NEXT: retq 2710 %a0 = insertelement <4 x i32> undef, i32 %f, i32 0 2711 %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 2712 ret <4 x i32> %ret 2713} 2714 2715define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) { 2716; SSE2-LABEL: PR22377: 2717; SSE2: # %bb.0: # %entry 2718; SSE2-NEXT: movaps %xmm0, %xmm1 2719; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[2,3] 2720; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2] 2721; SSE2-NEXT: addps %xmm0, %xmm1 2722; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2723; SSE2-NEXT: retq 2724; 2725; SSSE3-LABEL: PR22377: 2726; SSSE3: # %bb.0: # %entry 2727; SSSE3-NEXT: movaps %xmm0, %xmm1 2728; SSSE3-NEXT: haddps %xmm0, %xmm1 2729; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] 2730; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 2731; SSSE3-NEXT: retq 2732; 2733; SSE41-LABEL: PR22377: 2734; SSE41: # %bb.0: # %entry 2735; SSE41-NEXT: movaps %xmm0, %xmm1 2736; SSE41-NEXT: haddps %xmm0, %xmm1 2737; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] 2738; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 2739; SSE41-NEXT: retq 2740; 2741; AVX-LABEL: PR22377: 2742; AVX: # %bb.0: # %entry 2743; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm1 2744; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] 2745; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 2746; AVX-NEXT: retq 2747entry: 2748 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3> 2749 %s2 = shufflevector <4 
x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 2750 %r2 = fadd <4 x float> %s1, %s2 2751 %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5> 2752 ret <4 x float> %s3 2753} 2754 2755define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) { 2756; SSE2-LABEL: PR22390: 2757; SSE2: # %bb.0: # %entry 2758; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2759; SSE2-NEXT: movaps %xmm0, %xmm2 2760; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 2761; SSE2-NEXT: addps %xmm2, %xmm0 2762; SSE2-NEXT: retq 2763; 2764; SSSE3-LABEL: PR22390: 2765; SSSE3: # %bb.0: # %entry 2766; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2767; SSSE3-NEXT: movaps %xmm0, %xmm2 2768; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 2769; SSSE3-NEXT: addps %xmm2, %xmm0 2770; SSSE3-NEXT: retq 2771; 2772; SSE41-LABEL: PR22390: 2773; SSE41: # %bb.0: # %entry 2774; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2775; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] 2776; SSE41-NEXT: addps %xmm1, %xmm0 2777; SSE41-NEXT: retq 2778; 2779; AVX-LABEL: PR22390: 2780; AVX: # %bb.0: # %entry 2781; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2782; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] 2783; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 2784; AVX-NEXT: retq 2785entry: 2786 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2> 2787 %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 2788 %r2 = fadd <4 x float> %s1, %s2 2789 ret <4 x float> %r2 2790} 2791 2792define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) { 2793; SSE-LABEL: PR22412: 2794; SSE: # %bb.0: # %entry 2795; SSE-NEXT: movaps %xmm3, %xmm1 2796; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2] 2797; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[3,2] 2798; SSE-NEXT: retq 2799; 2800; AVX1-LABEL: PR22412: 2801; AVX1: # %bb.0: # %entry 2802; AVX1-NEXT: vperm2f128 
{{.*#+}} ymm2 = ymm1[2,3,0,1] 2803; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2804; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm2[3,2],ymm0[5,4],ymm2[7,6] 2805; AVX1-NEXT: retq 2806; 2807; AVX2-LABEL: PR22412: 2808; AVX2: # %bb.0: # %entry 2809; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2810; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1] 2811; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6] 2812; AVX2-NEXT: retq 2813entry: 2814 %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2815 %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2> 2816 ret <8 x float> %s2 2817} 2818 2819define <4 x float> @PR30264(<4 x float> %x) { 2820; SSE2-LABEL: PR30264: 2821; SSE2: # %bb.0: 2822; SSE2-NEXT: xorps %xmm1, %xmm1 2823; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2824; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],mem[1] 2825; SSE2-NEXT: movapd %xmm1, %xmm0 2826; SSE2-NEXT: retq 2827; 2828; SSSE3-LABEL: PR30264: 2829; SSSE3: # %bb.0: 2830; SSSE3-NEXT: xorps %xmm1, %xmm1 2831; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2832; SSSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],mem[1] 2833; SSSE3-NEXT: movapd %xmm1, %xmm0 2834; SSSE3-NEXT: retq 2835; 2836; SSE41-LABEL: PR30264: 2837; SSE41: # %bb.0: 2838; SSE41-NEXT: movaps {{.*#+}} xmm1 = [u,u,4.0E+0,1.0E+0] 2839; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3] 2840; SSE41-NEXT: movaps %xmm1, %xmm0 2841; SSE41-NEXT: retq 2842; 2843; AVX-LABEL: PR30264: 2844; AVX: # %bb.0: 2845; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.0E+0,1.0E+0,4.0E+0,1.0E+0] 2846; AVX-NEXT: # xmm1 = mem[0,0] 2847; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3] 2848; AVX-NEXT: retq 2849 %shuf1 = shufflevector <4 x float> %x, <4 x float> <float undef, float 0.0, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 
undef, i32 undef> 2850 %shuf2 = shufflevector <4 x float> %shuf1, <4 x float> <float undef, float undef, float 4.0, float 1.0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 2851 ret <4 x float> %shuf2 2852} 2853 2854define <8 x i16> @PR39549(<16 x i8> %x) { 2855; SSE-LABEL: PR39549: 2856; SSE: # %bb.0: 2857; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2858; SSE-NEXT: psraw $8, %xmm0 2859; SSE-NEXT: retq 2860; 2861; AVX-LABEL: PR39549: 2862; AVX: # %bb.0: 2863; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2864; AVX-NEXT: vpsraw $8, %xmm0, %xmm0 2865; AVX-NEXT: retq 2866 %a = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15, i32 undef> 2867 %b = bitcast <16 x i8> %a to <8 x i16> 2868 %c = shl <8 x i16> %b, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 2869 %d = ashr <8 x i16> %c, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 2870 ret <8 x i16> %d 2871} 2872 2873define <4 x i32> @PR41545(<4 x i32> %a0, <16 x i8> %a1) { 2874; SSE-LABEL: PR41545: 2875; SSE: # %bb.0: 2876; SSE-NEXT: paddd %xmm1, %xmm0 2877; SSE-NEXT: retq 2878; 2879; AVX-LABEL: PR41545: 2880; AVX: # %bb.0: 2881; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2882; AVX-NEXT: retq 2883 %1 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 2884 %2 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13> 2885 %3 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14> 2886 %4 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15> 2887 %5 = zext <4 x i8> %1 to <4 x i32> 2888 %6 = zext <4 x i8> %2 to <4 x i32> 2889 %7 = zext <4 x i8> %3 to <4 x i32> 2890 %8 = zext <4 x i8> %4 to <4 x i32> 2891 %9 = shl <4 x i32> %6, <i32 8, i32 8, i32 8, i32 8> 
2892 %10 = shl <4 x i32> %7, <i32 16, i32 16, i32 16, i32 16> 2893 %11 = shl <4 x i32> %8, <i32 24, i32 24, i32 24, i32 24> 2894 %12 = or <4 x i32> %5, %9 2895 %13 = or <4 x i32> %12, %10 2896 %14 = or <4 x i32> %13, %11 2897 %15 = add <4 x i32> %a0, %14 2898 ret <4 x i32> %15 2899} 2900 2901define <8 x i16> @shuffle_extract_insert(<8 x i16> %a) { 2902; SSE-LABEL: shuffle_extract_insert: 2903; SSE: # %bb.0: 2904; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] 2905; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] 2906; SSE-NEXT: retq 2907; 2908; AVX1-LABEL: shuffle_extract_insert: 2909; AVX1: # %bb.0: 2910; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] 2911; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] 2912; AVX1-NEXT: retq 2913; 2914; AVX2-SLOW-LABEL: shuffle_extract_insert: 2915; AVX2-SLOW: # %bb.0: 2916; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] 2917; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] 2918; AVX2-SLOW-NEXT: retq 2919; 2920; AVX2-FAST-LABEL: shuffle_extract_insert: 2921; AVX2-FAST: # %bb.0: 2922; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15] 2923; AVX2-FAST-NEXT: retq 2924 %a0 = extractelement <8 x i16> %a, i32 0 2925 %a1 = extractelement <8 x i16> %a, i32 1 2926 %a3 = extractelement <8 x i16> %a, i32 3 2927 %a4 = extractelement <8 x i16> %a, i32 4 2928 %a5 = extractelement <8 x i16> %a, i32 5 2929 %a6 = extractelement <8 x i16> %a, i32 6 2930 %a7 = extractelement <8 x i16> %a, i32 7 2931 %1 = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2932 %2 = insertelement <8 x i16> %1, i16 %a1, i32 1 2933 %3 = insertelement <8 x i16> %2, i16 %a0, i32 2 2934 %4 = insertelement <8 x i16> %3, i16 %a3, i32 3 2935 %5 = insertelement <8 x i16> %4, i16 %a6, i32 4 2936 %6 = insertelement <8 x i16> %5, i16 %a5, i32 5 2937 %7 = insertelement <8 x i16> %6, i16 %a4, i32 
6 2938 %8 = insertelement <8 x i16> %7, i16 %a7, i32 7 2939 ret <8 x i16> %8 2940} 2941 2942define <8 x i16> @shuffle_extract_insert_double(<8 x i16> %a, <8 x i16> %b) { 2943; SSE2-LABEL: shuffle_extract_insert_double: 2944; SSE2: # %bb.0: 2945; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 2946; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] 2947; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2948; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] 2949; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2950; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 2951; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2952; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 2953; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2954; SSE2-NEXT: retq 2955; 2956; SSSE3-LABEL: shuffle_extract_insert_double: 2957; SSSE3: # %bb.0: 2958; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2959; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2960; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2961; SSSE3-NEXT: retq 2962; 2963; SSE41-LABEL: shuffle_extract_insert_double: 2964; SSE41: # %bb.0: 2965; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2966; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2967; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2968; SSE41-NEXT: retq 2969; 2970; AVX-LABEL: shuffle_extract_insert_double: 2971; AVX: # %bb.0: 2972; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2973; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2974; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2975; AVX-NEXT: retq 2976 %a0 = extractelement <8 x 
i16> %a, i32 0 2977 %a4 = extractelement <8 x i16> %a, i32 4 2978 %a6 = extractelement <8 x i16> %a, i32 6 2979 %b11 = extractelement <8 x i16> %b, i32 3 2980 %b13 = extractelement <8 x i16> %b, i32 5 2981 %b15 = extractelement <8 x i16> %b, i32 7 2982 %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2983 %2 = insertelement <8 x i16> %1, i16 %a0, i32 2 2984 %3 = insertelement <8 x i16> %2, i16 %b11, i32 3 2985 %4 = insertelement <8 x i16> %3, i16 %a6, i32 4 2986 %5 = insertelement <8 x i16> %4, i16 %b13, i32 5 2987 %6 = insertelement <8 x i16> %5, i16 %a4, i32 6 2988 %7 = insertelement <8 x i16> %6, i16 %b15, i32 7 2989 ret <8 x i16> %7 2990} 2991 2992define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) { 2993; SSE2-LABEL: shuffle_extract_concat_insert: 2994; SSE2: # %bb.0: 2995; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2996; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2997; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 2998; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2999; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 3000; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] 3001; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] 3002; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3003; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] 3004; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3005; SSE2-NEXT: retq 3006; 3007; SSSE3-LABEL: shuffle_extract_concat_insert: 3008; SSSE3: # %bb.0: 3009; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3010; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 3011; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 3012; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 3013; SSSE3-NEXT: retq 3014; 3015; SSE41-LABEL: shuffle_extract_concat_insert: 3016; SSE41: # %bb.0: 3017; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3018; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 3019; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 3020; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 3021; SSE41-NEXT: retq 3022; 3023; AVX-LABEL: shuffle_extract_concat_insert: 3024; AVX: # %bb.0: 3025; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3026; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 3027; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 3028; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3029; AVX-NEXT: retq 3030 %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3031 %a0 = extractelement <8 x i16> %a, i32 0 3032 %a4 = extractelement <8 x i16> %a, i32 4 3033 %a6 = extractelement <8 x i16> %a, i32 6 3034 %b11 = extractelement <8 x i16> %b, i32 3 3035 %b13 = extractelement <8 x i16> %b, i32 5 3036 %b15 = extractelement <8 x i16> %b, i32 7 3037 %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3038 %2 = insertelement <8 x i16> %1, i16 %a0, i32 2 3039 %3 = insertelement <8 x i16> %2, i16 %b11, i32 3 3040 %4 = insertelement <8 x i16> %3, i16 %a6, i32 4 3041 %5 = insertelement <8 x i16> %4, i16 %b13, i32 5 3042 %6 = insertelement <8 x i16> %5, i16 %a4, i32 6 3043 %7 = insertelement <8 x i16> %6, i16 %b15, i32 7 3044 ret <8 x i16> %7 3045} 3046 3047define <8 x i16> @shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) { 3048; SSE2-LABEL: shuffle_scalar_to_vector_extract: 3049; SSE2: # 
%bb.0: 3050; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 3051; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 3052; SSE2-NEXT: psraw $8, %xmm1 3053; SSE2-NEXT: pextrw $7, %xmm1, %eax 3054; SSE2-NEXT: movd %eax, %xmm2 3055; SSE2-NEXT: movsbl (%rsi), %eax 3056; SSE2-NEXT: movd %eax, %xmm0 3057; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 3058; SSE2-NEXT: movsbl (%rdx), %eax 3059; SSE2-NEXT: movd %eax, %xmm0 3060; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 3061; SSE2-NEXT: pxor %xmm0, %xmm0 3062; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 3063; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 3064; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 3065; SSE2-NEXT: retq 3066; 3067; SSSE3-LABEL: shuffle_scalar_to_vector_extract: 3068; SSSE3: # %bb.0: 3069; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 3070; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 3071; SSSE3-NEXT: psraw $8, %xmm1 3072; SSSE3-NEXT: movsbl (%rsi), %eax 3073; SSSE3-NEXT: movd %eax, %xmm2 3074; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 3075; SSSE3-NEXT: movsbl (%rdx), %eax 3076; SSSE3-NEXT: movd %eax, %xmm0 3077; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 3078; SSSE3-NEXT: pxor %xmm0, %xmm0 3079; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 3080; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 3081; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 3082; SSSE3-NEXT: retq 3083; 3084; SSE41-LABEL: shuffle_scalar_to_vector_extract: 3085; SSE41: # %bb.0: 3086; 
SSE41-NEXT: pmovsxbw (%rdi), %xmm0 3087; SSE41-NEXT: pextrw $4, %xmm0, %eax 3088; SSE41-NEXT: pextrw $7, %xmm0, %ecx 3089; SSE41-NEXT: pxor %xmm0, %xmm0 3090; SSE41-NEXT: pinsrw $1, %eax, %xmm0 3091; SSE41-NEXT: movl $65531, %eax # imm = 0xFFFB 3092; SSE41-NEXT: pinsrw $2, %eax, %xmm0 3093; SSE41-NEXT: pinsrw $4, %ecx, %xmm0 3094; SSE41-NEXT: movsbl (%rsi), %eax 3095; SSE41-NEXT: pinsrw $5, %eax, %xmm0 3096; SSE41-NEXT: movsbl (%rdx), %eax 3097; SSE41-NEXT: pinsrw $6, %eax, %xmm0 3098; SSE41-NEXT: retq 3099; 3100; AVX-LABEL: shuffle_scalar_to_vector_extract: 3101; AVX: # %bb.0: 3102; AVX-NEXT: vpmovsxbw (%rdi), %xmm0 3103; AVX-NEXT: vpextrw $4, %xmm0, %eax 3104; AVX-NEXT: vpextrw $7, %xmm0, %ecx 3105; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 3106; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 3107; AVX-NEXT: movl $65531, %eax # imm = 0xFFFB 3108; AVX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 3109; AVX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 3110; AVX-NEXT: movsbl (%rsi), %eax 3111; AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 3112; AVX-NEXT: movsbl (%rdx), %eax 3113; AVX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 3114; AVX-NEXT: retq 3115 %tmp = load <8 x i8>, ptr %p0, align 1 3116 %tmp1 = sext <8 x i8> %tmp to <8 x i16> 3117 %tmp2 = load i8, ptr %p1, align 1 3118 %cvt1 = sext i8 %tmp2 to i16 3119 %tmp3 = load i8, ptr %p2, align 1 3120 %cvt2 = sext i8 %tmp3 to i16 3121 %tmp4 = extractelement <8 x i16> %tmp1, i32 4 3122 %tmp5 = extractelement <8 x i16> %tmp1, i32 7 3123 %tmp6 = insertelement <8 x i16> <i16 undef, i16 undef, i16 -5, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 undef, i32 0 3124 %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp4, i32 1 3125 %tmp8 = insertelement <8 x i16> %tmp7, i16 undef, i32 3 3126 %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp5, i32 4 3127 %tmp10 = insertelement <8 x i16> %tmp9, i16 %cvt1, i32 5 3128 %tmp11 = insertelement <8 x i16> %tmp10, i16 %cvt2, i32 6 3129 %tmp12 = insertelement <8 x i16> %tmp11, i16 undef, i32 7 3130 %tmp13 = 
shufflevector <8 x i16> %tmp12, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7> 3131 ret <8 x i16> %tmp13 3132} 3133 3134; Bug noticed in D96345 3135define i32 @shuffle_binops_with_undef() { 3136; SSE-LABEL: shuffle_binops_with_undef: 3137; SSE: # %bb.0: # %entry 3138; SSE-NEXT: movdqa (%rax), %xmm0 3139; SSE-NEXT: paddw %xmm0, %xmm0 3140; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 3141; SSE-NEXT: psrlw %xmm1, %xmm0 3142; SSE-NEXT: movdqa %xmm0, (%rax) 3143; SSE-NEXT: retq 3144; 3145; AVX-LABEL: shuffle_binops_with_undef: 3146; AVX: # %bb.0: # %entry 3147; AVX-NEXT: vmovdqa (%rax), %xmm0 3148; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 3149; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 3150; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 3151; AVX-NEXT: vmovdqa %xmm0, (%rax) 3152; AVX-NEXT: retq 3153entry: 3154 %load0 = load <8 x i16>, ptr undef, align 16 3155 %load1 = load <8 x i16>, ptr undef, align 16 3156 %shuf0 = shufflevector <16 x i8> undef, <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> 3157 %addi = add <8 x i16> %load0, %load1 3158 %bc0 = bitcast <8 x i16> %addi to <2 x i64> 3159 %bc1 = bitcast <16 x i8> %shuf0 to <8 x i16> 3160 %shuf1 = shufflevector <8 x i16> %load1, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 3161 %addi24 = add <8 x i16> %shuf1, %bc1 3162 %bc2 = bitcast <8 x i16> %addi24 to <2 x i64> 3163 %shuf2 = shufflevector <2 x i64> %bc0, <2 x i64> %bc2, <2 x i32> <i32 0, i32 2> 3164 %bc3 = bitcast <2 x i64> %shuf2 to <8 x i16> 3165 %psrli = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %bc3, i32 ptrtoint (ptr @shuffle_binops_with_undef to i32)) 3166 store <8 x i16> %psrli, ptr undef, align 16 3167 ret i32 undef 
3168} 3169declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) 3170 3171define void @PR43024() { 3172; SSE2-LABEL: PR43024: 3173; SSE2: # %bb.0: 3174; SSE2-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] 3175; SSE2-NEXT: movaps %xmm0, (%rax) 3176; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3177; SSE2-NEXT: xorps %xmm1, %xmm1 3178; SSE2-NEXT: addss %xmm1, %xmm0 3179; SSE2-NEXT: addss %xmm1, %xmm0 3180; SSE2-NEXT: movss %xmm0, (%rax) 3181; SSE2-NEXT: retq 3182; 3183; SSSE3-LABEL: PR43024: 3184; SSSE3: # %bb.0: 3185; SSSE3-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] 3186; SSSE3-NEXT: movaps %xmm0, (%rax) 3187; SSSE3-NEXT: addss %xmm0, %xmm0 3188; SSSE3-NEXT: xorps %xmm1, %xmm1 3189; SSSE3-NEXT: addss %xmm1, %xmm0 3190; SSSE3-NEXT: addss %xmm1, %xmm0 3191; SSSE3-NEXT: movss %xmm0, (%rax) 3192; SSSE3-NEXT: retq 3193; 3194; SSE41-LABEL: PR43024: 3195; SSE41: # %bb.0: 3196; SSE41-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] 3197; SSE41-NEXT: movaps %xmm0, (%rax) 3198; SSE41-NEXT: addss %xmm0, %xmm0 3199; SSE41-NEXT: xorps %xmm1, %xmm1 3200; SSE41-NEXT: addss %xmm1, %xmm0 3201; SSE41-NEXT: addss %xmm1, %xmm0 3202; SSE41-NEXT: movss %xmm0, (%rax) 3203; SSE41-NEXT: retq 3204; 3205; AVX-LABEL: PR43024: 3206; AVX: # %bb.0: 3207; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] 3208; AVX-NEXT: vmovaps %xmm0, (%rax) 3209; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0 3210; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 3211; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 3212; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 3213; AVX-NEXT: vmovss %xmm0, (%rax) 3214; AVX-NEXT: retq 3215 store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, ptr undef, align 16 3216 %1 = load <4 x float>, ptr undef, align 16 3217 %2 = fmul <4 x float> %1, <float 0x0, float 0x0, float 0x0, float 0x0> 3218 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 3219 %4 = fadd <4 x 
float> %2, %3 3220 %5 = fadd <4 x float> zeroinitializer, %4 3221 %6 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> 3222 %7 = fadd <4 x float> %6, %5 3223 %8 = extractelement <4 x float> %7, i32 0 3224 store float %8, ptr undef, align 8 3225 ret void 3226} 3227 3228declare <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float>, <4 x float>, metadata, metadata) 3229declare <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float>, <4 x float>, metadata, metadata) 3230 3231define void @PR43024_strictfp() strictfp { 3232; SSE2-LABEL: PR43024_strictfp: 3233; SSE2: # %bb.0: 3234; SSE2-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] 3235; SSE2-NEXT: movaps %xmm0, (%rax) 3236; SSE2-NEXT: xorps %xmm1, %xmm1 3237; SSE2-NEXT: mulps %xmm1, %xmm0 3238; SSE2-NEXT: movaps %xmm0, %xmm2 3239; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] 3240; SSE2-NEXT: addps %xmm0, %xmm2 3241; SSE2-NEXT: addps %xmm1, %xmm2 3242; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 3243; SSE2-NEXT: addps %xmm2, %xmm0 3244; SSE2-NEXT: movss %xmm0, (%rax) 3245; SSE2-NEXT: retq 3246; 3247; SSSE3-LABEL: PR43024_strictfp: 3248; SSSE3: # %bb.0: 3249; SSSE3-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] 3250; SSSE3-NEXT: movaps %xmm0, (%rax) 3251; SSSE3-NEXT: xorps %xmm1, %xmm1 3252; SSSE3-NEXT: mulps %xmm1, %xmm0 3253; SSSE3-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 3254; SSSE3-NEXT: addps %xmm0, %xmm2 3255; SSSE3-NEXT: addps %xmm1, %xmm2 3256; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 3257; SSSE3-NEXT: addps %xmm2, %xmm0 3258; SSSE3-NEXT: movss %xmm0, (%rax) 3259; SSSE3-NEXT: retq 3260; 3261; SSE41-LABEL: PR43024_strictfp: 3262; SSE41: # %bb.0: 3263; SSE41-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] 3264; SSE41-NEXT: movaps %xmm0, (%rax) 3265; SSE41-NEXT: xorps %xmm1, %xmm1 3266; SSE41-NEXT: mulps %xmm1, %xmm0 3267; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 3268; SSE41-NEXT: addps %xmm0, 
%xmm2 3269; SSE41-NEXT: addps %xmm1, %xmm2 3270; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 3271; SSE41-NEXT: addps %xmm2, %xmm0 3272; SSE41-NEXT: movss %xmm0, (%rax) 3273; SSE41-NEXT: retq 3274; 3275; AVX-LABEL: PR43024_strictfp: 3276; AVX: # %bb.0: 3277; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] 3278; AVX-NEXT: vmovaps %xmm0, (%rax) 3279; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 3280; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 3281; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] 3282; AVX-NEXT: vaddps %xmm2, %xmm0, %xmm2 3283; AVX-NEXT: vaddps %xmm2, %xmm1, %xmm1 3284; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 3285; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 3286; AVX-NEXT: vmovss %xmm0, (%rax) 3287; AVX-NEXT: retq 3288 store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, ptr undef, align 16 3289 %1 = load <4 x float>, ptr undef, align 16 3290 %2 = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> %1, <4 x float> zeroinitializer, metadata !"round.dynamic", metadata !"fpexcept.strict") 3291 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 3292 %4 = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> %2, <4 x float> %3, metadata !"round.dynamic", metadata !"fpexcept.strict") 3293 %5 = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> zeroinitializer, <4 x float> %4, metadata !"round.dynamic", metadata !"fpexcept.strict") 3294 %6 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> 3295 %7 = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> %6, <4 x float> %5, metadata !"round.dynamic", metadata !"fpexcept.strict") 3296 %8 = extractelement <4 x float> %7, i32 0 3297 store float %8, ptr undef, align 8 3298 ret void 3299} 3300 3301define void @PR45604(ptr %dst, ptr %src) { 3302; SSE2-LABEL: PR45604: 3303; SSE2: # %bb.0: 3304; 
SSE2-NEXT: movdqa (%rsi), %xmm0 3305; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] 3306; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 3307; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535] 3308; SSE2-NEXT: movdqa %xmm2, %xmm3 3309; SSE2-NEXT: pandn %xmm1, %xmm3 3310; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,11,0,0,0,0,0,0,0,11,0,0,0] 3311; SSE2-NEXT: por %xmm1, %xmm3 3312; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] 3313; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 3314; SSE2-NEXT: movdqa %xmm2, %xmm5 3315; SSE2-NEXT: pandn %xmm4, %xmm5 3316; SSE2-NEXT: por %xmm1, %xmm5 3317; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,2,2,2] 3318; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 3319; SSE2-NEXT: movdqa %xmm2, %xmm6 3320; SSE2-NEXT: pandn %xmm4, %xmm6 3321; SSE2-NEXT: por %xmm1, %xmm6 3322; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 3323; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 3324; SSE2-NEXT: pandn %xmm0, %xmm2 3325; SSE2-NEXT: por %xmm1, %xmm2 3326; SSE2-NEXT: movdqa %xmm2, 48(%rdi) 3327; SSE2-NEXT: movdqa %xmm6, 32(%rdi) 3328; SSE2-NEXT: movdqa %xmm5, 16(%rdi) 3329; SSE2-NEXT: movdqa %xmm3, (%rdi) 3330; SSE2-NEXT: retq 3331; 3332; SSSE3-LABEL: PR45604: 3333; SSSE3: # %bb.0: 3334; SSSE3-NEXT: movdqa (%rsi), %xmm0 3335; SSSE3-NEXT: movdqa %xmm0, %xmm1 3336; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[2,3],zero,zero,zero,zero,zero,zero 3337; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,0,11,0,0,0,0,0,0,0,11,0,0,0] 3338; SSSE3-NEXT: por %xmm2, %xmm1 3339; SSSE3-NEXT: movdqa %xmm0, %xmm3 3340; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[4,5],zero,zero,zero,zero,zero,zero,xmm3[6,7],zero,zero,zero,zero,zero,zero 3341; SSSE3-NEXT: por %xmm2, %xmm3 3342; SSSE3-NEXT: movdqa %xmm0, %xmm4 3343; SSSE3-NEXT: pshufb {{.*#+}} xmm4 = xmm4[8,9],zero,zero,zero,zero,zero,zero,xmm4[10,11],zero,zero,zero,zero,zero,zero 3344; SSSE3-NEXT: por %xmm2, %xmm4 3345; 
SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13],zero,zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero 3346; SSSE3-NEXT: por %xmm2, %xmm0 3347; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) 3348; SSSE3-NEXT: movdqa %xmm4, 32(%rdi) 3349; SSSE3-NEXT: movdqa %xmm3, 16(%rdi) 3350; SSSE3-NEXT: movdqa %xmm1, (%rdi) 3351; SSSE3-NEXT: retq 3352; 3353; SSE41-LABEL: PR45604: 3354; SSE41: # %bb.0: 3355; SSE41-NEXT: movdqa (%rsi), %xmm0 3356; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 3357; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 3358; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [0,11,0,11] 3359; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] 3360; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 3361; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 3362; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3],xmm3[4],xmm2[5,6,7] 3363; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] 3364; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 3365; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1,2,3],xmm4[4],xmm2[5,6,7] 3366; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3367; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] 3368; SSE41-NEXT: movdqa %xmm0, (%rdi) 3369; SSE41-NEXT: movdqa %xmm4, 48(%rdi) 3370; SSE41-NEXT: movdqa %xmm3, 32(%rdi) 3371; SSE41-NEXT: movdqa %xmm1, 16(%rdi) 3372; SSE41-NEXT: retq 3373; 3374; AVX1-LABEL: PR45604: 3375; AVX1: # %bb.0: 3376; AVX1-NEXT: vmovdqa (%rsi), %xmm0 3377; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] 3378; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 3379; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0] 3380; AVX1-NEXT: # xmm2 = mem[0,0] 3381; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 3382; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 
3383; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 3384; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 3385; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 3386; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] 3387; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 3388; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 3389; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3390; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 3391; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 3392; AVX1-NEXT: vmovups %ymm0, (%rdi) 3393; AVX1-NEXT: vmovups %ymm1, 32(%rdi) 3394; AVX1-NEXT: vzeroupper 3395; AVX1-NEXT: retq 3396; 3397; AVX2-LABEL: PR45604: 3398; AVX2: # %bb.0: 3399; AVX2-NEXT: vmovdqa (%rsi), %xmm0 3400; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2] 3401; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm2 = [151519488,185205506,218891524,252577542] 3402; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3403; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0] 3404; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] 3405; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] 3406; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3407; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] 3408; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi) 3409; AVX2-NEXT: vmovdqu %ymm1, (%rdi) 3410; AVX2-NEXT: vzeroupper 3411; AVX2-NEXT: retq 3412 %v1 = load <8 x i16>, ptr %src, align 16 3413 %v2 = shufflevector <8 x i16> %v1, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3414 %v3 = shufflevector <16 x i16> %v2, <16 x i16> <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 
11, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31> 3415 store <32 x i16> %v3, ptr %dst, align 16 3416 ret void 3417} 3418 3419; getFauxShuffle AND/ANDN decoding wrongly assumed an undef src always gives an undef dst. 3420define <2 x i64> @PR55157(ptr %0) { 3421; SSE-LABEL: PR55157: 3422; SSE: # %bb.0: 3423; SSE-NEXT: xorps %xmm0, %xmm0 3424; SSE-NEXT: retq 3425; 3426; AVX-LABEL: PR55157: 3427; AVX: # %bb.0: 3428; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 3429; AVX-NEXT: retq 3430 %2 = load <16 x i8>, ptr %0, align 16 3431 %3 = icmp eq <16 x i8> %2, zeroinitializer 3432 %4 = tail call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer) 3433 %5 = select <16 x i1> %3, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %4 3434 %6 = shufflevector <16 x i8> %5, <16 x i8> poison, <16 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15> 3435 %7 = bitcast <16 x i8> %6 to <2 x i64> 3436 ret <2 x i64> %7 3437} 3438declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) 3439 3440; SelectionDAG::isSplatValue - incorrect handling of undef sub-elements 3441define <2 x i64> @PR56520(<16 x i8> %0) { 3442; SSE-LABEL: PR56520: 3443; SSE: # %bb.0: 3444; SSE-NEXT: pxor %xmm1, %xmm1 3445; SSE-NEXT: pcmpeqb %xmm0, %xmm1 3446; SSE-NEXT: movd %xmm1, %eax 3447; SSE-NEXT: movsbl %al, %eax 3448; SSE-NEXT: movd %eax, %xmm0 3449; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3450; SSE-NEXT: retq 3451; 3452; AVX1-LABEL: PR56520: 3453; AVX1: # %bb.0: 3454; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 3455; AVX1-NEXT: 
vpcmpeqb %xmm1, %xmm0, %xmm0 3456; AVX1-NEXT: vmovd %xmm0, %eax 3457; AVX1-NEXT: movsbl %al, %eax 3458; AVX1-NEXT: vmovd %eax, %xmm0 3459; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3460; AVX1-NEXT: retq 3461; 3462; AVX2-SLOW-LABEL: PR56520: 3463; AVX2-SLOW: # %bb.0: 3464; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 3465; AVX2-SLOW-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 3466; AVX2-SLOW-NEXT: vmovd %xmm0, %eax 3467; AVX2-SLOW-NEXT: movsbl %al, %eax 3468; AVX2-SLOW-NEXT: vmovd %eax, %xmm0 3469; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 3470; AVX2-SLOW-NEXT: retq 3471; 3472; AVX2-FAST-LABEL: PR56520: 3473; AVX2-FAST: # %bb.0: 3474; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 3475; AVX2-FAST-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 3476; AVX2-FAST-NEXT: vmovd %xmm0, %eax 3477; AVX2-FAST-NEXT: movsbl %al, %eax 3478; AVX2-FAST-NEXT: vmovd %eax, %xmm0 3479; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero 3480; AVX2-FAST-NEXT: retq 3481 %2 = icmp eq <16 x i8> zeroinitializer, %0 3482 %3 = extractelement <16 x i1> %2, i64 0 3483 %4 = sext i1 %3 to i32 3484 %5 = insertelement <2 x i32> zeroinitializer, i32 %4, i64 0 3485 %6 = zext <2 x i32> %5 to <2 x i64> 3486 %7 = shufflevector <2 x i64> %6, <2 x i64> zeroinitializer, <2 x i32> zeroinitializer 3487 ret <2 x i64> %7 3488} 3489 3490define <4 x i32> @PR63700(i128 %0) { 3491; SSE2-LABEL: PR63700: 3492; SSE2: # %bb.0: 3493; SSE2-NEXT: movd %edi, %xmm0 3494; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 3495; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3496; SSE2-NEXT: retq 3497; 3498; SSSE3-LABEL: PR63700: 3499; SSSE3: # %bb.0: 3500; SSSE3-NEXT: movd %edi, %xmm0 3501; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero 3502; SSSE3-NEXT: retq 3503; 3504; SSE41-LABEL: PR63700: 3505; SSE41: # %bb.0: 3506; SSE41-NEXT: movd %edi, %xmm0 3507; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 3508; SSE41-NEXT: pmovzxdq 
{{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 3509; SSE41-NEXT: retq 3510; 3511; AVX1-LABEL: PR63700: 3512; AVX1: # %bb.0: 3513; AVX1-NEXT: vmovd %edi, %xmm0 3514; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 3515; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 3516; AVX1-NEXT: retq 3517; 3518; AVX2-SLOW-LABEL: PR63700: 3519; AVX2-SLOW: # %bb.0: 3520; AVX2-SLOW-NEXT: vmovd %edi, %xmm0 3521; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 3522; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 3523; AVX2-SLOW-NEXT: retq 3524; 3525; AVX2-FAST-LABEL: PR63700: 3526; AVX2-FAST: # %bb.0: 3527; AVX2-FAST-NEXT: vmovq %rdi, %xmm0 3528; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero 3529; AVX2-FAST-NEXT: retq 3530 %vcmp = bitcast i128 %0 to <4 x i32> 3531 %shuffle.i = shufflevector <4 x i32> %vcmp, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef> 3532 %shuffle.i11 = shufflevector <4 x i32> %shuffle.i, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 5> 3533 ret <4 x i32> %shuffle.i11 3534} 3535 3536define <16 x i8> @PR107289(<16 x i8> %0) { 3537; SSE-LABEL: PR107289: 3538; SSE: # %bb.0: 3539; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 3540; SSE-NEXT: retq 3541; 3542; AVX-LABEL: PR107289: 3543; AVX: # %bb.0: 3544; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 3545; AVX-NEXT: retq 3546 %src = bitcast <16 x i8> %0 to i128 3547 %shl = shl i128 %src, 8 3548 %res = bitcast i128 %shl to <16 x i8> 3549 ret <16 x i8> %res 3550} 3551 3552; Test case reported on D105827 3553define void @SpinningCube() { 3554; SSE2-LABEL: SpinningCube: 3555; SSE2: # %bb.0: # %entry 3556; SSE2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 3557; SSE2-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0] 3558; SSE2-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] 3559; SSE2-NEXT: movapd {{.*#+}} xmm2 = 
[u,u,-2.0E+0,u] 3560; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 3561; SSE2-NEXT: xorps %xmm3, %xmm3 3562; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] 3563; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 3564; SSE2-NEXT: addps %xmm3, %xmm1 3565; SSE2-NEXT: movaps %xmm1, (%rax) 3566; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 3567; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] 3568; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 3569; SSE2-NEXT: addps %xmm0, %xmm1 3570; SSE2-NEXT: movaps %xmm1, (%rax) 3571; SSE2-NEXT: retq 3572; 3573; SSSE3-LABEL: SpinningCube: 3574; SSSE3: # %bb.0: # %entry 3575; SSSE3-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 3576; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0] 3577; SSSE3-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] 3578; SSSE3-NEXT: movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u] 3579; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 3580; SSSE3-NEXT: xorps %xmm3, %xmm3 3581; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] 3582; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 3583; SSSE3-NEXT: addps %xmm3, %xmm1 3584; SSSE3-NEXT: movaps %xmm1, (%rax) 3585; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 3586; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2] 3587; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 3588; SSSE3-NEXT: addps %xmm0, %xmm1 3589; SSSE3-NEXT: movaps %xmm1, (%rax) 3590; SSSE3-NEXT: retq 3591; 3592; SSE41-LABEL: SpinningCube: 3593; SSE41: # %bb.0: # %entry 3594; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 3595; SSE41-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0] 3596; SSE41-NEXT: movaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u] 3597; SSE41-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] 3598; SSE41-NEXT: movaps %xmm1, %xmm3 3599; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0] 3600; SSE41-NEXT: movaps %xmm0, %xmm4 3601; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3] 3602; 
SSE41-NEXT: addps %xmm3, %xmm4 3603; SSE41-NEXT: movaps %xmm4, (%rax) 3604; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 3605; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,2] 3606; SSE41-NEXT: mulps %xmm1, %xmm2 3607; SSE41-NEXT: addps %xmm0, %xmm2 3608; SSE41-NEXT: movaps %xmm2, (%rax) 3609; SSE41-NEXT: retq 3610; 3611; AVX-LABEL: SpinningCube: 3612; AVX: # %bb.0: # %entry 3613; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 3614; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 3615; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u] 3616; AVX-NEXT: vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] 3617; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0] 3618; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3] 3619; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2 3620; AVX-NEXT: vmovaps %xmm2, (%rax) 3621; AVX-NEXT: vbroadcastss (%rax), %xmm2 3622; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1 3623; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 3624; AVX-NEXT: vmovaps %xmm0, (%rax) 3625; AVX-NEXT: retq 3626entry: 3627 store float 1.000000e+00, ptr undef, align 4 3628 %0 = load float, ptr undef, align 4 3629 %1 = fmul float undef, 0.000000e+00 3630 %2 = insertelement <4 x float> poison, float %0, i32 3 3631 %3 = load float, ptr undef, align 4 3632 %4 = insertelement <2 x float> poison, float %3, i32 0 3633 %5 = shufflevector <2 x float> %4, <2 x float> poison, <2 x i32> zeroinitializer 3634 %6 = fmul <2 x float> %5, <float 0.000000e+00, float -2.000000e+00> 3635 %7 = fadd float %1, undef 3636 %8 = shufflevector <2 x float> %6, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 3637 %9 = shufflevector <4 x float> undef, <4 x float> %8, <4 x i32> <i32 0, i32 4, i32 5, i32 undef> 3638 %10 = insertelement <4 x float> %9, float %7, i32 3 3639 %11 = insertelement <4 x float> %2, float 0x7FF8000000000000, i32 1 3640 %12 = insertelement <4 x float> %11, float undef, i32 0 3641 %13 = insertelement <4 x float> %12, float 
undef, i32 2 3642 %14 = fadd <4 x float> %10, %13 3643 store <4 x float> %14, ptr undef, align 16 3644 %15 = load float, ptr undef, align 4 3645 %16 = insertelement <2 x float> poison, float %15, i32 0 3646 %17 = shufflevector <2 x float> %16, <2 x float> poison, <2 x i32> zeroinitializer 3647 %18 = fmul <2 x float> %17, <float 0.000000e+00, float -2.000000e+00> 3648 %19 = shufflevector <2 x float> %18, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 3649 %20 = shufflevector <4 x float> undef, <4 x float> %19, <4 x i32> <i32 0, i32 4, i32 5, i32 undef> 3650 %21 = fadd <4 x float> %20, %2 3651 store <4 x float> %21, ptr undef, align 16 3652 ret void 3653} 3654 3655; Infinite loop test case reported on 5ca77541446d 3656define void @autogen_SD25931() { 3657; CHECK-LABEL: autogen_SD25931: 3658; CHECK: # %bb.0: # %BB 3659; CHECK-NEXT: .p2align 4 3660; CHECK-NEXT: .LBB142_1: # %CF242 3661; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 3662; CHECK-NEXT: jmp .LBB142_1 3663BB: 3664 %Cmp16 = icmp uge <2 x i1> zeroinitializer, zeroinitializer 3665 %Shuff19 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Cmp16, <2 x i32> <i32 3, i32 1> 3666 %Shuff33 = shufflevector <2 x i1> %Shuff19, <2 x i1> zeroinitializer, <2 x i32> <i32 0, i32 2> 3667 br label %CF250 3668 3669CF250: ; preds = %CF250, %BB 3670 br i1 poison, label %CF250, label %CF259 3671 3672CF259: ; preds = %CF250 3673 %Cmp83 = icmp ule <2 x i1> %Shuff19, zeroinitializer 3674 br label %CF242 3675 3676CF242: ; preds = %CF242, %CF259 3677 %Shuff153 = shufflevector <2 x i1> %Shuff33, <2 x i1> poison, <2 x i32> <i32 3, i32 1> 3678 %Shuff161 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Cmp83, <2 x i32> <i32 1, i32 3> 3679 br label %CF242 3680} 3681