; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"

; For this test we used to optimize the <i1 true, i1 false, i1 false, i1 true>
; mask into <i32 2147483648, i32 0, i32 0, i32 2147483648> because we thought
; we would lower that into a blend where only the high bit is relevant.
; However, since the whole mask is constant, this is simplified incorrectly
; by the generic code, because it was expecting -1 in place of 2147483648.
;
; The problem does not occur without AVX, because vselect of v4i32 is neither
; legal nor custom.
;
; <rdar://problem/18675020>

define void @test(ptr %a, ptr %b) {
; AVX-LABEL: test:
; AVX:       ## %bb.0: ## %body
; AVX-NEXT:    movabsq $4167800517033787389, %rax ## imm = 0x39D7007D007CFFFD
; AVX-NEXT:    movq %rax, (%rdi)
; AVX-NEXT:    movabsq $-281474976645121, %rax ## imm = 0xFFFF00000000FFFF
; AVX-NEXT:    movq %rax, (%rsi)
; AVX-NEXT:    retq
body:
  %predphi = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -3, i16 545, i16 4385, i16 14807>, <4 x i16> <i16 123, i16 124, i16 125, i16 127>
  %predphi42 = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer
  store <4 x i16> %predphi, ptr %a, align 8
  store <4 x i16> %predphi42, ptr %b, align 8
  ret void
}

; Improve code coverage.
;
; When shrinking the condition used in the select to match a blend, this
; test case exercises the path where the modified node is not the root
; of the condition.
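;
; In the checked output below, AVX1/AVX2 sign-extend the <4 x i1> condition to a
; 256-bit blend mask (vpslld $31 followed by vpmovsxdq) and feed it to vblendvpd,
; while AVX512VL converts it to a k-mask with vptestmd and uses a masked broadcast.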

define void @test2(ptr %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
; AVX1-LABEL: test2:
; AVX1:       ## %bb.0: ## %bb
; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    movq (%rdi,%rsi,8), %rax
; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
; AVX1-NEXT:    vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX1-NEXT:    vmovupd %ymm0, (%rax)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test2:
; AVX2:       ## %bb.0: ## %bb
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT:    movq (%rdi,%rsi,8), %rax
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
; AVX2-NEXT:    vmovupd %ymm0, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test2:
; AVX512:       ## %bb.0: ## %bb
; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
; AVX512-NEXT:    movq (%rdi,%rsi,8), %rax
; AVX512-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
; AVX512-NEXT:    vbroadcastsd {{.*#+}} ymm0 {%k1} = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmovupd %ymm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
bb:
  %arrayidx1928 = getelementptr inbounds ptr, ptr %call1559, i64 %indvars.iv4198
  %tmp1888 = load ptr, ptr %arrayidx1928, align 8
  %predphi.v.v = select <4 x i1> %tmp1895, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
  store <4 x double> %predphi.v.v, ptr %tmp1888, align 8
  ret void
}

; For this test, we used to optimize the conditional mask for the blend, i.e.,
; we shrank some of its bits.
; However, this same mask was used in another select (%predphi31) that turned out
; to be optimized into an and. In that case, the conditional mask was wrong.
;
; Make sure that the and is fed by the original mask.
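; In the AVX1/AVX2 output below, the packed mask in %xmm0 (vpcmpeqd + vpackssdw)
; both drives the vpblendvb and is stored directly as %predphi31, so a single
; full-width mask serves both selects.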
;
; <rdar://problem/18819506>

define void @test3(<4 x i32> %induction30, ptr %tmp16, ptr %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
; AVX1-LABEL: test3:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmovq %xmm0, (%rdi)
; AVX1-NEXT:    vmovq %xmm1, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test3:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2863311531,2863311531,2863311531,2863311531]
; AVX2-NEXT:    vpmulld %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [715827882,715827882,715827882,715827882]
; AVX2-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1431655764,1431655764,1431655764,1431655764]
; AVX2-NEXT:    vpminud %xmm3, %xmm0, %xmm3
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vmovq %xmm0, (%rdi)
; AVX2-NEXT:    vmovq %xmm1, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test3:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    vpcmpleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm2 ^ (xmm0 & (xmm1 ^ xmm2))
; AVX512-NEXT:    vmovq %xmm0, (%rdi)
; AVX512-NEXT:    vmovq %xmm1, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3>
  %tmp7 = icmp eq <4 x i32> %tmp6, zeroinitializer
  %predphi = select <4 x i1> %tmp7, <4 x i16> %tmp3, <4 x i16> %tmp12
  %predphi31 = select <4 x i1> %tmp7, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer

  store <4 x i16> %predphi31, ptr %tmp16, align 8
  store <4 x i16> %predphi, ptr %tmp17, align 8
  ret void
}

; We shouldn't try to lower this directly using VSELECT because we don't have
; vpblendvb in AVX1, only in AVX2. Instead, it should be expanded.
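;
; In the AVX1 output below, each 128-bit half materializes the mask as 0/-1 bytes
; (vpsllw $7 + vpcmpgtb against zero) and then adds <2,2,...>, which yields 1 in
; the true lanes and 2 in the false lanes without needing a byte blend.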

define <32 x i8> @PR22706(<32 x i1> %x) {
; AVX1-LABEL: PR22706:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsllw $7, %xmm1, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpaddb %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR22706:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR22706:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX512-NEXT:    vpblendvb %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512-NEXT:    retq
  %tmp = select <32 x i1> %x, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
  ret <32 x i8> %tmp
}

; Don't concat select/blendv ops if the concatenated mask isn't legal.
define void @PR59003(<2 x float> %0, <2 x float> %1, <8 x i1> %shuffle108) {
; AVX-LABEL: PR59003:
; AVX:       ## %bb.0: ## %entry
; AVX-NEXT:    .p2align 4
; AVX-NEXT:  LBB4_1: ## %for.body.i
; AVX-NEXT:    ## =>This Inner Loop Header: Depth=1
; AVX-NEXT:    jmp LBB4_1
entry:
  br label %for.body.i

for.body.i:                                       ; preds = %for.body.i, %entry
  %2 = phi <8 x float> [ zeroinitializer, %entry ], [ %3, %for.body.i ]
  %shuffle111 = shufflevector <2 x float> %0, <2 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  %shuffle112 = shufflevector <2 x float> %1, <2 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  %3 = select <8 x i1> %shuffle108, <8 x float> %shuffle111, <8 x float> %shuffle112
  %4 = shufflevector <8 x float> zeroinitializer, <8 x float> %2, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %5 = select <8 x i1> zeroinitializer, <8 x float> zeroinitializer, <8 x float> %2
  br label %for.body.i
}


; Split a 256-bit select into two 128-bit selects when the operands are concatenated.
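; On AVX1 this shows up below as two 128-bit vblendvps (low half plus a
; vextractf128'd high half), while AVX2 keeps a single 256-bit vblendvps and
; AVX512VL folds the select into a masked vpslld.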

define void @blendv_split(ptr %p, <8 x i32> %cond, <8 x i32> %a, <8 x i32> %x, <8 x i32> %y, <8 x i32> %z, <8 x i32> %w) {
; AVX1-LABEL: blendv_split:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
; AVX1-NEXT:    vpslld %xmm2, %xmm4, %xmm5
; AVX1-NEXT:    vpslld %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; AVX1-NEXT:    vpslld %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vpslld %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vblendvps %xmm0, %xmm5, %xmm4, %xmm0
; AVX1-NEXT:    vmovups %xmm0, 16(%rdi)
; AVX1-NEXT:    vmovups %xmm1, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: blendv_split:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
; AVX2-NEXT:    vpslld %xmm2, %ymm1, %ymm2
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; AVX2-NEXT:    vpslld %xmm3, %ymm1, %ymm1
; AVX2-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT:    vmovups %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: blendv_split:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpsrld $31, %ymm0, %ymm0
; AVX512-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX512-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero
; AVX512-NEXT:    vpslld %xmm2, %ymm1, %ymm2
; AVX512-NEXT:    vpslld %xmm0, %ymm1, %ymm2 {%k1}
; AVX512-NEXT:    vmovdqu %ymm2, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %signbits = ashr <8 x i32> %cond, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %bool = trunc <8 x i32> %signbits to <8 x i1>
  %shamt1 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> zeroinitializer
  %shamt2 = shufflevector <8 x i32> %y, <8 x i32> undef, <8 x i32> zeroinitializer
  %sh1 = shl <8 x i32> %a, %shamt1
  %sh2 = shl <8 x i32> %a, %shamt2
  %sel = select <8 x i1> %bool, <8 x i32> %sh1, <8 x i32> %sh2
  store <8 x i32> %sel, ptr %p, align 4
  ret void
}

; Concatenate 128-bit pblendvb back together on AVX2+ targets (hidden by SSE __m128i bitcasts)
define <4 x i64> @vselect_concat_split_v16i8(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; AVX1-LABEL: vselect_concat_split_v16i8:
; AVX1:       ## %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vselect_concat_split_v16i8:
; AVX2:       ## %bb.0:
; AVX2-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vselect_concat_split_v16i8:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm2 & (ymm0 ^ ymm1))
; AVX512-NEXT:    retq
  %a.bc = bitcast <4 x i64> %a to <32 x i8>
  %b.bc = bitcast <4 x i64> %b to <32 x i8>
  %c.bc = bitcast <4 x i64> %c to <32 x i8>
  %d.bc = bitcast <4 x i64> %d to <32 x i8>
  %cmp = icmp slt <32 x i8> %c.bc, %d.bc
  %a.lo = shufflevector <32 x i8> %a.bc, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.lo = shufflevector <32 x i8> %b.bc, <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %cmp.lo = shufflevector <32 x i1> %cmp, <32 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %lo = select <16 x i1> %cmp.lo, <16 x i8> %b.lo, <16 x i8> %a.lo
  %a.hi = shufflevector <32 x i8> %a.bc, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %b.hi = shufflevector <32 x i8> %b.bc, <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %cmp.hi = shufflevector <32 x i1> %cmp, <32 x i1> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %hi = select <16 x i1> %cmp.hi, <16 x i8> %b.hi, <16 x i8> %a.hi
  %concat = shufflevector <16 x i8> %lo, <16 x i8> %hi, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %result = bitcast <32 x i8> %concat to <4 x i64>
  ret <4 x i64> %result
}

; Regression test for rGea8fb3b60196
define void @vselect_concat() {
; AVX-LABEL: vselect_concat:
; AVX:       ## %bb.0: ## %entry
; AVX-NEXT:    retq
entry:
  %0 = load <8 x i32>, ptr undef
  %1 = shufflevector <8 x i32> zeroinitializer, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <8 x i32> %0, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> zeroinitializer, <4 x i32> %1, <4 x i32> %2
  %4 = shufflevector <8 x i32> zeroinitializer, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %5 = shufflevector <8 x i32> %0, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %6 = select <4 x i1> zeroinitializer, <4 x i32> %4, <4 x i32> %5
  %7 = shufflevector <4 x i32> %3, <4 x i32> %6, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i32> %7, ptr undef
  ret void
}

; Regression test for rGb5d7beeb9792
define void @vselect_concat_splat() {
; AVX1-LABEL: vselect_concat_splat:
; AVX1:       ## %bb.0: ## %entry
; AVX1-NEXT:    vmovups (%rax), %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[0,3,2,1]
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,3,2]
; AVX1-NEXT:    vmovups 16, %xmm2
; AVX1-NEXT:    vmovups 32, %xmm3
; AVX1-NEXT:    vblendps {{.*#+}} xmm4 = mem[0],xmm3[1],mem[2,3]
; AVX1-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3]
; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[0,3,2,1]
; AVX1-NEXT:    vblendps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
; AVX1-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0,3,2]
; AVX1-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vcmpneqps %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vblendvps %xmm3, %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vblendvps %xmm3, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovups %xmm0, (%rax)
; AVX1-NEXT:    vmovups %xmm1, (%rax)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vselect_concat_splat:
; AVX2:       ## %bb.0: ## %entry
; AVX2-NEXT:    vmovups (%rax), %ymm0
; AVX2-NEXT:    vmovups (%rax), %xmm1
; AVX2-NEXT:    vmovaps {{.*#+}} xmm2 = [0,3,6,1]
; AVX2-NEXT:    vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT:    vpermps %ymm3, %ymm2, %ymm3
; AVX2-NEXT:    vmovaps {{.*#+}} xmm4 = [1,4,7,2]
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vmovups 0, %ymm1
; AVX2-NEXT:    vmovups 32, %xmm5
; AVX2-NEXT:    vblendps {{.*#+}} ymm6 = ymm1[0],ymm5[1],ymm1[2,3,4,5,6,7]
; AVX2-NEXT:    vpermps %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vpermps %ymm1, %ymm4, %ymm1
; AVX2-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vcmpneqps %xmm4, %xmm3, %xmm4
; AVX2-NEXT:    vblendvps %xmm4, %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vblendvps %xmm4, %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovups %xmm0, (%rax)
; AVX2-NEXT:    vmovups %xmm2, (%rax)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vselect_concat_splat:
; AVX512:       ## %bb.0: ## %entry
; AVX512-NEXT:    vmovups (%rax), %ymm0
; AVX512-NEXT:    vmovups (%rax), %xmm1
; AVX512-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,1,4,7,10]
; AVX512-NEXT:    vmovaps %ymm2, %ymm3
; AVX512-NEXT:    vpermi2ps %ymm1, %ymm0, %ymm3
; AVX512-NEXT:    vmovups 32, %xmm4
; AVX512-NEXT:    vmovups 0, %ymm5
; AVX512-NEXT:    vxorps %xmm6, %xmm6, %xmm6
; AVX512-NEXT:    vcmpneqps %xmm6, %xmm3, %k0
; AVX512-NEXT:    kshiftlw $4, %k0, %k1
; AVX512-NEXT:    korw %k1, %k0, %k1
; AVX512-NEXT:    vpermt2ps %ymm4, %ymm2, %ymm5
; AVX512-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
; AVX512-NEXT:    vmovaps %ymm5, %ymm0 {%k1}
; AVX512-NEXT:    vmovups %ymm0, (%rax)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
entry:
  %wide.vec = load <12 x float>, ptr undef, align 1
  %strided.vec = shufflevector <12 x float> %wide.vec, <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %strided.vec29 = shufflevector <12 x float> %wide.vec, <12 x float> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %wide.vec31 = load <12 x float>, ptr null, align 1
  %strided.vec32 = shufflevector <12 x float> %wide.vec31, <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %strided.vec33 = shufflevector <12 x float> %wide.vec31, <12 x float> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
  %i = select i1 false, <4 x float> zeroinitializer, <4 x float> %strided.vec
  %i1 = fcmp une <4 x float> %i, zeroinitializer
  %i2 = select <4 x i1> %i1, <4 x float> %strided.vec32, <4 x float> %strided.vec
  %.v = select <4 x i1> %i1, <4 x float> %strided.vec33, <4 x float> %strided.vec29
  %.uncasted = shufflevector <4 x float> %i2, <4 x float> %.v, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %.uncasted, ptr undef, align 1
  ret void
}