; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-pc-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,X86-AVX512,AVX512-SLOW,X86-AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,X64-AVX512,AVX512-SLOW,X64-AVX512-SLOW
; RUN: llc < %s -mtriple=i386-pc-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,X86-AVX512,AVX512-FAST,X86-AVX512-FAST
; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,X64-AVX512,AVX512-FAST,X64-AVX512-FAST
; RUN: llc < %s -mtriple=i386-pc-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F,X86-AVX512F
; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F,X64-AVX512F
;
; Check-prefix legend, derived from the invocations above. Prefix names are
; deliberately written without a trailing colon in commentary so that FileCheck
; does not interpret these lines as directives.
;   CHECK         - shared by every invocation.
;   AVX512        - invocations with +avx512f,+avx512vl,+avx512bw,+avx512dq.
;   AVX512-SLOW   - the AVX512 invocations without the fast-variable-shuffle
;                   tuning features.
;   AVX512-FAST   - the AVX512 invocations with +fast-variable-crosslane-shuffle
;                   and +fast-variable-perlane-shuffle.
;   AVX512F       - invocations with only +avx512f.
;   X86-* / X64-* - the i386 / x86_64 triple of the corresponding prefix.

; Expand 128 -> 256 bits, <4 x float> and <2 x double> sources.
;
; expand - result lanes 0 and 2 take %a[0] and %a[1]; every other lane reads
; the zero operand. The AVX512 invocations expect a zero-masked vexpandps with
; mask 5 (0b00000101); the AVX512F-only invocations expect a shuffle + blend.
define <8 x float> @expand(<4 x float> %a) {
; AVX512-LABEL: expand:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT: movb $5, %al
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: expand:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7]
; AVX512F-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 5, i32 1, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x float> %res
}

; expand1 - same idea with the operands swapped (zero vector first): even
; result lanes are zero and odd lanes take %a[0..3] in order, i.e. expand mask
; -86 (0xAA). The AVX512F-only invocations expect a widened vpermt2ps instead.
define <8 x float> @expand1(<4 x float> %a ) {
; AVX512-LABEL: expand1:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT: movb $-86, %al
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: expand1:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [16,0,18,1,20,2,22,3]
; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x float> %res
}

; Expand 128 -> 256 bits, <2 x double> -> <4 x double>.
;
; expand2 - lanes 0 and 3 take %a[0] and %a[1]; lanes 1 and 2 are zero. Every
; invocation shares the same expected sequence (vperm2f128 + vmovaps +
; vblendps), so the checks use the common prefix and no vexpandpd is expected.
define <4 x double> @expand2(<2 x double> %a) {
; CHECK-LABEL: expand2:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1]
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 2, i32 1>
  ret <4 x double> %res
}

; Expand 128 -> 256 bits, integer case <4 x i32> -> <8 x i32>.
;
; expand3 - lanes 0 and 7 take %a[0] and %a[1]; lanes 1..6 are zero, i.e.
; expand mask -127 (0x81) for vpexpandd in the AVX512 invocations.
define <8 x i32> @expand3(<4 x i32> %a ) {
; AVX512-LABEL: expand3:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT: movb $-127, %al
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: expand3:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
; AVX512F-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <8 x i32> <i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,i32 5>
  ret <8 x i32> %res
}

; Expand 128 -> 256 bits, integer case <2 x i64> -> <4 x i64>.
;
; expand4 - lanes 0 and 3 take %a[0] and %a[1]; lanes 1 and 2 are zero, i.e.
; expand mask 9 (0b1001) for vpexpandq in the AVX512 invocations.
define <4 x i64> @expand4(<2 x i64> %a ) {
; AVX512-LABEL: expand4:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT: movb $9, %al
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: expand4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512F-NEXT: vperm2f128 {{.*#+}} ymm1 = zero,zero,ymm0[0,1]
; AVX512F-NEXT: vmovaps %xmm0, %xmm0
; AVX512F-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <4 x i32> <i32 2, i32 0, i32 0, i32 3>
  ret <4 x i64> %res
}

; Negative test for 128 -> 256 bits.
;
; expand5 - every odd lane reads the same element %a[0] (mask index 4), so the
; source elements are not consumed in increasing order and no expand
; instruction appears in any of the expected sequences.
define <8 x float> @expand5(<4 x float> %a ) {
; AVX512-SLOW-LABEL: expand5:
; AVX512-SLOW: # %bb.0:
; AVX512-SLOW-NEXT: vbroadcastss %xmm0, %ymm0
; AVX512-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; AVX512-SLOW-NEXT: ret{{[l|q]}}
;
; AVX512-FAST-LABEL: expand5:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-FAST-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [8,10,12,14]
; AVX512-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
; AVX512-FAST-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: expand5:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vbroadcastss %xmm0, %ymm0
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; AVX512F-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
  ret <8 x float> %res
}

; expand6 - still 128 -> 256 bits: the identity mask 0..7 concatenates the zero
; vector (low half) with %a (high half). Every invocation expects a plain
; vinsertf128 into a zeroed register.
define <8 x float> @expand6(<4 x float> %a ) {
; CHECK-LABEL: expand6:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

; Expand 256 -> 512 bits, <8 x float> -> <16 x float>.
;
; expand7 - lanes 0, 2, 8 and 10 take %a[0..3]; all other lanes read the zero
; operand, i.e. expand mask 0x505. Both feature sets expect vexpandps on zmm;
; they differ only in the mask move (kmovd vs. kmovw).
define <16 x float> @expand7(<8 x float> %a) {
; AVX512-LABEL: expand7:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: movw $1285, %ax # imm = 0x505
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: expand7:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: movw $1285, %ax # imm = 0x505
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 8, i32 8, i32 8, i32 8, i32 2, i32 8, i32 3, i32 8, i32 8, i32 8, i32 8, i32 8>
  ret <16 x float> %res
}

; expand8 - zero vector first: even lanes are zero, odd lanes take %a[0..7] in
; order, i.e. expand mask 0xAAAA.
define <16 x float> @expand8(<8 x float> %a ) {
; AVX512-LABEL: expand8:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: expand8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x float> %res
}

; Expand 256 -> 512 bits, <4 x double> -> <8 x double>.
;
; expand9 - lanes 0 and 7 take %a[0] and %a[1]; lanes 1..6 are zero, i.e.
; expand mask -127 (0x81) for vexpandpd.
define <8 x double> @expand9(<4 x double> %a) {
; AVX512-LABEL: expand9:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: movb $-127, %al
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: expand9:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: movb $-127, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 1>
  ret <8 x double> %res
}

; expand10 - integer counterpart of expand8 (<8 x i32> -> <16 x i32>): odd
; lanes take %a[0..7], expand mask 0xAAAA, vpexpandd.
define <16 x i32> @expand10(<8 x i32> %a ) {
; AVX512-LABEL: expand10:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: expand10:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x i32> %res
}

; expand11 - integer counterpart of expand9 (<4 x i64> -> <8 x i64>): lanes 0
; and 7 take %a[0] and %a[1], expand mask -127 (0x81), vpexpandq.
define <8 x i64> @expand11(<4 x i64> %a) {
; AVX512-LABEL: expand11:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: movb $-127, %al
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: expand11:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: movb $-127, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 1>
  ret <8 x i64> %res
}

; Negative test for 256 -> 512 bits.
;
; expand12 - every odd lane reads the same element %a[0] (mask index 8), so no
; expand instruction is expected; all invocations share a vpermt2ps sequence.
define <16 x float> @expand12(<8 x float> %a) {
; CHECK-LABEL: expand12:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8,i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8>
  ret <16 x float> %res
}

; expand13 - the identity mask 0..15 concatenates the zero vector (low half)
; with %a (high half); every invocation expects vinsertf64x4 into a zeroed
; register.
define <16 x float> @expand13(<8 x float> %a ) {
; CHECK-LABEL: expand13:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %res
}

; The next test covers a shuffle operand that mixes zero and non-zero constant
; elements, where the shuffle mask selects only the zero elements of that
; operand.
; expand14 - %addV is <0.0, 1.0, 2.0, 0.0> added to itself, so its lanes 0 and
; 3 are 0.0 + 0.0. The shuffle mask only reads those two lanes of %addV
; (indices 0 and 3) and places %a[0] / %a[1] in result lanes 2 and 4. The
; AVX512 invocations therefore expect a zero-masked vexpandps with mask 20
; (0b00010100); the AVX512F-only invocations expect a widened vpermt2ps.
define <8 x float> @expand14(<4 x float> %a) {
; AVX512-LABEL: expand14:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-NEXT: movb $20, %al
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: expand14:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [16,17,0,19,1,21,22,23]
; AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: ret{{[l|q]}}
  %addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0>
  %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 3, i32 3, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0>
  ret <8 x float> %res
}

; Negative test.
;
; expand15 - same constant operand as expand14, but result lane 1 reads
; %addV[1] (1.0 + 1.0, not zero). The result can no longer be formed by
; zero-masking, so every expected sequence blends %a with a constant loaded
; from memory instead of using an expand instruction.
define <8 x float> @expand15(<4 x float> %a) {
; AVX512-SLOW-LABEL: expand15:
; AVX512-SLOW: # %bb.0:
; AVX512-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
; AVX512-SLOW-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7]
; AVX512-SLOW-NEXT: ret{{[l|q]}}
;
; AVX512-FAST-LABEL: expand15:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,1,0]
; AVX512-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX512-FAST-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7]
; AVX512-FAST-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: expand15:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
; AVX512F-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7]
; AVX512F-NEXT: ret{{[l|q]}}
  %addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0>
  %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0>
  ret <8 x float> %res
}

; Shuffle to blend test
;
; Each test below is a two-input shuffle in which result lane i is either
; %A[i] or %W[i] (no lane crosses position), so it can be lowered as a
; mask-register blend.

; 512-bit byte blend - even lanes from %W, odd lanes from %A (64-bit mask
; 0xAAAA...AA). The i386 AVX512 invocations build the 64-bit mask from a 32-bit
; immediate with kunpckdq; the x86_64 ones load it with movabsq. The
; AVX512F-only invocations expect a vpternlogq bit-select with a broadcast
; byte mask instead of vpblendmb.
define <64 x i8> @test_mm512_mask_blend_epi8(<64 x i8> %A, <64 x i8> %W){
; X86-AVX512-LABEL: test_mm512_mask_blend_epi8:
; X86-AVX512: # %bb.0: # %entry
; X86-AVX512-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; X86-AVX512-NEXT: kmovd %eax, %k0
; X86-AVX512-NEXT: kunpckdq %k0, %k0, %k1
; X86-AVX512-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
; X86-AVX512-NEXT: retl
;
; X64-AVX512-LABEL: test_mm512_mask_blend_epi8:
; X64-AVX512: # %bb.0: # %entry
; X64-AVX512-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
; X64-AVX512-NEXT: kmovq %rax, %k1
; X64-AVX512-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
; X64-AVX512-NEXT: retq
;
; AVX512F-LABEL: test_mm512_mask_blend_epi8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1))
; AVX512F-NEXT: ret{{[l|q]}}
entry:
  %0 = shufflevector <64 x i8> %A, <64 x i8> %W, <64 x i32> <i32 64, i32 1, i32 66, i32 3, i32 68, i32 5, i32 70, i32 7, i32 72, i32 9, i32 74, i32 11, i32 76, i32 13, i32 78, i32 15, i32 80, i32 17, i32 82, i32 19, i32 84, i32 21, i32 86, i32 23, i32 88, i32 25, i32 90, i32 27, i32 92, i32 29, i32 94, i32 31, i32 96, i32 33, i32 98, i32 35, i32 100, i32 37, i32 102, i32 39, i32 104, i32 41, i32 106, i32 43, i32 108, i32 45, i32 110, i32 47, i32 112, i32 49, i32 114, i32 51, i32 116, i32 53, i32 118, i32 55, i32 120, i32 57, i32 122, i32 59, i32 124, i32 61, i32 126, i32 63>
  ret <64 x i8> %0
}

; 512-bit word blend - even lanes from %W, odd lanes from %A (mask 0xAAAAAAAA).
; The AVX512F-only invocations expect a vpternlogd bit-select with the mask
; taken from memory instead of vpblendmw.
define <32 x i16> @test_mm512_mask_blend_epi16(<32 x i16> %A, <32 x i16> %W){
; AVX512-LABEL: test_mm512_mask_blend_epi16:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: test_mm512_mask_blend_epi16:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512F-NEXT: ret{{[l|q]}}
entry:
  %0 = shufflevector <32 x i16> %A, <32 x i16> %W, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
  ret <32 x i16> %0
}

; 512-bit dword blend - even lanes from %W, odd lanes from %A (mask 0xAAAA).
; Both feature sets expect vpblendmd; they differ only in kmovd vs. kmovw.
define <16 x i32> @test_mm512_mask_blend_epi32(<16 x i32> %A, <16 x i32> %W){
; AVX512-LABEL: test_mm512_mask_blend_epi32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: test_mm512_mask_blend_epi32:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ret{{[l|q]}}
entry:
  %0 = shufflevector <16 x i32> %A, <16 x i32> %W, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
  ret <16 x i32> %0
}

; 512-bit qword blend - even lanes from %W, odd lanes from %A (mask -86, 0xAA).
define <8 x i64> @test_mm512_mask_blend_epi64(<8 x i64> %A, <8 x i64> %W){
; AVX512-LABEL: test_mm512_mask_blend_epi64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: movb $-86, %al
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: test_mm512_mask_blend_epi64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: movb $-86, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ret{{[l|q]}}
entry:
  %0 = shufflevector <8 x i64> %A, <8 x i64> %W, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
  ret <8 x i64> %0
}

; 512-bit float blend - even lanes from %W, odd lanes from %A (mask 0xAAAA).
define <16 x float> @test_mm512_mask_blend_ps(<16 x float> %A, <16 x float> %W){
; AVX512-LABEL: test_mm512_mask_blend_ps:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: test_mm512_mask_blend_ps:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ret{{[l|q]}}
entry:
  %0 = shufflevector <16 x float> %A, <16 x float> %W, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
  ret <16 x float> %0
}

; 512-bit double blend - unlike the tests above the pattern is not strictly
; alternating: lanes 3, 5 and 7 come from %A and lanes 0, 1, 2, 4 and 6 from
; %W, giving mask -88 (0xA8).
define <8 x double> @test_mm512_mask_blend_pd(<8 x double> %A, <8 x double> %W){
; AVX512-LABEL: test_mm512_mask_blend_pd:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: movb $-88, %al
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: ret{{[l|q]}}
;
; AVX512F-LABEL: test_mm512_mask_blend_pd:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: movb $-88, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ret{{[l|q]}}
entry:
  %0 = shufflevector <8 x double> %A, <8 x double> %W, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
  ret <8 x double> %0
}
440 441 442define <32 x i8> @test_mm256_mask_blend_epi8(<32 x i8> %A, <32 x i8> %W){ 443; AVX512-LABEL: test_mm256_mask_blend_epi8: 444; AVX512: # %bb.0: # %entry 445; AVX512-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA 446; AVX512-NEXT: kmovd %eax, %k1 447; AVX512-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} 448; AVX512-NEXT: ret{{[l|q]}} 449; 450; AVX512F-LABEL: test_mm256_mask_blend_epi8: 451; AVX512F: # %bb.0: # %entry 452; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 453; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 454; AVX512F-NEXT: ret{{[l|q]}} 455entry: 456 %0 = shufflevector <32 x i8> %A, <32 x i8> %W, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31> 457 ret <32 x i8> %0 458} 459 460define <16 x i8> @test_mm_mask_blend_epi8(<16 x i8> %A, <16 x i8> %W){ 461; AVX512-LABEL: test_mm_mask_blend_epi8: 462; AVX512: # %bb.0: # %entry 463; AVX512-NEXT: movw $-21846, %ax # imm = 0xAAAA 464; AVX512-NEXT: kmovd %eax, %k1 465; AVX512-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1} 466; AVX512-NEXT: ret{{[l|q]}} 467; 468; AVX512F-LABEL: test_mm_mask_blend_epi8: 469; AVX512F: # %bb.0: # %entry 470; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 471; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 472; AVX512F-NEXT: ret{{[l|q]}} 473entry: 474 %0 = shufflevector <16 x i8> %A, <16 x i8> %W, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15> 475 ret <16 x i8> %0 476} 477 478; PR34370 479define <8 x float> @test_masked_permps_v8f32(ptr %vp, <8 x float> %vec2) { 480; X86-AVX512-LABEL: test_masked_permps_v8f32: 481; X86-AVX512: # %bb.0: 482; 
X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 483; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,14,11,3,15,14,6,7] 484; X86-AVX512-NEXT: vpermt2ps (%eax), %ymm1, %ymm0 485; X86-AVX512-NEXT: retl 486; 487; X64-AVX512-LABEL: test_masked_permps_v8f32: 488; X64-AVX512: # %bb.0: 489; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,14,11,3,15,14,6,7] 490; X64-AVX512-NEXT: vpermt2ps (%rdi), %ymm1, %ymm0 491; X64-AVX512-NEXT: retq 492; 493; X86-AVX512F-LABEL: test_masked_permps_v8f32: 494; X86-AVX512F: # %bb.0: 495; X86-AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 496; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax 497; X86-AVX512F-NEXT: vmovaps (%eax), %ymm1 498; X86-AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7] 499; X86-AVX512F-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 500; X86-AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 501; X86-AVX512F-NEXT: retl 502; 503; X64-AVX512F-LABEL: test_masked_permps_v8f32: 504; X64-AVX512F: # %bb.0: 505; X64-AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 506; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm1 507; X64-AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm2 = [23,22,19,3,23,22,6,7] 508; X64-AVX512F-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 509; X64-AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 510; X64-AVX512F-NEXT: retq 511 %vec = load <8 x float>, ptr %vp 512 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 3, i32 0, i32 7, i32 6, i32 3, i32 0> 513 %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec2 514 ret <8 x float> %res 515} 516 517define <16 x float> @test_masked_permps_v16f32(ptr %vp, <16 x float> %vec2) { 518; X86-AVX512-LABEL: test_masked_permps_v16f32: 519; X86-AVX512: # %bb.0: 520; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 521; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] 522; X86-AVX512-NEXT: vpermt2ps (%eax), %zmm1, %zmm0 523; X86-AVX512-NEXT: retl 
524; 525; X64-AVX512-LABEL: test_masked_permps_v16f32: 526; X64-AVX512: # %bb.0: 527; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] 528; X64-AVX512-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0 529; X64-AVX512-NEXT: retq 530; 531; X86-AVX512F-LABEL: test_masked_permps_v16f32: 532; X86-AVX512F: # %bb.0: 533; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax 534; X86-AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] 535; X86-AVX512F-NEXT: vpermt2ps (%eax), %zmm1, %zmm0 536; X86-AVX512F-NEXT: retl 537; 538; X64-AVX512F-LABEL: test_masked_permps_v16f32: 539; X64-AVX512F: # %bb.0: 540; X64-AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm1 = [31,29,27,3,30,28,6,7,23,22,19,11,23,13,19,15] 541; X64-AVX512F-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0 542; X64-AVX512F-NEXT: retq 543 %vec = load <16 x float>, ptr %vp 544 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 14, i32 12, i32 10, i32 8, i32 7, i32 6, i32 3, i32 0, i32 7, i32 6, i32 3, i32 0> 545 %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec2 546 ret <16 x float> %res 547} 548 549define void @test_demandedelts_pshufb_v32i8_v16i8(ptr %src, ptr %dst) { 550; X86-AVX512-SLOW-LABEL: test_demandedelts_pshufb_v32i8_v16i8: 551; X86-AVX512-SLOW: # %bb.0: 552; X86-AVX512-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax 553; X86-AVX512-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx 554; X86-AVX512-SLOW-NEXT: vpbroadcastd 44(%ecx), %xmm0 555; X86-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 556; X86-AVX512-SLOW-NEXT: vmovdqa %ymm0, 672(%eax) 557; X86-AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3] 558; X86-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 559; X86-AVX512-SLOW-NEXT: vmovdqa %ymm0, 832(%eax) 560; X86-AVX512-SLOW-NEXT: vzeroupper 561; X86-AVX512-SLOW-NEXT: retl 562; 563; X64-AVX512-SLOW-LABEL: 
test_demandedelts_pshufb_v32i8_v16i8: 564; X64-AVX512-SLOW: # %bb.0: 565; X64-AVX512-SLOW-NEXT: vpbroadcastd 44(%rdi), %xmm0 566; X64-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 567; X64-AVX512-SLOW-NEXT: vmovdqa %ymm0, 672(%rsi) 568; X64-AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3] 569; X64-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 570; X64-AVX512-SLOW-NEXT: vmovdqa %ymm0, 832(%rsi) 571; X64-AVX512-SLOW-NEXT: vzeroupper 572; X64-AVX512-SLOW-NEXT: retq 573; 574; X86-AVX512-FAST-LABEL: test_demandedelts_pshufb_v32i8_v16i8: 575; X86-AVX512-FAST: # %bb.0: 576; X86-AVX512-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax 577; X86-AVX512-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx 578; X86-AVX512-FAST-NEXT: vpbroadcastd 44(%ecx), %xmm0 579; X86-AVX512-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 580; X86-AVX512-FAST-NEXT: vmovdqa %ymm0, 672(%eax) 581; X86-AVX512-FAST-NEXT: vmovdqa 208(%ecx), %xmm0 582; X86-AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero 583; X86-AVX512-FAST-NEXT: vmovdqa %ymm0, 832(%eax) 584; X86-AVX512-FAST-NEXT: vzeroupper 585; X86-AVX512-FAST-NEXT: retl 586; 587; X64-AVX512-FAST-LABEL: test_demandedelts_pshufb_v32i8_v16i8: 588; X64-AVX512-FAST: # %bb.0: 589; X64-AVX512-FAST-NEXT: vpbroadcastd 44(%rdi), %xmm0 590; X64-AVX512-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 591; X64-AVX512-FAST-NEXT: vmovdqa %ymm0, 672(%rsi) 592; X64-AVX512-FAST-NEXT: vmovdqa 208(%rdi), %xmm0 593; X64-AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero 594; X64-AVX512-FAST-NEXT: vmovdqa %ymm0, 832(%rsi) 595; X64-AVX512-FAST-NEXT: vzeroupper 596; X64-AVX512-FAST-NEXT: retq 597; 598; X86-AVX512F-LABEL: test_demandedelts_pshufb_v32i8_v16i8: 599; X86-AVX512F: # %bb.0: 600; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax 601; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx 602; X86-AVX512F-NEXT: vpbroadcastd 44(%ecx), %xmm0 603; X86-AVX512F-NEXT: vmovq 
{{.*#+}} xmm0 = xmm0[0],zero 604; X86-AVX512F-NEXT: vmovdqa %ymm0, 672(%eax) 605; X86-AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3] 606; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 607; X86-AVX512F-NEXT: vmovdqa %ymm0, 832(%eax) 608; X86-AVX512F-NEXT: vzeroupper 609; X86-AVX512F-NEXT: retl 610; 611; X64-AVX512F-LABEL: test_demandedelts_pshufb_v32i8_v16i8: 612; X64-AVX512F: # %bb.0: 613; X64-AVX512F-NEXT: vpbroadcastd 44(%rdi), %xmm0 614; X64-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 615; X64-AVX512F-NEXT: vmovdqa %ymm0, 672(%rsi) 616; X64-AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3] 617; X64-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 618; X64-AVX512F-NEXT: vmovdqa %ymm0, 832(%rsi) 619; X64-AVX512F-NEXT: vzeroupper 620; X64-AVX512F-NEXT: retq 621 %t87 = load <16 x i32>, ptr %src, align 64 622 %t88 = extractelement <16 x i32> %t87, i64 11 623 %t89 = insertelement <8 x i32> <i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %t88, i64 0 624 %t90 = insertelement <8 x i32> %t89, i32 %t88, i64 1 625 %ptridx49.i = getelementptr inbounds <8 x i32>, ptr %dst, i64 21 626 store <8 x i32> %t90, ptr %ptridx49.i, align 32 627 %ptridx56.i = getelementptr inbounds <2 x i32>, ptr %src, i64 24 628 %t09 = load <16 x i32>, ptr %ptridx56.i, align 64 629 %t10 = extractelement <16 x i32> %t09, i64 5 630 %t11 = insertelement <8 x i32> <i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %t10, i64 0 631 %t12 = extractelement <16 x i32> %t09, i64 4 632 %t13 = insertelement <8 x i32> %t11, i32 %t12, i64 1 633 %ptridx64.i = getelementptr inbounds <8 x i32>, ptr %dst, i64 26 634 store <8 x i32> %t13, ptr %ptridx64.i, align 32 635 ret void 636} 637 638define <32 x float> @PR47534(<8 x float> %tmp) { 639; CHECK-LABEL: PR47534: 640; CHECK: # %bb.0: 641; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 642; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 643; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = 
[7,25,26,27,7,29,30,31,7,25,26,27,7,29,30,31] 644; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 645; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 646; CHECK-NEXT: ret{{[l|q]}} 647 %tmp1 = shufflevector <8 x float> %tmp, <8 x float> undef, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 648 %tmp2 = shufflevector <32 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <32 x float> undef, <32 x i32> <i32 39, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 29, i32 30, i32 31> 649 %tmp18 = shufflevector <32 x float> %tmp2, <32 x float> %tmp1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 39, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 29, i32 30, i32 31> 650 ret <32 x float> %tmp18 651} 652 653%union1= type { <16 x float> } 654@src1 = external dso_local local_unnamed_addr global %union1, align 64 655 656define void @PR43170(ptr %a0) { 657; 
X86-AVX512-LABEL: PR43170: 658; X86-AVX512: # %bb.0: # %entry 659; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 660; X86-AVX512-NEXT: vmovaps src1, %ymm0 661; X86-AVX512-NEXT: vmovaps %zmm0, (%eax) 662; X86-AVX512-NEXT: vzeroupper 663; X86-AVX512-NEXT: retl 664; 665; X64-AVX512-LABEL: PR43170: 666; X64-AVX512: # %bb.0: # %entry 667; X64-AVX512-NEXT: vmovaps src1(%rip), %ymm0 668; X64-AVX512-NEXT: vmovaps %zmm0, (%rdi) 669; X64-AVX512-NEXT: vzeroupper 670; X64-AVX512-NEXT: retq 671; 672; X86-AVX512F-LABEL: PR43170: 673; X86-AVX512F: # %bb.0: # %entry 674; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax 675; X86-AVX512F-NEXT: vmovaps src1, %ymm0 676; X86-AVX512F-NEXT: vmovaps %zmm0, (%eax) 677; X86-AVX512F-NEXT: vzeroupper 678; X86-AVX512F-NEXT: retl 679; 680; X64-AVX512F-LABEL: PR43170: 681; X64-AVX512F: # %bb.0: # %entry 682; X64-AVX512F-NEXT: vmovaps src1(%rip), %ymm0 683; X64-AVX512F-NEXT: vmovaps %zmm0, (%rdi) 684; X64-AVX512F-NEXT: vzeroupper 685; X64-AVX512F-NEXT: retq 686entry: 687 %0 = load <8 x float>, ptr @src1, align 64 688 %1 = shufflevector <8 x float> %0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 689 store <16 x float> %1, ptr %a0, align 64 690 ret void 691} 692