1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42 4; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 5; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F 6; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL 7 8define <4 x float> @fadd_v4f32(<4 x i1> %b, <4 x float> noundef %x, <4 x float> noundef %y) { 9; SSE2-LABEL: fadd_v4f32: 10; SSE2: # %bb.0: 11; SSE2-NEXT: pslld $31, %xmm0 12; SSE2-NEXT: psrad $31, %xmm0 13; SSE2-NEXT: pand %xmm0, %xmm2 14; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 15; SSE2-NEXT: por %xmm2, %xmm0 16; SSE2-NEXT: addps %xmm1, %xmm0 17; SSE2-NEXT: retq 18; 19; SSE42-LABEL: fadd_v4f32: 20; SSE42: # %bb.0: 21; SSE42-NEXT: pslld $31, %xmm0 22; SSE42-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 23; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm3 24; SSE42-NEXT: addps %xmm1, %xmm3 25; SSE42-NEXT: movaps %xmm3, %xmm0 26; SSE42-NEXT: retq 27; 28; AVX2-LABEL: fadd_v4f32: 29; AVX2: # %bb.0: 30; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 31; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 32; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 33; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 34; AVX2-NEXT: retq 35; 36; AVX512F-LABEL: fadd_v4f32: 37; AVX512F: # %bb.0: 38; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 39; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 40; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 41; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 42; AVX512F-NEXT: vmovaps %zmm2, %zmm0 {%k1} 43; AVX512F-NEXT: vaddps %xmm0, %xmm1, %xmm0 44; AVX512F-NEXT: vzeroupper 45; AVX512F-NEXT: retq 46; 47; AVX512VL-LABEL: fadd_v4f32: 48; AVX512VL: # %bb.0: 49; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 50; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 51; AVX512VL-NEXT: vaddps %xmm2, %xmm1, %xmm1 {%k1} 52; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 53; AVX512VL-NEXT: retq 54 %s = select <4 x i1> %b, <4 x float> %y, <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0> 55 %r = fadd <4 x float> %x, %s 56 ret <4 x float> %r 57} 58 59define <8 x float> @fadd_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x float> noundef %y) { 60; SSE2-LABEL: fadd_v8f32_commute: 61; SSE2: # %bb.0: 62; SSE2-NEXT: movdqa %xmm0, %xmm5 63; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 64; SSE2-NEXT: pslld $31, %xmm5 65; SSE2-NEXT: psrad $31, %xmm5 66; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 67; SSE2-NEXT: pand %xmm5, %xmm4 68; SSE2-NEXT: pandn %xmm6, %xmm5 69; SSE2-NEXT: por %xmm4, %xmm5 70; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 71; SSE2-NEXT: pslld $31, %xmm0 72; SSE2-NEXT: psrad $31, %xmm0 73; SSE2-NEXT: pand %xmm0, %xmm3 74; SSE2-NEXT: pandn %xmm6, %xmm0 75; SSE2-NEXT: por %xmm3, %xmm0 76; SSE2-NEXT: addps %xmm1, %xmm0 77; SSE2-NEXT: addps %xmm2, %xmm5 78; SSE2-NEXT: movaps %xmm5, %xmm1 79; SSE2-NEXT: retq 80; 81; SSE42-LABEL: fadd_v8f32_commute: 82; SSE42: # %bb.0: 83; SSE42-NEXT: movdqa %xmm0, %xmm5 84; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 85; SSE42-NEXT: pslld $31, %xmm0 86; SSE42-NEXT: movaps {{.*#+}} xmm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 87; SSE42-NEXT: movaps %xmm6, %xmm7 88; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7 89; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 90; SSE42-NEXT: pslld $31, %xmm5 91; SSE42-NEXT: movdqa %xmm5, %xmm0 92; SSE42-NEXT: blendvps %xmm0, %xmm4, %xmm6 93; SSE42-NEXT: addps %xmm1, %xmm7 94; SSE42-NEXT: addps %xmm2, %xmm6 95; SSE42-NEXT: movaps %xmm7, %xmm0 96; SSE42-NEXT: movaps %xmm6, %xmm1 97; SSE42-NEXT: retq 98; 99; AVX2-LABEL: fadd_v8f32_commute: 100; AVX2: # %bb.0: 101; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 102; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 103; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 104; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm3, %ymm0 105; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 106; AVX2-NEXT: retq 107; 108; AVX512F-LABEL: fadd_v8f32_commute: 109; AVX512F: # %bb.0: 110; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 111; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 112; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 113; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 114; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 115; AVX512F-NEXT: vmovaps %zmm2, %zmm0 {%k1} 116; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 117; AVX512F-NEXT: retq 118; 119; AVX512VL-LABEL: fadd_v8f32_commute: 120; AVX512VL: # %bb.0: 121; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 122; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 123; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 124; AVX512VL-NEXT: vaddps %ymm2, %ymm1, %ymm1 {%k1} 125; AVX512VL-NEXT: vmovaps %ymm1, %ymm0 126; AVX512VL-NEXT: retq 127 %s = select <8 x i1> %b, <8 x float> %y, <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0> 128 %r = fadd <8 x float> %s, %x 129 ret <8 x float> %r 130} 131 132define <16 x float> @fadd_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { 133; SSE2-LABEL: fadd_v16f32_swap: 134; SSE2: # %bb.0: 135; SSE2-NEXT: movdqa %xmm0, %xmm10 136; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 137; SSE2-NEXT: movdqa %xmm10, %xmm8 138; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] 139; SSE2-NEXT: pslld $31, %xmm8 140; SSE2-NEXT: movdqa %xmm8, %xmm9 141; SSE2-NEXT: psrad $31, %xmm9 142; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 143; SSE2-NEXT: por %xmm8, %xmm9 144; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3] 145; SSE2-NEXT: pslld $31, %xmm10 146; SSE2-NEXT: movdqa %xmm10, %xmm8 147; SSE2-NEXT: psrad $31, %xmm8 148; SSE2-NEXT: pandn %xmm7, %xmm8 149; SSE2-NEXT: por %xmm10, %xmm8 150; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 151; SSE2-NEXT: movdqa %xmm0, %xmm10 152; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] 153; SSE2-NEXT: pslld $31, %xmm10 154; SSE2-NEXT: movdqa %xmm10, %xmm7 155; SSE2-NEXT: psrad $31, %xmm7 156; SSE2-NEXT: pandn %xmm6, %xmm7 157; SSE2-NEXT: por %xmm10, %xmm7 158; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 159; SSE2-NEXT: pslld $31, %xmm0 160; SSE2-NEXT: movdqa %xmm0, %xmm6 161; SSE2-NEXT: psrad $31, %xmm6 162; SSE2-NEXT: pandn %xmm5, %xmm6 163; SSE2-NEXT: por %xmm6, %xmm0 164; SSE2-NEXT: addps %xmm1, %xmm0 165; SSE2-NEXT: addps %xmm2, %xmm7 166; SSE2-NEXT: addps %xmm3, %xmm8 167; SSE2-NEXT: addps %xmm4, %xmm9 168; SSE2-NEXT: movaps %xmm7, %xmm1 169; SSE2-NEXT: movaps %xmm8, %xmm2 170; SSE2-NEXT: movaps %xmm9, %xmm3 171; SSE2-NEXT: retq 172; 173; SSE42-LABEL: fadd_v16f32_swap: 174; SSE42: # %bb.0: 175; SSE42-NEXT: movaps %xmm3, %xmm8 176; SSE42-NEXT: movdqa %xmm0, %xmm9 177; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 178; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 179; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 180; SSE42-NEXT: pslld $31, %xmm0 181; SSE42-NEXT: movaps {{.*#+}} xmm10 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 182; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm3 183; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] 184; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 185; SSE42-NEXT: pslld $31, %xmm0 186; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7 187; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] 188; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 189; SSE42-NEXT: pslld $31, %xmm0 190; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm6 191; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero 192; SSE42-NEXT: pslld $31, %xmm0 193; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5 194; SSE42-NEXT: addps %xmm1, %xmm5 195; SSE42-NEXT: addps %xmm2, %xmm6 196; SSE42-NEXT: addps %xmm8, %xmm7 197; SSE42-NEXT: addps %xmm4, %xmm3 198; SSE42-NEXT: movaps %xmm5, %xmm0 199; SSE42-NEXT: movaps %xmm6, %xmm1 200; SSE42-NEXT: movaps %xmm7, %xmm2 201; SSE42-NEXT: retq 202; 203; AVX2-LABEL: fadd_v16f32_swap: 204; AVX2: # %bb.0: 205; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 206; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 207; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 208; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 209; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 210; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 211; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 212; AVX2-NEXT: vblendvps %ymm0, %ymm6, %ymm4, %ymm4 213; AVX2-NEXT: vaddps %ymm3, %ymm1, %ymm0 214; AVX2-NEXT: vaddps %ymm4, %ymm2, %ymm1 215; AVX2-NEXT: retq 216; 217; AVX512-LABEL: fadd_v16f32_swap: 218; AVX512: # %bb.0: 219; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 220; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 221; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 222; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm0 223; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1} 224; AVX512-NEXT: retq 225 %s = select <16 x i1> %b, <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, <16 x float> %y 226 %r = fadd <16 x float> %x, %s 227 ret <16 x float> %r 228} 229 230define <16 x float> @fadd_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { 231; SSE2-LABEL: fadd_v16f32_commute_swap: 232; SSE2: # %bb.0: 233; SSE2-NEXT: movdqa %xmm0, %xmm10 234; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 235; SSE2-NEXT: movdqa %xmm10, %xmm8 236; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] 237; SSE2-NEXT: pslld $31, %xmm8 238; SSE2-NEXT: movdqa %xmm8, %xmm9 239; SSE2-NEXT: psrad $31, %xmm9 240; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 241; SSE2-NEXT: por %xmm8, %xmm9 242; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3] 243; SSE2-NEXT: pslld $31, %xmm10 244; SSE2-NEXT: movdqa %xmm10, %xmm8 245; SSE2-NEXT: psrad $31, %xmm8 246; SSE2-NEXT: pandn %xmm7, %xmm8 247; SSE2-NEXT: por %xmm10, %xmm8 248; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 249; SSE2-NEXT: movdqa %xmm0, %xmm10 250; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] 251; SSE2-NEXT: pslld $31, %xmm10 252; SSE2-NEXT: movdqa %xmm10, %xmm7 253; SSE2-NEXT: psrad $31, %xmm7 254; SSE2-NEXT: pandn %xmm6, %xmm7 255; SSE2-NEXT: por %xmm10, %xmm7 256; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 257; SSE2-NEXT: pslld $31, %xmm0 258; SSE2-NEXT: movdqa %xmm0, %xmm6 259; SSE2-NEXT: psrad $31, %xmm6 260; SSE2-NEXT: pandn %xmm5, %xmm6 261; SSE2-NEXT: por %xmm6, %xmm0 262; SSE2-NEXT: addps %xmm1, %xmm0 263; SSE2-NEXT: addps %xmm2, %xmm7 264; SSE2-NEXT: addps %xmm3, %xmm8 265; SSE2-NEXT: addps %xmm4, %xmm9 266; SSE2-NEXT: movaps %xmm7, %xmm1 267; SSE2-NEXT: movaps %xmm8, %xmm2 268; SSE2-NEXT: movaps %xmm9, %xmm3 269; SSE2-NEXT: retq 270; 271; SSE42-LABEL: fadd_v16f32_commute_swap: 272; SSE42: # %bb.0: 273; SSE42-NEXT: movaps %xmm3, %xmm8 274; SSE42-NEXT: movdqa %xmm0, %xmm9 275; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 276; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 277; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 278; SSE42-NEXT: pslld $31, %xmm0 279; SSE42-NEXT: movaps {{.*#+}} xmm10 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 280; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm3 281; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] 282; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 283; SSE42-NEXT: pslld $31, %xmm0 284; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7 285; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] 286; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 287; SSE42-NEXT: pslld $31, %xmm0 288; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm6 289; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero 290; SSE42-NEXT: pslld $31, %xmm0 291; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5 292; SSE42-NEXT: addps %xmm1, %xmm5 293; SSE42-NEXT: addps %xmm2, %xmm6 294; SSE42-NEXT: addps %xmm8, %xmm7 295; SSE42-NEXT: addps %xmm4, %xmm3 296; SSE42-NEXT: movaps %xmm5, %xmm0 297; SSE42-NEXT: movaps %xmm6, %xmm1 298; SSE42-NEXT: movaps %xmm7, %xmm2 299; SSE42-NEXT: retq 300; 301; AVX2-LABEL: fadd_v16f32_commute_swap: 302; AVX2: # %bb.0: 303; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 304; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 305; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 306; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 307; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 308; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 309; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 310; AVX2-NEXT: vblendvps %ymm0, %ymm6, %ymm4, %ymm4 311; AVX2-NEXT: vaddps %ymm1, %ymm3, %ymm0 312; AVX2-NEXT: vaddps %ymm2, %ymm4, %ymm1 313; AVX2-NEXT: retq 314; 315; AVX512-LABEL: fadd_v16f32_commute_swap: 316; AVX512: # %bb.0: 317; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 318; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 319; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 320; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm0 321; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1} 322; AVX512-NEXT: retq 323 %s = select <16 x i1> %b, <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, <16 x float> %y 324 %r = fadd <16 x float> %s, %x 325 ret <16 x float> %r 326} 327 328define <4 x float> @fsub_v4f32(<4 x i1> %b, <4 x float> noundef %x, <4 x float> noundef %y) { 329; SSE-LABEL: fsub_v4f32: 330; SSE: # %bb.0: 331; SSE-NEXT: pslld $31, %xmm0 332; SSE-NEXT: psrad $31, %xmm0 333; SSE-NEXT: pand %xmm2, %xmm0 334; SSE-NEXT: subps %xmm0, %xmm1 335; SSE-NEXT: movaps %xmm1, %xmm0 336; SSE-NEXT: retq 337; 338; AVX2-LABEL: fsub_v4f32: 339; AVX2: # %bb.0: 340; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 341; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 342; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 343; AVX2-NEXT: vsubps %xmm0, %xmm1, %xmm0 344; AVX2-NEXT: retq 345; 346; AVX512F-LABEL: fsub_v4f32: 347; AVX512F: # %bb.0: 348; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 349; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 350; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 351; AVX512F-NEXT: vmovaps %zmm2, %zmm0 {%k1} {z} 352; AVX512F-NEXT: vsubps %xmm0, %xmm1, %xmm0 353; AVX512F-NEXT: vzeroupper 354; AVX512F-NEXT: retq 355; 356; AVX512VL-LABEL: fsub_v4f32: 357; AVX512VL: # %bb.0: 358; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 359; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 360; AVX512VL-NEXT: vsubps %xmm2, %xmm1, %xmm1 {%k1} 361; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 362; AVX512VL-NEXT: retq 363 %s = select <4 x i1> %b, <4 x float> %y, <4 x float> zeroinitializer 364 %r = fsub <4 x float> %x, %s 365 ret <4 x float> %r 366} 367 368; negative test - fsub is not commutative; there is no identity constant for operand 0 369 370define <8 x float> @fsub_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x float> noundef %y) { 371; SSE2-LABEL: fsub_v8f32_commute: 372; SSE2: # %bb.0: 373; SSE2-NEXT: movdqa %xmm0, %xmm5 374; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 375; SSE2-NEXT: pslld $31, %xmm5 376; SSE2-NEXT: psrad $31, %xmm5 377; SSE2-NEXT: pand %xmm4, %xmm5 378; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 379; SSE2-NEXT: pslld $31, %xmm0 380; SSE2-NEXT: psrad $31, %xmm0 381; SSE2-NEXT: pand %xmm3, %xmm0 382; SSE2-NEXT: subps %xmm1, %xmm0 383; SSE2-NEXT: subps %xmm2, %xmm5 384; SSE2-NEXT: movaps %xmm5, %xmm1 385; SSE2-NEXT: retq 386; 387; SSE42-LABEL: fsub_v8f32_commute: 388; SSE42: # %bb.0: 389; SSE42-NEXT: movdqa %xmm0, %xmm5 390; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 391; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 392; SSE42-NEXT: pslld $31, %xmm5 393; SSE42-NEXT: psrad $31, %xmm5 394; SSE42-NEXT: pand %xmm4, %xmm5 395; SSE42-NEXT: pslld $31, %xmm0 396; SSE42-NEXT: psrad $31, %xmm0 397; SSE42-NEXT: pand %xmm3, %xmm0 398; SSE42-NEXT: subps %xmm1, %xmm0 399; SSE42-NEXT: subps %xmm2, %xmm5 400; SSE42-NEXT: movaps %xmm5, %xmm1 401; SSE42-NEXT: retq 402; 403; AVX2-LABEL: fsub_v8f32_commute: 404; AVX2: # %bb.0: 405; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 406; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 407; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 408; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 409; AVX2-NEXT: vsubps %ymm1, %ymm0, %ymm0 410; AVX2-NEXT: retq 411; 412; AVX512F-LABEL: fsub_v8f32_commute: 413; AVX512F: # %bb.0: 414; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 415; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 416; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 417; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 418; AVX512F-NEXT: vmovaps %zmm2, %zmm0 {%k1} {z} 419; AVX512F-NEXT: vsubps %ymm1, %ymm0, %ymm0 420; AVX512F-NEXT: retq 421; 422; AVX512VL-LABEL: fsub_v8f32_commute: 423; AVX512VL: # %bb.0: 424; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 425; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 426; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 427; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 {%k1} {z} 428; AVX512VL-NEXT: vsubps %ymm1, %ymm0, %ymm0 429; AVX512VL-NEXT: retq 430 %s = select <8 x i1> %b, <8 x float> %y, <8 x float> zeroinitializer 431 %r = fsub <8 x float> %s, %x 432 ret <8 x float> %r 433} 434 435define <16 x float> @fsub_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { 436; SSE2-LABEL: fsub_v16f32_swap: 437; SSE2: # %bb.0: 438; SSE2-NEXT: movdqa %xmm0, %xmm9 439; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 440; SSE2-NEXT: movdqa %xmm9, %xmm8 441; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] 442; SSE2-NEXT: pslld $31, %xmm8 443; SSE2-NEXT: psrad $31, %xmm8 444; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 445; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] 446; SSE2-NEXT: pslld $31, %xmm9 447; SSE2-NEXT: psrad $31, %xmm9 448; SSE2-NEXT: pandn %xmm7, %xmm9 449; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 450; SSE2-NEXT: movdqa %xmm0, %xmm7 451; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] 452; SSE2-NEXT: pslld $31, %xmm7 453; SSE2-NEXT: psrad $31, %xmm7 454; SSE2-NEXT: pandn %xmm6, %xmm7 455; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 456; SSE2-NEXT: pslld $31, %xmm0 457; SSE2-NEXT: psrad $31, %xmm0 458; SSE2-NEXT: pandn %xmm5, %xmm0 459; SSE2-NEXT: subps %xmm0, %xmm1 460; SSE2-NEXT: subps %xmm7, %xmm2 461; SSE2-NEXT: subps %xmm9, %xmm3 462; SSE2-NEXT: subps %xmm8, %xmm4 463; SSE2-NEXT: movaps %xmm1, %xmm0 464; SSE2-NEXT: movaps %xmm2, %xmm1 465; SSE2-NEXT: movaps %xmm3, %xmm2 466; SSE2-NEXT: movaps %xmm4, %xmm3 467; SSE2-NEXT: retq 468; 469; SSE42-LABEL: fsub_v16f32_swap: 470; SSE42: # %bb.0: 471; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] 472; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero 473; SSE42-NEXT: pslld $31, %xmm8 474; SSE42-NEXT: psrad $31, %xmm8 475; SSE42-NEXT: pandn %xmm7, %xmm8 476; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] 477; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero 478; SSE42-NEXT: pslld $31, %xmm7 479; SSE42-NEXT: psrad $31, %xmm7 480; SSE42-NEXT: pandn %xmm6, %xmm7 481; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 482; SSE42-NEXT: pslld $31, %xmm6 483; SSE42-NEXT: psrad $31, %xmm6 484; SSE42-NEXT: pandn %xmm5, %xmm6 485; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 486; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 487; SSE42-NEXT: pslld $31, %xmm0 488; SSE42-NEXT: psrad $31, %xmm0 489; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0 490; SSE42-NEXT: subps %xmm6, %xmm1 491; SSE42-NEXT: subps %xmm7, %xmm2 492; SSE42-NEXT: subps %xmm8, %xmm3 493; SSE42-NEXT: subps %xmm0, %xmm4 494; SSE42-NEXT: movaps %xmm1, %xmm0 495; SSE42-NEXT: movaps %xmm2, %xmm1 496; SSE42-NEXT: movaps %xmm3, %xmm2 497; SSE42-NEXT: movaps %xmm4, %xmm3 498; SSE42-NEXT: retq 499; 500; AVX2-LABEL: fsub_v16f32_swap: 501; AVX2: # %bb.0: 502; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 503; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 504; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 505; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 506; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 507; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 508; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 509; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 510; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 511; AVX2-NEXT: vsubps %ymm0, %ymm1, %ymm0 512; AVX2-NEXT: vsubps %ymm4, %ymm2, %ymm1 513; AVX2-NEXT: retq 514; 515; AVX512-LABEL: fsub_v16f32_swap: 516; AVX512: # %bb.0: 517; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 518; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 519; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 520; AVX512-NEXT: vsubps %zmm2, %zmm1, %zmm0 521; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1} 522; AVX512-NEXT: retq 523 %s = select <16 x i1> %b, <16 x float> zeroinitializer, <16 x float> %y 524 %r = fsub <16 x float> %x, %s 525 ret <16 x float> %r 526} 527 528; negative test - fsub is not commutative; there is no identity constant for operand 0 529 530define <16 x float> @fsub_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { 531; SSE2-LABEL: fsub_v16f32_commute_swap: 532; SSE2: # %bb.0: 533; SSE2-NEXT: movaps %xmm2, %xmm8 534; SSE2-NEXT: movdqa %xmm0, %xmm2 535; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 536; SSE2-NEXT: movdqa %xmm2, %xmm9 537; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] 538; SSE2-NEXT: pslld $31, %xmm9 539; SSE2-NEXT: psrad $31, %xmm9 540; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 541; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 542; SSE2-NEXT: pslld $31, %xmm2 543; SSE2-NEXT: psrad $31, %xmm2 544; SSE2-NEXT: pandn %xmm7, %xmm2 545; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 546; SSE2-NEXT: movdqa %xmm0, %xmm7 547; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] 548; SSE2-NEXT: pslld $31, %xmm7 549; SSE2-NEXT: psrad $31, %xmm7 550; SSE2-NEXT: pandn %xmm6, %xmm7 551; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 552; SSE2-NEXT: pslld $31, %xmm0 553; SSE2-NEXT: psrad $31, %xmm0 554; SSE2-NEXT: pandn %xmm5, %xmm0 555; SSE2-NEXT: subps %xmm1, %xmm0 556; SSE2-NEXT: subps %xmm8, %xmm7 557; SSE2-NEXT: subps %xmm3, %xmm2 558; SSE2-NEXT: subps %xmm4, %xmm9 559; SSE2-NEXT: movaps %xmm7, %xmm1 560; SSE2-NEXT: movaps %xmm9, %xmm3 561; SSE2-NEXT: retq 562; 563; SSE42-LABEL: fsub_v16f32_commute_swap: 564; SSE42: # %bb.0: 565; SSE42-NEXT: movaps %xmm2, %xmm8 566; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 567; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 568; SSE42-NEXT: pslld $31, %xmm2 569; SSE42-NEXT: psrad $31, %xmm2 570; SSE42-NEXT: pandn %xmm7, %xmm2 571; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] 572; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero 573; SSE42-NEXT: pslld $31, %xmm7 574; SSE42-NEXT: psrad $31, %xmm7 575; SSE42-NEXT: pandn %xmm6, %xmm7 576; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 577; SSE42-NEXT: pslld $31, %xmm6 578; SSE42-NEXT: psrad $31, %xmm6 579; SSE42-NEXT: pandn %xmm5, %xmm6 580; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 581; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 582; SSE42-NEXT: pslld $31, %xmm5 583; SSE42-NEXT: psrad $31, %xmm5 584; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 585; SSE42-NEXT: subps %xmm1, %xmm6 586; SSE42-NEXT: subps %xmm8, %xmm7 587; SSE42-NEXT: subps %xmm3, %xmm2 588; SSE42-NEXT: subps %xmm4, %xmm5 589; SSE42-NEXT: movaps %xmm6, %xmm0 590; SSE42-NEXT: movaps %xmm7, %xmm1 591; SSE42-NEXT: movaps %xmm5, %xmm3 592; SSE42-NEXT: retq 593; 594; AVX2-LABEL: fsub_v16f32_commute_swap: 595; AVX2: # %bb.0: 596; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 597; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 598; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 599; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 600; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 601; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 602; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 603; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 604; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 605; AVX2-NEXT: vsubps %ymm1, %ymm0, %ymm0 606; AVX2-NEXT: vsubps %ymm2, %ymm4, %ymm1 607; AVX2-NEXT: retq 608; 609; AVX512-LABEL: fsub_v16f32_commute_swap: 610; AVX512: # %bb.0: 611; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 612; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 613; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 614; AVX512-NEXT: vmovaps %zmm2, %zmm0 {%k1} {z} 615; AVX512-NEXT: vsubps %zmm1, %zmm0, %zmm0 616; AVX512-NEXT: retq 617 %s = select <16 x i1> %b, <16 x float> zeroinitializer, <16 x float> %y 618 %r = fsub <16 x float> %s, %x 619 ret <16 x float> %r 620} 621 622define <4 x float> @fmul_v4f32(<4 x i1> %b, <4 x float> noundef %x, <4 x float> noundef %y) { 623; SSE2-LABEL: fmul_v4f32: 624; SSE2: # %bb.0: 625; SSE2-NEXT: pslld $31, %xmm0 626; SSE2-NEXT: psrad $31, %xmm0 627; SSE2-NEXT: pand %xmm0, %xmm2 628; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 629; SSE2-NEXT: por %xmm2, %xmm0 630; SSE2-NEXT: mulps %xmm1, %xmm0 631; SSE2-NEXT: retq 632; 633; SSE42-LABEL: fmul_v4f32: 634; SSE42: # %bb.0: 635; SSE42-NEXT: pslld $31, %xmm0 636; SSE42-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 637; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm3 638; SSE42-NEXT: mulps %xmm1, %xmm3 639; SSE42-NEXT: movaps %xmm3, %xmm0 640; SSE42-NEXT: retq 641; 642; AVX2-LABEL: fmul_v4f32: 643; AVX2: # %bb.0: 644; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 645; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 646; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 647; AVX2-NEXT: vmulps %xmm0, %xmm1, %xmm0 648; AVX2-NEXT: retq 649; 650; AVX512F-LABEL: fmul_v4f32: 651; AVX512F: # %bb.0: 652; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 653; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 654; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 655; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 656; AVX512F-NEXT: vmovaps %zmm2, %zmm0 {%k1} 657; AVX512F-NEXT: vmulps %xmm0, %xmm1, %xmm0 658; AVX512F-NEXT: vzeroupper 659; AVX512F-NEXT: retq 660; 661; AVX512VL-LABEL: fmul_v4f32: 662; AVX512VL: # %bb.0: 663; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 664; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 665; AVX512VL-NEXT: vmulps %xmm2, %xmm1, %xmm1 {%k1} 666; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 667; AVX512VL-NEXT: retq 668 %s = select <4 x i1> %b, <4 x float> %y, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0> 669 %r = fmul <4 x float> %x, %s 670 ret <4 x float> %r 671} 672 673define <8 x float> @fmul_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x float> noundef %y) { 674; SSE2-LABEL: fmul_v8f32_commute: 675; SSE2: # %bb.0: 676; SSE2-NEXT: movdqa %xmm0, %xmm5 677; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 678; SSE2-NEXT: pslld $31, %xmm5 679; SSE2-NEXT: psrad $31, %xmm5 680; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 681; SSE2-NEXT: pand %xmm5, %xmm4 682; SSE2-NEXT: pandn %xmm6, %xmm5 683; SSE2-NEXT: por %xmm4, %xmm5 684; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 685; SSE2-NEXT: pslld $31, %xmm0 686; SSE2-NEXT: psrad $31, %xmm0 687; SSE2-NEXT: pand %xmm0, %xmm3 688; SSE2-NEXT: pandn %xmm6, %xmm0 689; SSE2-NEXT: por %xmm3, %xmm0 690; SSE2-NEXT: mulps %xmm1, %xmm0 691; SSE2-NEXT: mulps %xmm2, %xmm5 692; SSE2-NEXT: movaps %xmm5, %xmm1 693; SSE2-NEXT: retq 694; 695; SSE42-LABEL: fmul_v8f32_commute: 696; SSE42: # %bb.0: 697; SSE42-NEXT: movdqa %xmm0, %xmm5 698; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 699; SSE42-NEXT: pslld $31, %xmm0 700; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 701; SSE42-NEXT: movaps %xmm6, %xmm7 702; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7 703; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 704; SSE42-NEXT: pslld $31, %xmm5 705; SSE42-NEXT: movdqa %xmm5, %xmm0 706; SSE42-NEXT: blendvps %xmm0, %xmm4, %xmm6 707; SSE42-NEXT: mulps %xmm1, %xmm7 708; SSE42-NEXT: mulps %xmm2, %xmm6 709; SSE42-NEXT: movaps %xmm7, %xmm0 710; SSE42-NEXT: movaps %xmm6, %xmm1 711; SSE42-NEXT: retq 712; 713; AVX2-LABEL: fmul_v8f32_commute: 714; AVX2: # %bb.0: 715; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 716; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 717; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 718; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm3, %ymm0 719; AVX2-NEXT: vmulps %ymm1, %ymm0, %ymm0 720; AVX2-NEXT: retq 721; 722; AVX512F-LABEL: fmul_v8f32_commute: 723; AVX512F: # %bb.0: 724; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 725; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 726; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 727; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 728; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 729; AVX512F-NEXT: vmovaps %zmm2, %zmm0 {%k1} 730; AVX512F-NEXT: vmulps %ymm1, %ymm0, %ymm0 731; AVX512F-NEXT: retq 732; 733; AVX512VL-LABEL: fmul_v8f32_commute: 734; AVX512VL: # %bb.0: 735; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 736; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 737; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 738; AVX512VL-NEXT: vmulps %ymm2, %ymm1, %ymm1 {%k1} 739; AVX512VL-NEXT: vmovaps %ymm1, %ymm0 740; AVX512VL-NEXT: retq 741 %s = select <8 x i1> %b, <8 x float> %y, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0> 742 %r = fmul <8 x float> %s, %x 743 ret <8 x float> %r 744} 745 746define <16 x float> @fmul_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { 747; SSE2-LABEL: fmul_v16f32_swap: 748; SSE2: # %bb.0: 749; SSE2-NEXT: movaps %xmm2, %xmm8 750; SSE2-NEXT: movdqa %xmm0, %xmm2 751; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 752; SSE2-NEXT: movdqa %xmm2, %xmm10 753; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] 754; SSE2-NEXT: pslld $31, %xmm10 755; SSE2-NEXT: psrad $31, %xmm10 756; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 757; SSE2-NEXT: movdqa %xmm11, %xmm9 758; SSE2-NEXT: pand %xmm10, %xmm9 759; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm10 760; SSE2-NEXT: por %xmm9, %xmm10 761; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 762; SSE2-NEXT: pslld $31, %xmm2 763; SSE2-NEXT: psrad $31, %xmm2 764; SSE2-NEXT: movdqa %xmm11, %xmm9 765; SSE2-NEXT: pand %xmm2, %xmm9 766; SSE2-NEXT: pandn %xmm7, %xmm2 767; SSE2-NEXT: por %xmm9, %xmm2 768; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 769; SSE2-NEXT: movdqa %xmm0, %xmm9 770; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] 771; SSE2-NEXT: pslld $31, %xmm9 772; SSE2-NEXT: psrad $31, %xmm9 773; SSE2-NEXT: movdqa %xmm11, %xmm7 774; SSE2-NEXT: pand %xmm9, %xmm7 775; SSE2-NEXT: pandn %xmm6, %xmm9 776; SSE2-NEXT: por %xmm7, %xmm9 777; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 778; SSE2-NEXT: pslld $31, %xmm0 779; SSE2-NEXT: psrad $31, %xmm0 780; SSE2-NEXT: pand %xmm0, %xmm11 781; SSE2-NEXT: pandn %xmm5, %xmm0 782; SSE2-NEXT: por %xmm11, %xmm0 783; SSE2-NEXT: mulps %xmm1, %xmm0 784; SSE2-NEXT: mulps %xmm8, %xmm9 785; SSE2-NEXT: mulps %xmm3, %xmm2 786; SSE2-NEXT: mulps %xmm4, %xmm10 787; SSE2-NEXT: movaps %xmm9, %xmm1 788; SSE2-NEXT: movaps %xmm10, %xmm3 789; SSE2-NEXT: retq 790; 791; SSE42-LABEL: fmul_v16f32_swap: 792; SSE42: # %bb.0: 793; SSE42-NEXT: movaps %xmm3, %xmm8 794; SSE42-NEXT: movdqa %xmm0, %xmm9 795; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 796; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 797; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 798; SSE42-NEXT: pslld $31, %xmm0 799; SSE42-NEXT: movaps {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 800; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm3 801; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] 802; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 803; SSE42-NEXT: pslld $31, %xmm0 804; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7 805; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] 806; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 807; SSE42-NEXT: pslld $31, %xmm0 808; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm6 809; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero 810; SSE42-NEXT: pslld $31, %xmm0 811; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5 812; SSE42-NEXT: mulps %xmm1, %xmm5 813; SSE42-NEXT: mulps %xmm2, %xmm6 814; SSE42-NEXT: mulps %xmm8, %xmm7 815; SSE42-NEXT: mulps %xmm4, %xmm3 816; SSE42-NEXT: movaps %xmm5, %xmm0 817; SSE42-NEXT: movaps %xmm6, %xmm1 818; SSE42-NEXT: movaps %xmm7, %xmm2 819; SSE42-NEXT: retq 820; 821; AVX2-LABEL: fmul_v16f32_swap: 822; AVX2: # %bb.0: 823; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 824; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 825; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 826; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 827; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 828; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 829; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 830; AVX2-NEXT: vblendvps %ymm0, %ymm6, %ymm4, %ymm4 831; AVX2-NEXT: vmulps %ymm3, %ymm1, %ymm0 832; AVX2-NEXT: vmulps %ymm4, %ymm2, %ymm1 833; AVX2-NEXT: retq 834; 835; AVX512-LABEL: fmul_v16f32_swap: 836; AVX512: # %bb.0: 837; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 838; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 839; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 840; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm0 841; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1} 842; AVX512-NEXT: retq 843 %s = select <16 x i1> %b, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, <16 x float> %y 844 %r = fmul <16 x float> %x, %s 845 ret <16 x float> %r 846} 847 848define <16 x float> @fmul_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { 849; SSE2-LABEL: fmul_v16f32_commute_swap: 850; SSE2: # %bb.0: 851; SSE2-NEXT: movaps %xmm2, %xmm8 852; SSE2-NEXT: movdqa %xmm0, %xmm2 853; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 854; SSE2-NEXT: movdqa %xmm2, %xmm10 855; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] 856; SSE2-NEXT: pslld $31, %xmm10 857; SSE2-NEXT: psrad $31, %xmm10 858; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 859; SSE2-NEXT: movdqa %xmm11, %xmm9 860; SSE2-NEXT: pand %xmm10, %xmm9 861; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm10 862; SSE2-NEXT: por %xmm9, %xmm10 863; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 864; SSE2-NEXT: pslld $31, %xmm2 865; SSE2-NEXT: psrad $31, %xmm2 866; SSE2-NEXT: movdqa %xmm11, %xmm9 867; SSE2-NEXT: pand %xmm2, %xmm9 868; SSE2-NEXT: pandn %xmm7, %xmm2 869; SSE2-NEXT: por %xmm9, %xmm2 870; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 871; SSE2-NEXT: movdqa %xmm0, %xmm9 872; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] 873; SSE2-NEXT: pslld $31, %xmm9 874; SSE2-NEXT: psrad $31, %xmm9 875; SSE2-NEXT: movdqa %xmm11, %xmm7 876; SSE2-NEXT: pand %xmm9, %xmm7 877; SSE2-NEXT: pandn %xmm6, %xmm9 878; SSE2-NEXT: por %xmm7, %xmm9 879; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 880; SSE2-NEXT: pslld $31, %xmm0 881; SSE2-NEXT: psrad $31, %xmm0 882; SSE2-NEXT: pand %xmm0, %xmm11 883; SSE2-NEXT: pandn %xmm5, %xmm0 884; SSE2-NEXT: por %xmm11, %xmm0 885; SSE2-NEXT: mulps %xmm1, %xmm0 886; SSE2-NEXT: mulps %xmm8, %xmm9 887; SSE2-NEXT: mulps %xmm3, %xmm2 888; SSE2-NEXT: mulps %xmm4, %xmm10 889; SSE2-NEXT: movaps %xmm9, %xmm1 890; SSE2-NEXT: movaps %xmm10, %xmm3 891; SSE2-NEXT: retq 892; 893; SSE42-LABEL: fmul_v16f32_commute_swap: 894; SSE42: # %bb.0: 895; SSE42-NEXT: movaps %xmm3, %xmm8 896; SSE42-NEXT: movdqa %xmm0, %xmm9 897; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 898; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 899; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 900; SSE42-NEXT: pslld $31, %xmm0 901; SSE42-NEXT: movaps {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 902; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm3 903; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] 904; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 905; SSE42-NEXT: pslld $31, %xmm0 906; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7 907; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] 908; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 909; SSE42-NEXT: pslld $31, %xmm0 910; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm6 911; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero 912; SSE42-NEXT: pslld $31, %xmm0 913; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5 914; SSE42-NEXT: mulps %xmm1, %xmm5 915; SSE42-NEXT: mulps %xmm2, %xmm6 916; SSE42-NEXT: mulps %xmm8, %xmm7 917; SSE42-NEXT: mulps %xmm4, %xmm3 918; SSE42-NEXT: movaps %xmm5, %xmm0 919; SSE42-NEXT: movaps %xmm6, %xmm1 920; SSE42-NEXT: movaps %xmm7, %xmm2 921; SSE42-NEXT: retq 922; 923; AVX2-LABEL: fmul_v16f32_commute_swap: 924; AVX2: # %bb.0: 925; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 926; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 927; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 928; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 929; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 930; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 931; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 932; AVX2-NEXT: vblendvps %ymm0, %ymm6, %ymm4, %ymm4 933; AVX2-NEXT: vmulps %ymm1, %ymm3, %ymm0 934; AVX2-NEXT: vmulps %ymm2, %ymm4, %ymm1 935; AVX2-NEXT: retq 936; 937; AVX512-LABEL: fmul_v16f32_commute_swap: 938; AVX512: # %bb.0: 939; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 940; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 941; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 942; AVX512-NEXT: vmulps %zmm2, %zmm1, %zmm0 943; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1} 944; AVX512-NEXT: retq 945 %s = select <16 x i1> %b, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, <16 x float> %y 946 %r = fmul <16 x float> %s, %x 947 ret <16 x float> %r 948} 949 950define <4 x float> @fdiv_v4f32(<4 x i1> %b, <4 x float> noundef %x, <4 x float> noundef %y) { 951; SSE2-LABEL: fdiv_v4f32: 952; SSE2: # %bb.0: 953; SSE2-NEXT: pslld $31, %xmm0 954; SSE2-NEXT: psrad $31, %xmm0 955; SSE2-NEXT: pand %xmm0, %xmm2 956; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 957; SSE2-NEXT: por %xmm2, %xmm0 958; SSE2-NEXT: divps %xmm0, %xmm1 959; SSE2-NEXT: movaps %xmm1, %xmm0 960; SSE2-NEXT: retq 961; 962; SSE42-LABEL: fdiv_v4f32: 963; SSE42: # %bb.0: 964; SSE42-NEXT: pslld $31, %xmm0 965; SSE42-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 966; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm3 967; SSE42-NEXT: divps %xmm3, %xmm1 968; SSE42-NEXT: movaps %xmm1, %xmm0 969; SSE42-NEXT: retq 970; 971; AVX2-LABEL: fdiv_v4f32: 972; AVX2: # %bb.0: 973; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 974; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 975; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 976; AVX2-NEXT: vdivps %xmm0, %xmm1, %xmm0 977; AVX2-NEXT: retq 978; 979; AVX512F-LABEL: fdiv_v4f32: 980; AVX512F: # %bb.0: 981; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 982; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 983; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 984; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 985; AVX512F-NEXT: vmovaps %zmm2, %zmm0 {%k1} 986; AVX512F-NEXT: vdivps %xmm0, %xmm1, %xmm0 987; AVX512F-NEXT: vzeroupper 988; AVX512F-NEXT: retq 989; 990; AVX512VL-LABEL: fdiv_v4f32: 991; AVX512VL: # %bb.0: 992; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 993; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 994; AVX512VL-NEXT: vdivps %xmm2, %xmm1, %xmm1 {%k1} 995; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 996; AVX512VL-NEXT: retq 997 %s = select <4 x i1> %b, <4 x float> %y, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0> 998 %r = fdiv <4 x float> %x, %s 999 ret <4 x float> %r 1000} 1001 1002define <8 x float> @fdiv_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x float> noundef %y) { 1003; SSE2-LABEL: fdiv_v8f32_commute: 1004; SSE2: # %bb.0: 1005; SSE2-NEXT: movdqa %xmm0, %xmm5 1006; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 1007; SSE2-NEXT: pslld $31, %xmm5 1008; SSE2-NEXT: psrad $31, %xmm5 1009; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1010; SSE2-NEXT: pand %xmm5, %xmm4 1011; SSE2-NEXT: pandn %xmm6, %xmm5 1012; SSE2-NEXT: por %xmm4, %xmm5 1013; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1014; SSE2-NEXT: pslld $31, %xmm0 1015; SSE2-NEXT: psrad $31, %xmm0 1016; SSE2-NEXT: pand %xmm0, %xmm3 1017; SSE2-NEXT: pandn %xmm6, %xmm0 1018; SSE2-NEXT: por %xmm3, %xmm0 1019; SSE2-NEXT: divps %xmm1, %xmm0 1020; SSE2-NEXT: divps %xmm2, %xmm5 1021; SSE2-NEXT: movaps %xmm5, %xmm1 1022; SSE2-NEXT: retq 1023; 1024; SSE42-LABEL: fdiv_v8f32_commute: 1025; SSE42: # %bb.0: 1026; SSE42-NEXT: movdqa %xmm0, %xmm5 1027; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 1028; SSE42-NEXT: pslld $31, %xmm0 1029; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1030; SSE42-NEXT: movaps %xmm6, %xmm7 1031; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7 1032; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 1033; SSE42-NEXT: pslld $31, %xmm5 1034; SSE42-NEXT: movdqa %xmm5, %xmm0 1035; SSE42-NEXT: blendvps %xmm0, %xmm4, %xmm6 1036; SSE42-NEXT: divps %xmm1, %xmm7 1037; SSE42-NEXT: divps %xmm2, %xmm6 1038; SSE42-NEXT: movaps %xmm7, %xmm0 1039; SSE42-NEXT: movaps %xmm6, %xmm1 1040; SSE42-NEXT: retq 1041; 1042; AVX2-LABEL: fdiv_v8f32_commute: 1043; AVX2: # %bb.0: 1044; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1045; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 1046; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1047; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm3, %ymm0 1048; AVX2-NEXT: vdivps %ymm1, %ymm0, %ymm0 1049; AVX2-NEXT: retq 1050; 1051; AVX512F-LABEL: fdiv_v8f32_commute: 1052; AVX512F: # %bb.0: 1053; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 1054; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 1055; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 1056; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 1057; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1058; AVX512F-NEXT: vmovaps %zmm2, %zmm0 {%k1} 1059; AVX512F-NEXT: vdivps %ymm1, %ymm0, %ymm0 1060; AVX512F-NEXT: retq 1061; 1062; AVX512VL-LABEL: fdiv_v8f32_commute: 1063; AVX512VL: # %bb.0: 1064; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 1065; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 1066; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 1067; AVX512VL-NEXT: vbroadcastss {{.*#+}} ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1068; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 {%k1} 1069; AVX512VL-NEXT: vdivps %ymm1, %ymm0, %ymm0 1070; AVX512VL-NEXT: retq 1071 %s = select <8 x i1> %b, <8 x float> %y, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0> 1072 %r = fdiv <8 x float> %s, %x 1073 ret <8 x float> %r 1074} 1075 1076define <16 x float> @fdiv_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { 1077; SSE2-LABEL: fdiv_v16f32_swap: 1078; SSE2: # %bb.0: 1079; SSE2-NEXT: movdqa %xmm0, %xmm9 1080; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1081; SSE2-NEXT: movdqa %xmm9, %xmm8 1082; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] 1083; SSE2-NEXT: pslld $31, %xmm8 1084; SSE2-NEXT: psrad $31, %xmm8 1085; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1086; SSE2-NEXT: movdqa %xmm10, %xmm11 1087; SSE2-NEXT: pand %xmm8, %xmm11 1088; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 1089; SSE2-NEXT: por %xmm11, %xmm8 1090; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] 1091; SSE2-NEXT: pslld $31, %xmm9 1092; SSE2-NEXT: psrad $31, %xmm9 1093; SSE2-NEXT: movdqa %xmm10, %xmm11 1094; SSE2-NEXT: pand %xmm9, %xmm11 1095; SSE2-NEXT: pandn %xmm7, %xmm9 1096; SSE2-NEXT: por %xmm11, %xmm9 1097; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1098; SSE2-NEXT: movdqa %xmm0, %xmm7 1099; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] 1100; SSE2-NEXT: pslld $31, %xmm7 1101; SSE2-NEXT: psrad $31, %xmm7 1102; SSE2-NEXT: movdqa %xmm10, %xmm11 1103; SSE2-NEXT: pand %xmm7, %xmm11 1104; SSE2-NEXT: pandn %xmm6, %xmm7 1105; SSE2-NEXT: por %xmm11, %xmm7 1106; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1107; SSE2-NEXT: pslld $31, %xmm0 1108; SSE2-NEXT: psrad $31, %xmm0 1109; SSE2-NEXT: pand %xmm0, %xmm10 1110; SSE2-NEXT: pandn %xmm5, %xmm0 1111; SSE2-NEXT: por %xmm10, %xmm0 1112; SSE2-NEXT: divps %xmm0, %xmm1 1113; SSE2-NEXT: divps %xmm7, %xmm2 1114; SSE2-NEXT: divps %xmm9, %xmm3 1115; SSE2-NEXT: divps %xmm8, %xmm4 1116; SSE2-NEXT: movaps %xmm1, %xmm0 1117; SSE2-NEXT: movaps %xmm2, %xmm1 1118; SSE2-NEXT: movaps %xmm3, %xmm2 1119; SSE2-NEXT: movaps %xmm4, %xmm3 1120; SSE2-NEXT: retq 1121; 1122; SSE42-LABEL: fdiv_v16f32_swap: 1123; SSE42: # %bb.0: 1124; SSE42-NEXT: movdqa %xmm0, %xmm8 1125; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9 1126; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 1127; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1128; SSE42-NEXT: pslld $31, %xmm0 1129; SSE42-NEXT: movaps {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1130; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm9 1131; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] 1132; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1133; SSE42-NEXT: pslld $31, %xmm0 1134; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7 1135; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] 1136; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1137; SSE42-NEXT: pslld $31, %xmm0 1138; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm6 1139; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero 1140; SSE42-NEXT: pslld $31, %xmm0 1141; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5 1142; SSE42-NEXT: divps %xmm5, %xmm1 1143; SSE42-NEXT: divps %xmm6, %xmm2 1144; SSE42-NEXT: divps %xmm7, %xmm3 1145; SSE42-NEXT: divps %xmm9, %xmm4 1146; SSE42-NEXT: movaps %xmm1, %xmm0 1147; SSE42-NEXT: movaps %xmm2, %xmm1 1148; SSE42-NEXT: movaps %xmm3, %xmm2 1149; SSE42-NEXT: movaps %xmm4, %xmm3 1150; SSE42-NEXT: retq 1151; 1152; AVX2-LABEL: fdiv_v16f32_swap: 1153; AVX2: # %bb.0: 1154; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 1155; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 1156; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1157; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 1158; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1159; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1160; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 1161; AVX2-NEXT: vblendvps %ymm0, %ymm6, %ymm4, %ymm4 1162; AVX2-NEXT: vdivps %ymm3, %ymm1, %ymm0 1163; AVX2-NEXT: vdivps %ymm4, %ymm2, %ymm1 1164; AVX2-NEXT: retq 1165; 1166; AVX512-LABEL: fdiv_v16f32_swap: 1167; AVX512: # %bb.0: 1168; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 1169; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 1170; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 1171; AVX512-NEXT: vdivps %zmm2, %zmm1, %zmm0 1172; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1} 1173; AVX512-NEXT: retq 1174 %s = select <16 x i1> %b, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, <16 x float> %y 1175 %r = fdiv <16 x float> %x, %s 1176 ret <16 x float> %r 1177} 1178 1179define <16 x float> @fdiv_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { 1180; SSE2-LABEL: fdiv_v16f32_commute_swap: 1181; SSE2: # %bb.0: 1182; SSE2-NEXT: movaps %xmm2, %xmm8 1183; SSE2-NEXT: movdqa %xmm0, %xmm2 1184; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1185; SSE2-NEXT: movdqa %xmm2, %xmm10 1186; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7] 1187; SSE2-NEXT: pslld $31, %xmm10 1188; SSE2-NEXT: psrad $31, %xmm10 1189; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1190; SSE2-NEXT: movdqa %xmm11, %xmm9 1191; SSE2-NEXT: pand %xmm10, %xmm9 1192; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm10 1193; SSE2-NEXT: por %xmm9, %xmm10 1194; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1195; SSE2-NEXT: pslld $31, %xmm2 1196; SSE2-NEXT: psrad $31, %xmm2 1197; SSE2-NEXT: movdqa %xmm11, %xmm9 1198; SSE2-NEXT: pand %xmm2, %xmm9 1199; SSE2-NEXT: pandn %xmm7, %xmm2 1200; SSE2-NEXT: por %xmm9, %xmm2 1201; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1202; SSE2-NEXT: movdqa %xmm0, %xmm9 1203; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] 1204; SSE2-NEXT: pslld $31, %xmm9 1205; SSE2-NEXT: psrad $31, %xmm9 1206; SSE2-NEXT: movdqa %xmm11, %xmm7 1207; SSE2-NEXT: pand %xmm9, %xmm7 1208; SSE2-NEXT: pandn %xmm6, %xmm9 1209; SSE2-NEXT: por %xmm7, %xmm9 1210; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1211; SSE2-NEXT: pslld $31, %xmm0 1212; SSE2-NEXT: psrad $31, %xmm0 1213; SSE2-NEXT: pand %xmm0, %xmm11 1214; SSE2-NEXT: pandn %xmm5, %xmm0 1215; SSE2-NEXT: por %xmm11, %xmm0 1216; SSE2-NEXT: divps %xmm1, %xmm0 1217; SSE2-NEXT: divps %xmm8, %xmm9 1218; SSE2-NEXT: divps %xmm3, %xmm2 1219; SSE2-NEXT: divps %xmm4, %xmm10 1220; SSE2-NEXT: movaps %xmm9, %xmm1 1221; SSE2-NEXT: movaps %xmm10, %xmm3 1222; SSE2-NEXT: retq 1223; 1224; SSE42-LABEL: fdiv_v16f32_commute_swap: 1225; SSE42: # %bb.0: 1226; SSE42-NEXT: movaps %xmm3, %xmm8 1227; SSE42-NEXT: movdqa %xmm0, %xmm9 1228; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3 1229; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 1230; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1231; SSE42-NEXT: pslld $31, %xmm0 1232; SSE42-NEXT: movaps {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1233; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm3 1234; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] 1235; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1236; SSE42-NEXT: pslld $31, %xmm0 1237; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7 1238; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] 1239; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1240; SSE42-NEXT: pslld $31, %xmm0 1241; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm6 1242; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero 1243; SSE42-NEXT: pslld $31, %xmm0 1244; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5 1245; SSE42-NEXT: divps %xmm1, %xmm5 1246; SSE42-NEXT: divps %xmm2, %xmm6 1247; SSE42-NEXT: divps %xmm8, %xmm7 1248; SSE42-NEXT: divps %xmm4, %xmm3 1249; SSE42-NEXT: movaps %xmm5, %xmm0 1250; SSE42-NEXT: movaps %xmm6, %xmm1 1251; SSE42-NEXT: movaps %xmm7, %xmm2 1252; SSE42-NEXT: retq 1253; 1254; AVX2-LABEL: fdiv_v16f32_commute_swap: 1255; AVX2: # %bb.0: 1256; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 1257; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 1258; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1259; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 1260; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1261; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1262; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 1263; AVX2-NEXT: vblendvps %ymm0, %ymm6, %ymm4, %ymm4 1264; AVX2-NEXT: vdivps %ymm1, %ymm3, %ymm0 1265; AVX2-NEXT: vdivps %ymm2, %ymm4, %ymm1 1266; AVX2-NEXT: retq 1267; 1268; AVX512-LABEL: fdiv_v16f32_commute_swap: 1269; AVX512: # %bb.0: 1270; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 1271; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 1272; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 1273; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 {%k1} = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1274; AVX512-NEXT: vdivps %zmm1, %zmm2, %zmm0 1275; AVX512-NEXT: retq 1276 %s = select <16 x i1> %b, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, <16 x float> %y 1277 %r = fdiv <16 x float> %s, %x 1278 ret <16 x float> %r 1279} 1280 1281define <8 x float> @fadd_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> noundef %x, <8 x float> noundef %y) { 1282; SSE2-LABEL: fadd_v8f32_cast_cond: 1283; SSE2: # %bb.0: 1284; SSE2-NEXT: movd %edi, %xmm4 1285; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 1286; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] 1287; SSE2-NEXT: movdqa %xmm4, %xmm6 1288; SSE2-NEXT: pand %xmm5, %xmm6 1289; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 1290; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1291; SSE2-NEXT: pand %xmm6, %xmm3 1292; SSE2-NEXT: pandn %xmm5, %xmm6 1293; SSE2-NEXT: por %xmm3, %xmm6 1294; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] 1295; SSE2-NEXT: pand %xmm3, %xmm4 1296; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 1297; SSE2-NEXT: pand %xmm4, %xmm2 1298; SSE2-NEXT: pandn %xmm5, %xmm4 1299; SSE2-NEXT: por %xmm2, %xmm4 1300; SSE2-NEXT: addps %xmm4, %xmm0 1301; SSE2-NEXT: addps %xmm6, %xmm1 1302; SSE2-NEXT: retq 1303; 1304; SSE42-LABEL: fadd_v8f32_cast_cond: 1305; SSE42: # %bb.0: 1306; SSE42-NEXT: movaps %xmm0, %xmm4 1307; SSE42-NEXT: movd %edi, %xmm0 1308; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] 1309; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = [16,32,64,128] 1310; SSE42-NEXT: movdqa %xmm5, %xmm0 1311; SSE42-NEXT: pand %xmm6, %xmm0 1312; SSE42-NEXT: pcmpeqd %xmm6, %xmm0 1313; SSE42-NEXT: movaps {{.*#+}} xmm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1314; SSE42-NEXT: movaps %xmm6, %xmm7 1315; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7 1316; SSE42-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,2,4,8] 1317; SSE42-NEXT: pand %xmm0, %xmm5 1318; SSE42-NEXT: pcmpeqd %xmm0, %xmm5 1319; SSE42-NEXT: movdqa %xmm5, %xmm0 1320; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm6 1321; SSE42-NEXT: addps %xmm4, %xmm6 1322; SSE42-NEXT: addps %xmm7, %xmm1 1323; SSE42-NEXT: movaps %xmm6, %xmm0 1324; SSE42-NEXT: retq 1325; 1326; AVX2-LABEL: fadd_v8f32_cast_cond: 1327; AVX2: # %bb.0: 1328; AVX2-NEXT: vmovd %edi, %xmm2 1329; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 1330; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] 1331; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 1332; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 1333; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1334; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm3, %ymm1 1335; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 1336; AVX2-NEXT: retq 1337; 1338; AVX512F-LABEL: fadd_v8f32_cast_cond: 1339; AVX512F: # %bb.0: 1340; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1341; AVX512F-NEXT: kmovw %edi, %k1 1342; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1343; AVX512F-NEXT: vmovaps %zmm1, %zmm2 {%k1} 1344; AVX512F-NEXT: vaddps %ymm2, %ymm0, %ymm0 1345; AVX512F-NEXT: retq 1346; 1347; AVX512VL-LABEL: fadd_v8f32_cast_cond: 1348; AVX512VL: # %bb.0: 1349; AVX512VL-NEXT: kmovw %edi, %k1 1350; AVX512VL-NEXT: vaddps %ymm1, %ymm0, %ymm0 {%k1} 1351; AVX512VL-NEXT: retq 1352 %b = bitcast i8 %pb to <8 x i1> 1353 %s = select <8 x i1> %b, <8 x float> %y, <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0> 1354 %r = fadd <8 x float> %x, %s 1355 ret <8 x float> %r 1356} 1357 1358define <8 x double> @fadd_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> noundef %x, <8 x double> noundef %y) { 1359; SSE2-LABEL: fadd_v8f64_cast_cond: 1360; SSE2: # %bb.0: 1361; SSE2-NEXT: movd %edi, %xmm8 1362; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] 1363; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [64,128] 1364; SSE2-NEXT: movdqa %xmm9, %xmm10 1365; SSE2-NEXT: pand %xmm8, %xmm10 1366; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 1367; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,0,3,2] 1368; SSE2-NEXT: pand %xmm10, %xmm8 1369; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [-0.0E+0,-0.0E+0] 1370; SSE2-NEXT: pand %xmm8, %xmm7 1371; SSE2-NEXT: pandn %xmm10, %xmm8 1372; SSE2-NEXT: por %xmm7, %xmm8 1373; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [16,32] 1374; SSE2-NEXT: movdqa %xmm9, %xmm11 1375; SSE2-NEXT: pand %xmm7, %xmm11 1376; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 1377; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,0,3,2] 1378; SSE2-NEXT: pand %xmm11, %xmm7 1379; SSE2-NEXT: pand %xmm7, %xmm6 1380; SSE2-NEXT: pandn %xmm10, %xmm7 1381; SSE2-NEXT: por %xmm6, %xmm7 1382; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] 1383; SSE2-NEXT: movdqa %xmm9, %xmm11 1384; SSE2-NEXT: pand %xmm6, %xmm11 1385; SSE2-NEXT: pcmpeqd %xmm6, %xmm11 1386; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,0,3,2] 1387; SSE2-NEXT: pand %xmm11, %xmm6 1388; SSE2-NEXT: pand %xmm6, %xmm5 1389; SSE2-NEXT: pandn %xmm10, %xmm6 1390; SSE2-NEXT: por %xmm5, %xmm6 1391; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] 1392; SSE2-NEXT: pand %xmm5, %xmm9 1393; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 1394; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2] 1395; SSE2-NEXT: pand %xmm9, %xmm5 1396; SSE2-NEXT: pand %xmm5, %xmm4 1397; SSE2-NEXT: pandn %xmm10, %xmm5 1398; SSE2-NEXT: por %xmm4, %xmm5 1399; SSE2-NEXT: addpd %xmm5, %xmm0 1400; SSE2-NEXT: addpd %xmm6, %xmm1 1401; SSE2-NEXT: addpd %xmm7, %xmm2 1402; SSE2-NEXT: addpd %xmm8, %xmm3 1403; SSE2-NEXT: retq 1404; 1405; SSE42-LABEL: fadd_v8f64_cast_cond: 1406; SSE42: # %bb.0: 1407; SSE42-NEXT: movapd %xmm0, %xmm9 1408; SSE42-NEXT: movd %edi, %xmm0 1409; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] 1410; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] 1411; SSE42-NEXT: movdqa %xmm8, %xmm0 1412; SSE42-NEXT: pand %xmm10, %xmm0 1413; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 1414; SSE42-NEXT: movapd {{.*#+}} xmm10 = [-0.0E+0,-0.0E+0] 1415; SSE42-NEXT: movapd %xmm10, %xmm11 1416; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 1417; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] 1418; SSE42-NEXT: movdqa %xmm8, %xmm0 1419; SSE42-NEXT: pand %xmm7, %xmm0 1420; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 1421; SSE42-NEXT: movapd %xmm10, %xmm7 1422; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 1423; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] 1424; SSE42-NEXT: movdqa %xmm8, %xmm0 1425; SSE42-NEXT: pand %xmm6, %xmm0 1426; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 1427; SSE42-NEXT: movapd %xmm10, %xmm6 1428; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 1429; SSE42-NEXT: pmovsxbq {{.*#+}} xmm0 = [1,2] 1430; SSE42-NEXT: pand %xmm0, %xmm8 1431; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 1432; SSE42-NEXT: movdqa %xmm8, %xmm0 1433; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm10 1434; SSE42-NEXT: addpd %xmm9, %xmm10 1435; SSE42-NEXT: addpd %xmm6, %xmm1 1436; SSE42-NEXT: addpd %xmm7, %xmm2 1437; SSE42-NEXT: addpd %xmm11, %xmm3 1438; SSE42-NEXT: movapd %xmm10, %xmm0 1439; SSE42-NEXT: retq 1440; 1441; AVX2-LABEL: fadd_v8f64_cast_cond: 1442; AVX2: # %bb.0: 1443; AVX2-NEXT: vmovd %edi, %xmm4 1444; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 1445; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] 1446; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 1447; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 1448; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1449; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm6, %ymm3 1450; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] 1451; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 1452; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 1453; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 1454; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 1455; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1 1456; AVX2-NEXT: retq 1457; 1458; AVX512-LABEL: fadd_v8f64_cast_cond: 1459; AVX512: # %bb.0: 1460; AVX512-NEXT: kmovw %edi, %k1 1461; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} 1462; AVX512-NEXT: retq 1463 %b = bitcast i8 %pb to <8 x i1> 1464 %s = select <8 x i1> %b, <8 x double> %y, <8 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0> 1465 %r = fadd <8 x double> %x, %s 1466 ret <8 x double> %r 1467} 1468 1469define <8 x float> @fsub_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> noundef %x, <8 x float> noundef %y) { 1470; SSE2-LABEL: fsub_v8f32_cast_cond: 1471; SSE2: # %bb.0: 1472; SSE2-NEXT: movd %edi, %xmm4 1473; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 1474; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] 1475; SSE2-NEXT: movdqa %xmm4, %xmm6 1476; SSE2-NEXT: pand %xmm5, %xmm6 1477; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 1478; SSE2-NEXT: pand %xmm3, %xmm6 1479; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] 1480; SSE2-NEXT: pand %xmm3, %xmm4 1481; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 1482; SSE2-NEXT: pand %xmm2, %xmm4 1483; SSE2-NEXT: subps %xmm4, %xmm0 1484; SSE2-NEXT: subps %xmm6, %xmm1 1485; SSE2-NEXT: retq 1486; 1487; SSE42-LABEL: fsub_v8f32_cast_cond: 1488; SSE42: # %bb.0: 1489; SSE42-NEXT: movd %edi, %xmm4 1490; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 1491; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = [16,32,64,128] 1492; SSE42-NEXT: movdqa %xmm4, %xmm6 1493; SSE42-NEXT: pand %xmm5, %xmm6 1494; SSE42-NEXT: pcmpeqd %xmm5, %xmm6 1495; SSE42-NEXT: pand %xmm3, %xmm6 1496; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] 1497; SSE42-NEXT: pand %xmm3, %xmm4 1498; SSE42-NEXT: pcmpeqd %xmm3, %xmm4 1499; SSE42-NEXT: pand %xmm2, %xmm4 1500; SSE42-NEXT: subps %xmm4, %xmm0 1501; SSE42-NEXT: subps %xmm6, %xmm1 1502; SSE42-NEXT: retq 1503; 1504; AVX2-LABEL: fsub_v8f32_cast_cond: 1505; AVX2: # %bb.0: 1506; AVX2-NEXT: vmovd %edi, %xmm2 1507; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 1508; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] 1509; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 1510; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 1511; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 1512; AVX2-NEXT: vsubps %ymm1, %ymm0, %ymm0 1513; AVX2-NEXT: retq 1514; 1515; AVX512F-LABEL: fsub_v8f32_cast_cond: 1516; AVX512F: # %bb.0: 1517; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1518; AVX512F-NEXT: kmovw %edi, %k1 1519; AVX512F-NEXT: vmovaps %zmm1, %zmm1 {%k1} {z} 1520; AVX512F-NEXT: vsubps %ymm1, %ymm0, %ymm0 1521; AVX512F-NEXT: retq 1522; 1523; AVX512VL-LABEL: fsub_v8f32_cast_cond: 1524; AVX512VL: # %bb.0: 1525; AVX512VL-NEXT: kmovw %edi, %k1 1526; AVX512VL-NEXT: vsubps %ymm1, %ymm0, %ymm0 {%k1} 1527; AVX512VL-NEXT: retq 1528 %b = bitcast i8 %pb to <8 x i1> 1529 %s = select <8 x i1> %b, <8 x float> %y, <8 x float> zeroinitializer 1530 %r = fsub <8 x float> %x, %s 1531 ret <8 x float> %r 1532} 1533 1534define <8 x double> @fsub_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> noundef %x, <8 x double> noundef %y) { 1535; SSE2-LABEL: fsub_v8f64_cast_cond: 1536; SSE2: # %bb.0: 1537; SSE2-NEXT: movd %edi, %xmm8 1538; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] 1539; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,128] 1540; SSE2-NEXT: movdqa %xmm9, %xmm8 1541; SSE2-NEXT: pand %xmm10, %xmm8 1542; SSE2-NEXT: pcmpeqd %xmm10, %xmm8 1543; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,0,3,2] 1544; SSE2-NEXT: pand %xmm7, %xmm8 1545; SSE2-NEXT: pand %xmm10, %xmm8 1546; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32] 1547; SSE2-NEXT: movdqa %xmm9, %xmm7 1548; SSE2-NEXT: pand %xmm10, %xmm7 1549; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 1550; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2] 1551; SSE2-NEXT: pand %xmm6, %xmm7 1552; SSE2-NEXT: pand %xmm10, %xmm7 1553; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] 1554; SSE2-NEXT: movdqa %xmm9, %xmm10 1555; SSE2-NEXT: pand %xmm6, %xmm10 1556; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 1557; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2] 1558; SSE2-NEXT: pand %xmm5, %xmm10 1559; SSE2-NEXT: pand %xmm6, %xmm10 1560; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] 1561; SSE2-NEXT: pand %xmm5, %xmm9 1562; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 1563; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2] 1564; SSE2-NEXT: pand %xmm4, %xmm9 1565; SSE2-NEXT: pand %xmm5, %xmm9 1566; SSE2-NEXT: subpd %xmm9, %xmm0 1567; SSE2-NEXT: subpd %xmm10, %xmm1 1568; SSE2-NEXT: subpd %xmm7, %xmm2 1569; SSE2-NEXT: subpd %xmm8, %xmm3 1570; SSE2-NEXT: retq 1571; 1572; SSE42-LABEL: fsub_v8f64_cast_cond: 1573; SSE42: # %bb.0: 1574; SSE42-NEXT: movd %edi, %xmm8 1575; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] 1576; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] 1577; SSE42-NEXT: movdqa %xmm9, %xmm8 1578; SSE42-NEXT: pand %xmm10, %xmm8 1579; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 1580; SSE42-NEXT: pand %xmm7, %xmm8 1581; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] 1582; SSE42-NEXT: movdqa %xmm9, %xmm10 1583; SSE42-NEXT: pand %xmm7, %xmm10 1584; SSE42-NEXT: pcmpeqq %xmm7, %xmm10 1585; SSE42-NEXT: pand %xmm6, %xmm10 1586; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] 1587; SSE42-NEXT: movdqa %xmm9, %xmm7 1588; SSE42-NEXT: pand %xmm6, %xmm7 1589; SSE42-NEXT: pcmpeqq %xmm6, %xmm7 1590; SSE42-NEXT: pand %xmm5, %xmm7 1591; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] 1592; SSE42-NEXT: pand %xmm5, %xmm9 1593; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 1594; SSE42-NEXT: pand %xmm4, %xmm9 1595; SSE42-NEXT: subpd %xmm9, %xmm0 1596; SSE42-NEXT: subpd %xmm7, %xmm1 1597; SSE42-NEXT: subpd %xmm10, %xmm2 1598; SSE42-NEXT: subpd %xmm8, %xmm3 1599; SSE42-NEXT: retq 1600; 1601; AVX2-LABEL: fsub_v8f64_cast_cond: 1602; AVX2: # %bb.0: 1603; AVX2-NEXT: vmovd %edi, %xmm4 1604; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 1605; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] 1606; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 1607; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 1608; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 1609; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] 1610; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 1611; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 1612; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 1613; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0 1614; AVX2-NEXT: vsubpd %ymm3, %ymm1, %ymm1 1615; AVX2-NEXT: retq 1616; 1617; AVX512-LABEL: fsub_v8f64_cast_cond: 1618; AVX512: # %bb.0: 1619; AVX512-NEXT: kmovw %edi, %k1 1620; AVX512-NEXT: vsubpd %zmm1, %zmm0, %zmm0 {%k1} 1621; AVX512-NEXT: retq 1622 %b = bitcast i8 %pb to <8 x i1> 1623 %s = select <8 x i1> %b, <8 x double> %y, <8 x double> zeroinitializer 1624 %r = fsub <8 x double> %x, %s 1625 ret <8 x double> %r 1626} 1627 1628define <8 x float> @fmul_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> noundef %x, <8 x float> noundef %y) { 1629; SSE2-LABEL: fmul_v8f32_cast_cond: 1630; SSE2: # %bb.0: 1631; SSE2-NEXT: movd %edi, %xmm4 1632; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 1633; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] 1634; SSE2-NEXT: movdqa %xmm4, %xmm6 1635; SSE2-NEXT: pand %xmm5, %xmm6 1636; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 1637; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1638; SSE2-NEXT: pand %xmm6, %xmm3 1639; SSE2-NEXT: pandn %xmm5, %xmm6 1640; SSE2-NEXT: por %xmm3, %xmm6 1641; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] 1642; SSE2-NEXT: pand %xmm3, %xmm4 1643; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 1644; SSE2-NEXT: pand %xmm4, %xmm2 1645; SSE2-NEXT: pandn %xmm5, %xmm4 1646; SSE2-NEXT: por %xmm2, %xmm4 1647; SSE2-NEXT: mulps %xmm4, %xmm0 1648; SSE2-NEXT: mulps %xmm6, %xmm1 1649; SSE2-NEXT: retq 1650; 1651; SSE42-LABEL: fmul_v8f32_cast_cond: 1652; SSE42: # %bb.0: 1653; SSE42-NEXT: movaps %xmm0, %xmm4 1654; SSE42-NEXT: movd %edi, %xmm0 1655; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] 1656; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = [16,32,64,128] 1657; SSE42-NEXT: movdqa %xmm5, %xmm0 1658; SSE42-NEXT: pand %xmm6, %xmm0 1659; SSE42-NEXT: pcmpeqd %xmm6, %xmm0 1660; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1661; SSE42-NEXT: movaps %xmm6, %xmm7 1662; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7 1663; SSE42-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,2,4,8] 1664; SSE42-NEXT: pand %xmm0, %xmm5 1665; SSE42-NEXT: pcmpeqd %xmm0, %xmm5 1666; SSE42-NEXT: movdqa %xmm5, %xmm0 1667; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm6 1668; SSE42-NEXT: mulps %xmm4, %xmm6 1669; SSE42-NEXT: mulps %xmm7, %xmm1 1670; SSE42-NEXT: movaps %xmm6, %xmm0 1671; SSE42-NEXT: retq 1672; 1673; AVX2-LABEL: fmul_v8f32_cast_cond: 1674; AVX2: # %bb.0: 1675; AVX2-NEXT: vmovd %edi, %xmm2 1676; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 1677; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] 1678; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 1679; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 1680; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1681; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm3, %ymm1 1682; AVX2-NEXT: vmulps %ymm1, %ymm0, %ymm0 1683; AVX2-NEXT: retq 1684; 1685; AVX512F-LABEL: fmul_v8f32_cast_cond: 1686; AVX512F: # %bb.0: 1687; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1688; AVX512F-NEXT: kmovw %edi, %k1 1689; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1690; AVX512F-NEXT: vmovaps %zmm1, %zmm2 {%k1} 1691; AVX512F-NEXT: vmulps %ymm2, %ymm0, %ymm0 1692; AVX512F-NEXT: retq 1693; 1694; AVX512VL-LABEL: fmul_v8f32_cast_cond: 1695; AVX512VL: # %bb.0: 1696; AVX512VL-NEXT: kmovw %edi, %k1 1697; AVX512VL-NEXT: vmulps %ymm1, %ymm0, %ymm0 {%k1} 1698; AVX512VL-NEXT: retq 1699 %b = bitcast i8 %pb to <8 x i1> 1700 %s = select <8 x i1> %b, <8 x float> %y, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0> 1701 %r = fmul <8 x float> %x, %s 1702 ret <8 x float> %r 1703} 1704 1705define <8 x double> @fmul_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> noundef %x, <8 x double> noundef %y) { 1706; SSE2-LABEL: fmul_v8f64_cast_cond: 1707; SSE2: # %bb.0: 1708; SSE2-NEXT: movd %edi, %xmm8 1709; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] 1710; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [64,128] 1711; SSE2-NEXT: movdqa %xmm9, %xmm10 1712; SSE2-NEXT: pand %xmm8, %xmm10 1713; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 1714; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,0,3,2] 1715; SSE2-NEXT: pand %xmm10, %xmm8 1716; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [1.0E+0,1.0E+0] 1717; SSE2-NEXT: pand %xmm8, %xmm7 1718; SSE2-NEXT: pandn %xmm10, %xmm8 1719; SSE2-NEXT: por %xmm7, %xmm8 1720; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [16,32] 1721; SSE2-NEXT: movdqa %xmm9, %xmm11 1722; SSE2-NEXT: pand %xmm7, %xmm11 1723; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 1724; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,0,3,2] 1725; SSE2-NEXT: pand %xmm11, %xmm7 1726; SSE2-NEXT: pand %xmm7, %xmm6 1727; SSE2-NEXT: pandn %xmm10, %xmm7 1728; SSE2-NEXT: por %xmm6, %xmm7 1729; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] 1730; SSE2-NEXT: movdqa %xmm9, %xmm11 1731; SSE2-NEXT: pand %xmm6, %xmm11 1732; SSE2-NEXT: pcmpeqd %xmm6, %xmm11 1733; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,0,3,2] 1734; SSE2-NEXT: pand %xmm11, %xmm6 1735; SSE2-NEXT: pand %xmm6, %xmm5 1736; SSE2-NEXT: pandn %xmm10, %xmm6 1737; SSE2-NEXT: por %xmm5, %xmm6 1738; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] 1739; SSE2-NEXT: pand %xmm5, %xmm9 1740; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 1741; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2] 1742; SSE2-NEXT: pand %xmm9, %xmm5 1743; SSE2-NEXT: pand %xmm5, %xmm4 1744; SSE2-NEXT: pandn %xmm10, %xmm5 1745; SSE2-NEXT: por %xmm4, %xmm5 1746; SSE2-NEXT: mulpd %xmm5, %xmm0 1747; SSE2-NEXT: mulpd %xmm6, %xmm1 1748; SSE2-NEXT: mulpd %xmm7, %xmm2 1749; SSE2-NEXT: mulpd %xmm8, %xmm3 1750; SSE2-NEXT: retq 1751; 1752; SSE42-LABEL: fmul_v8f64_cast_cond: 1753; SSE42: # %bb.0: 1754; SSE42-NEXT: movapd %xmm0, %xmm9 1755; SSE42-NEXT: movd %edi, %xmm0 1756; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] 1757; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] 1758; SSE42-NEXT: movdqa %xmm8, %xmm0 1759; SSE42-NEXT: pand %xmm10, %xmm0 1760; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 1761; SSE42-NEXT: movapd {{.*#+}} xmm10 = [1.0E+0,1.0E+0] 1762; SSE42-NEXT: movapd %xmm10, %xmm11 1763; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 1764; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] 1765; SSE42-NEXT: movdqa %xmm8, %xmm0 1766; SSE42-NEXT: pand %xmm7, %xmm0 1767; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 1768; SSE42-NEXT: movapd %xmm10, %xmm7 1769; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 1770; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] 1771; SSE42-NEXT: movdqa %xmm8, %xmm0 1772; SSE42-NEXT: pand %xmm6, %xmm0 1773; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 1774; SSE42-NEXT: movapd %xmm10, %xmm6 1775; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 1776; SSE42-NEXT: pmovsxbq {{.*#+}} xmm0 = [1,2] 1777; SSE42-NEXT: pand %xmm0, %xmm8 1778; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 1779; SSE42-NEXT: movdqa %xmm8, %xmm0 1780; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm10 1781; SSE42-NEXT: mulpd %xmm9, %xmm10 1782; SSE42-NEXT: mulpd %xmm6, %xmm1 1783; SSE42-NEXT: mulpd %xmm7, %xmm2 1784; SSE42-NEXT: mulpd %xmm11, %xmm3 1785; SSE42-NEXT: movapd %xmm10, %xmm0 1786; SSE42-NEXT: retq 1787; 1788; AVX2-LABEL: fmul_v8f64_cast_cond: 1789; AVX2: # %bb.0: 1790; AVX2-NEXT: vmovd %edi, %xmm4 1791; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 1792; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] 1793; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 1794; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 1795; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1796; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm6, %ymm3 1797; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] 1798; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 1799; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 1800; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 1801; AVX2-NEXT: vmulpd %ymm2, %ymm0, %ymm0 1802; AVX2-NEXT: vmulpd %ymm3, %ymm1, %ymm1 1803; AVX2-NEXT: retq 1804; 1805; AVX512-LABEL: fmul_v8f64_cast_cond: 1806; AVX512: # %bb.0: 1807; AVX512-NEXT: kmovw %edi, %k1 1808; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 {%k1} 1809; AVX512-NEXT: retq 1810 %b = bitcast i8 %pb to <8 x i1> 1811 %s = select <8 x i1> %b, <8 x double> %y, <8 x double> <double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0> 1812 %r = fmul <8 x double> %x, %s 1813 ret <8 x double> %r 1814} 1815 1816define <8 x float> @fdiv_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> noundef %x, <8 x float> noundef %y) { 1817; SSE2-LABEL: fdiv_v8f32_cast_cond: 1818; SSE2: # %bb.0: 1819; SSE2-NEXT: movd %edi, %xmm4 1820; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 1821; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] 1822; SSE2-NEXT: movdqa %xmm4, %xmm6 1823; SSE2-NEXT: pand %xmm5, %xmm6 1824; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 1825; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1826; SSE2-NEXT: pand %xmm6, %xmm3 1827; SSE2-NEXT: pandn %xmm5, %xmm6 1828; SSE2-NEXT: por %xmm3, %xmm6 1829; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] 1830; SSE2-NEXT: pand %xmm3, %xmm4 1831; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 1832; SSE2-NEXT: pand %xmm4, %xmm2 1833; SSE2-NEXT: pandn %xmm5, %xmm4 1834; SSE2-NEXT: por %xmm2, %xmm4 1835; SSE2-NEXT: divps %xmm4, %xmm0 1836; SSE2-NEXT: divps %xmm6, %xmm1 1837; SSE2-NEXT: retq 1838; 1839; SSE42-LABEL: fdiv_v8f32_cast_cond: 1840; SSE42: # %bb.0: 1841; SSE42-NEXT: movaps %xmm0, %xmm4 1842; SSE42-NEXT: movd %edi, %xmm0 1843; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] 1844; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = [16,32,64,128] 1845; SSE42-NEXT: movdqa %xmm5, %xmm0 1846; SSE42-NEXT: pand %xmm6, %xmm0 1847; SSE42-NEXT: pcmpeqd %xmm6, %xmm0 1848; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1849; SSE42-NEXT: movaps %xmm6, %xmm7 1850; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7 1851; SSE42-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,2,4,8] 1852; SSE42-NEXT: pand %xmm0, %xmm5 1853; SSE42-NEXT: pcmpeqd %xmm0, %xmm5 1854; SSE42-NEXT: movdqa %xmm5, %xmm0 1855; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm6 1856; SSE42-NEXT: divps %xmm6, %xmm4 1857; SSE42-NEXT: divps %xmm7, %xmm1 1858; SSE42-NEXT: movaps %xmm4, %xmm0 1859; SSE42-NEXT: retq 1860; 1861; AVX2-LABEL: fdiv_v8f32_cast_cond: 1862; AVX2: # %bb.0: 1863; AVX2-NEXT: vmovd %edi, %xmm2 1864; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 1865; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] 1866; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 1867; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 1868; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1869; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm3, %ymm1 1870; AVX2-NEXT: vdivps %ymm1, %ymm0, %ymm0 1871; AVX2-NEXT: retq 1872; 1873; AVX512F-LABEL: fdiv_v8f32_cast_cond: 1874; AVX512F: # %bb.0: 1875; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1876; AVX512F-NEXT: kmovw %edi, %k1 1877; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1878; AVX512F-NEXT: vmovaps %zmm1, %zmm2 {%k1} 1879; AVX512F-NEXT: vdivps %ymm2, %ymm0, %ymm0 1880; AVX512F-NEXT: retq 1881; 1882; AVX512VL-LABEL: fdiv_v8f32_cast_cond: 1883; AVX512VL: # %bb.0: 1884; AVX512VL-NEXT: kmovw %edi, %k1 1885; AVX512VL-NEXT: vdivps %ymm1, %ymm0, %ymm0 {%k1} 1886; AVX512VL-NEXT: retq 1887 %b = bitcast i8 %pb to <8 x i1> 1888 %s = select <8 x i1> %b, <8 x float> %y, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0> 1889 %r = fdiv <8 x float> %x, %s 1890 ret <8 x float> %r 1891} 1892 1893define <8 x double> @fdiv_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> noundef %x, <8 x double> noundef %y) { 1894; SSE2-LABEL: fdiv_v8f64_cast_cond: 1895; SSE2: # %bb.0: 1896; SSE2-NEXT: movd %edi, %xmm8 1897; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] 1898; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [64,128] 1899; SSE2-NEXT: movdqa %xmm9, %xmm10 1900; SSE2-NEXT: pand %xmm8, %xmm10 1901; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 1902; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,0,3,2] 1903; SSE2-NEXT: pand %xmm10, %xmm8 1904; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [1.0E+0,1.0E+0] 1905; SSE2-NEXT: pand %xmm8, %xmm7 1906; SSE2-NEXT: pandn %xmm10, %xmm8 1907; SSE2-NEXT: por %xmm7, %xmm8 1908; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [16,32] 1909; SSE2-NEXT: movdqa %xmm9, %xmm11 1910; SSE2-NEXT: pand %xmm7, %xmm11 1911; SSE2-NEXT: pcmpeqd %xmm7, %xmm11 1912; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,0,3,2] 1913; SSE2-NEXT: pand %xmm11, %xmm7 1914; SSE2-NEXT: pand %xmm7, %xmm6 1915; SSE2-NEXT: pandn %xmm10, %xmm7 1916; SSE2-NEXT: por %xmm6, %xmm7 1917; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] 1918; SSE2-NEXT: movdqa %xmm9, %xmm11 1919; SSE2-NEXT: pand %xmm6, %xmm11 1920; SSE2-NEXT: pcmpeqd %xmm6, %xmm11 1921; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,0,3,2] 1922; SSE2-NEXT: pand %xmm11, %xmm6 1923; SSE2-NEXT: pand %xmm6, %xmm5 1924; SSE2-NEXT: pandn %xmm10, %xmm6 1925; SSE2-NEXT: por %xmm5, %xmm6 1926; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] 1927; SSE2-NEXT: pand %xmm5, %xmm9 1928; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 1929; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2] 1930; SSE2-NEXT: pand %xmm9, %xmm5 1931; SSE2-NEXT: pand %xmm5, %xmm4 1932; SSE2-NEXT: pandn %xmm10, %xmm5 1933; SSE2-NEXT: por %xmm4, %xmm5 1934; SSE2-NEXT: divpd %xmm5, %xmm0 1935; SSE2-NEXT: divpd %xmm6, %xmm1 1936; SSE2-NEXT: divpd %xmm7, %xmm2 1937; SSE2-NEXT: divpd %xmm8, %xmm3 1938; SSE2-NEXT: retq 1939; 1940; SSE42-LABEL: fdiv_v8f64_cast_cond: 1941; SSE42: # %bb.0: 1942; SSE42-NEXT: movapd %xmm0, %xmm9 1943; SSE42-NEXT: movd %edi, %xmm0 1944; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] 1945; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] 1946; SSE42-NEXT: movdqa %xmm8, %xmm0 1947; SSE42-NEXT: pand %xmm10, %xmm0 1948; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 1949; SSE42-NEXT: movapd {{.*#+}} xmm11 = [1.0E+0,1.0E+0] 1950; SSE42-NEXT: movapd %xmm11, %xmm10 1951; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm10 1952; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] 1953; SSE42-NEXT: movdqa %xmm8, %xmm0 1954; SSE42-NEXT: pand %xmm7, %xmm0 1955; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 1956; SSE42-NEXT: movapd %xmm11, %xmm7 1957; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 1958; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] 1959; SSE42-NEXT: movdqa %xmm8, %xmm0 1960; SSE42-NEXT: pand %xmm6, %xmm0 1961; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 1962; SSE42-NEXT: movapd %xmm11, %xmm6 1963; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 1964; SSE42-NEXT: pmovsxbq {{.*#+}} xmm0 = [1,2] 1965; SSE42-NEXT: pand %xmm0, %xmm8 1966; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 1967; SSE42-NEXT: movdqa %xmm8, %xmm0 1968; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm11 1969; SSE42-NEXT: divpd %xmm11, %xmm9 1970; SSE42-NEXT: divpd %xmm6, %xmm1 1971; SSE42-NEXT: divpd %xmm7, %xmm2 1972; SSE42-NEXT: divpd %xmm10, %xmm3 1973; SSE42-NEXT: movapd %xmm9, %xmm0 1974; SSE42-NEXT: retq 1975; 1976; AVX2-LABEL: fdiv_v8f64_cast_cond: 1977; AVX2: # %bb.0: 1978; AVX2-NEXT: vmovd %edi, %xmm4 1979; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 1980; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] 1981; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 1982; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 1983; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 1984; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm6, %ymm3 1985; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] 1986; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 1987; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 1988; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 1989; AVX2-NEXT: vdivpd %ymm2, %ymm0, %ymm0 1990; AVX2-NEXT: vdivpd %ymm3, %ymm1, %ymm1 1991; AVX2-NEXT: retq 1992; 1993; AVX512-LABEL: fdiv_v8f64_cast_cond: 1994; AVX512: # %bb.0: 1995; AVX512-NEXT: kmovw %edi, %k1 1996; AVX512-NEXT: vdivpd %zmm1, %zmm0, %zmm0 {%k1} 1997; AVX512-NEXT: retq 1998 %b = bitcast i8 %pb to <8 x i1> 1999 %s = select <8 x i1> %b, <8 x double> %y, <8 x double> <double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0> 2000 %r = fdiv <8 x double> %x, %s 2001 ret <8 x double> %r 2002} 2003 2004define <4 x i32> @add_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef %y) { 2005; SSE-LABEL: add_v4i32: 2006; SSE: # %bb.0: 2007; SSE-NEXT: pslld $31, %xmm0 2008; SSE-NEXT: psrad $31, %xmm0 2009; SSE-NEXT: pand %xmm2, %xmm0 2010; SSE-NEXT: paddd %xmm1, %xmm0 2011; SSE-NEXT: retq 2012; 2013; AVX2-LABEL: add_v4i32: 2014; AVX2: # %bb.0: 2015; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 2016; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 2017; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 2018; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 2019; AVX2-NEXT: retq 2020; 2021; AVX512F-LABEL: add_v4i32: 2022; AVX512F: # %bb.0: 2023; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 2024; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 2025; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 2026; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} 2027; AVX512F-NEXT: vpaddd %xmm0, %xmm1, %xmm0 2028; AVX512F-NEXT: vzeroupper 2029; AVX512F-NEXT: retq 2030; 2031; AVX512VL-LABEL: add_v4i32: 2032; AVX512VL: # %bb.0: 2033; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 2034; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 2035; AVX512VL-NEXT: vpaddd %xmm2, %xmm1, %xmm1 {%k1} 2036; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 2037; AVX512VL-NEXT: retq 2038 %s = select <4 x i1> %b, <4 x i32> %y, <4 x i32> zeroinitializer 2039 %r = add <4 x i32> %x, %s 2040 ret <4 x i32> %r 2041} 2042 2043define <8 x i32> @add_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> noundef %y) { 2044; SSE2-LABEL: add_v8i32_commute: 2045; SSE2: # %bb.0: 2046; SSE2-NEXT: movdqa %xmm0, %xmm5 2047; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 2048; SSE2-NEXT: pslld $31, %xmm5 2049; SSE2-NEXT: psrad $31, %xmm5 2050; SSE2-NEXT: pand %xmm4, %xmm5 2051; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2052; SSE2-NEXT: pslld $31, %xmm0 2053; SSE2-NEXT: psrad $31, %xmm0 2054; SSE2-NEXT: pand %xmm3, %xmm0 2055; SSE2-NEXT: paddd %xmm1, %xmm0 2056; SSE2-NEXT: paddd %xmm2, %xmm5 2057; SSE2-NEXT: movdqa %xmm5, %xmm1 2058; SSE2-NEXT: retq 2059; 2060; SSE42-LABEL: add_v8i32_commute: 2061; SSE42: # %bb.0: 2062; SSE42-NEXT: movdqa %xmm0, %xmm5 2063; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 2064; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 2065; SSE42-NEXT: pslld $31, %xmm5 2066; SSE42-NEXT: psrad $31, %xmm5 2067; SSE42-NEXT: pand %xmm4, %xmm5 2068; SSE42-NEXT: pslld $31, %xmm0 2069; SSE42-NEXT: psrad $31, %xmm0 2070; SSE42-NEXT: pand %xmm3, %xmm0 2071; SSE42-NEXT: paddd %xmm1, %xmm0 2072; SSE42-NEXT: paddd %xmm2, %xmm5 2073; SSE42-NEXT: movdqa %xmm5, %xmm1 2074; SSE42-NEXT: retq 2075; 2076; AVX2-LABEL: add_v8i32_commute: 2077; AVX2: # %bb.0: 2078; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2079; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 2080; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 2081; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2082; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 2083; AVX2-NEXT: retq 2084; 2085; AVX512F-LABEL: add_v8i32_commute: 2086; AVX512F: # %bb.0: 2087; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 2088; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 2089; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 2090; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 2091; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} 2092; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm0 2093; AVX512F-NEXT: retq 2094; 2095; AVX512VL-LABEL: add_v8i32_commute: 2096; AVX512VL: # %bb.0: 2097; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 2098; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 2099; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 2100; AVX512VL-NEXT: vpaddd %ymm2, %ymm1, %ymm1 {%k1} 2101; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 2102; AVX512VL-NEXT: retq 2103 %s = select <8 x i1> %b, <8 x i32> %y, <8 x i32> zeroinitializer 2104 %r = add <8 x i32> %s, %x 2105 ret <8 x i32> %r 2106} 2107 2108define <8 x i32> @add_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef %x, <8 x i32> noundef %y) { 2109; SSE2-LABEL: add_v8i32_cast_cond: 2110; SSE2: # %bb.0: 2111; SSE2-NEXT: movd %edi, %xmm4 2112; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 2113; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] 2114; SSE2-NEXT: movdqa %xmm4, %xmm6 2115; SSE2-NEXT: pand %xmm5, %xmm6 2116; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 2117; SSE2-NEXT: pand %xmm3, %xmm6 2118; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] 2119; SSE2-NEXT: pand %xmm3, %xmm4 2120; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 2121; SSE2-NEXT: pand %xmm2, %xmm4 2122; SSE2-NEXT: paddd %xmm4, %xmm0 2123; SSE2-NEXT: paddd %xmm6, %xmm1 2124; SSE2-NEXT: retq 2125; 2126; SSE42-LABEL: add_v8i32_cast_cond: 2127; SSE42: # %bb.0: 2128; SSE42-NEXT: movd %edi, %xmm4 2129; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 2130; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = [16,32,64,128] 2131; SSE42-NEXT: movdqa %xmm4, %xmm6 2132; SSE42-NEXT: pand %xmm5, %xmm6 2133; SSE42-NEXT: pcmpeqd %xmm5, %xmm6 2134; SSE42-NEXT: pand %xmm3, %xmm6 2135; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] 2136; SSE42-NEXT: pand %xmm3, %xmm4 2137; SSE42-NEXT: pcmpeqd %xmm3, %xmm4 2138; SSE42-NEXT: pand %xmm2, %xmm4 2139; SSE42-NEXT: paddd %xmm4, %xmm0 2140; SSE42-NEXT: paddd %xmm6, %xmm1 2141; SSE42-NEXT: retq 2142; 2143; AVX2-LABEL: add_v8i32_cast_cond: 2144; AVX2: # %bb.0: 2145; AVX2-NEXT: vmovd %edi, %xmm2 2146; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 2147; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] 2148; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 2149; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 2150; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 2151; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 2152; AVX2-NEXT: retq 2153; 2154; AVX512F-LABEL: add_v8i32_cast_cond: 2155; AVX512F: # %bb.0: 2156; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2157; AVX512F-NEXT: kmovw %edi, %k1 2158; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z} 2159; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm0 2160; AVX512F-NEXT: retq 2161; 2162; AVX512VL-LABEL: add_v8i32_cast_cond: 2163; AVX512VL: # %bb.0: 2164; AVX512VL-NEXT: kmovw %edi, %k1 2165; AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 {%k1} 2166; AVX512VL-NEXT: retq 2167 %b = bitcast i8 %pb to <8 x i1> 2168 %s = select <8 x i1> %b, <8 x i32> %y, <8 x i32> zeroinitializer 2169 %r = add <8 x i32> %x, %s 2170 ret <8 x i32> %r 2171} 2172 2173define <8 x i64> @add_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) { 2174; SSE2-LABEL: add_v8i64_cast_cond: 2175; SSE2: # %bb.0: 2176; SSE2-NEXT: movd %edi, %xmm8 2177; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] 2178; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,128] 2179; SSE2-NEXT: movdqa %xmm9, %xmm8 2180; SSE2-NEXT: pand %xmm10, %xmm8 2181; SSE2-NEXT: pcmpeqd %xmm10, %xmm8 2182; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,0,3,2] 2183; SSE2-NEXT: pand %xmm7, %xmm8 2184; SSE2-NEXT: pand %xmm10, %xmm8 2185; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32] 2186; SSE2-NEXT: movdqa %xmm9, %xmm7 2187; SSE2-NEXT: pand %xmm10, %xmm7 2188; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 2189; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2] 2190; SSE2-NEXT: pand %xmm6, %xmm7 2191; SSE2-NEXT: pand %xmm10, %xmm7 2192; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] 2193; SSE2-NEXT: movdqa %xmm9, %xmm10 2194; SSE2-NEXT: pand %xmm6, %xmm10 2195; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 2196; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2] 2197; SSE2-NEXT: pand %xmm5, %xmm10 2198; SSE2-NEXT: pand %xmm6, %xmm10 2199; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] 2200; SSE2-NEXT: pand %xmm5, %xmm9 2201; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 2202; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2] 2203; SSE2-NEXT: pand %xmm4, %xmm9 2204; SSE2-NEXT: pand %xmm5, %xmm9 2205; SSE2-NEXT: paddq %xmm9, %xmm0 2206; SSE2-NEXT: paddq %xmm10, %xmm1 2207; SSE2-NEXT: paddq %xmm7, %xmm2 2208; SSE2-NEXT: paddq %xmm8, %xmm3 2209; SSE2-NEXT: retq 2210; 2211; SSE42-LABEL: add_v8i64_cast_cond: 2212; SSE42: # %bb.0: 2213; SSE42-NEXT: movd %edi, %xmm8 2214; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] 2215; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] 2216; SSE42-NEXT: movdqa %xmm9, %xmm8 2217; SSE42-NEXT: pand %xmm10, %xmm8 2218; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 2219; SSE42-NEXT: pand %xmm7, %xmm8 2220; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] 2221; SSE42-NEXT: movdqa %xmm9, %xmm10 2222; SSE42-NEXT: pand %xmm7, %xmm10 2223; SSE42-NEXT: pcmpeqq %xmm7, %xmm10 2224; SSE42-NEXT: pand %xmm6, %xmm10 2225; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] 2226; SSE42-NEXT: movdqa %xmm9, %xmm7 2227; SSE42-NEXT: pand %xmm6, %xmm7 2228; SSE42-NEXT: pcmpeqq %xmm6, %xmm7 2229; SSE42-NEXT: pand %xmm5, %xmm7 2230; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] 2231; SSE42-NEXT: pand %xmm5, %xmm9 2232; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 2233; SSE42-NEXT: pand %xmm4, %xmm9 2234; SSE42-NEXT: paddq %xmm9, %xmm0 2235; SSE42-NEXT: paddq %xmm7, %xmm1 2236; SSE42-NEXT: paddq %xmm10, %xmm2 2237; SSE42-NEXT: paddq %xmm8, %xmm3 2238; SSE42-NEXT: retq 2239; 2240; AVX2-LABEL: add_v8i64_cast_cond: 2241; AVX2: # %bb.0: 2242; AVX2-NEXT: vmovd %edi, %xmm4 2243; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 2244; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] 2245; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 2246; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 2247; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 2248; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] 2249; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 2250; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 2251; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 2252; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 2253; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 2254; AVX2-NEXT: retq 2255; 2256; AVX512-LABEL: add_v8i64_cast_cond: 2257; AVX512: # %bb.0: 2258; AVX512-NEXT: kmovw %edi, %k1 2259; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 {%k1} 2260; AVX512-NEXT: retq 2261 %b = bitcast i8 %pb to <8 x i1> 2262 %s = select <8 x i1> %b, <8 x i64> %y, <8 x i64> zeroinitializer 2263 %r = add <8 x i64> %x, %s 2264 ret <8 x i64> %r 2265} 2266 2267define <4 x i32> @sub_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef %y) { 2268; SSE-LABEL: sub_v4i32: 2269; SSE: # %bb.0: 2270; SSE-NEXT: pslld $31, %xmm0 2271; SSE-NEXT: psrad $31, %xmm0 2272; SSE-NEXT: pand %xmm2, %xmm0 2273; SSE-NEXT: psubd %xmm0, %xmm1 2274; SSE-NEXT: movdqa %xmm1, %xmm0 2275; SSE-NEXT: retq 2276; 2277; AVX2-LABEL: sub_v4i32: 2278; AVX2: # %bb.0: 2279; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 2280; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 2281; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 2282; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0 2283; AVX2-NEXT: retq 2284; 2285; AVX512F-LABEL: sub_v4i32: 2286; AVX512F: # %bb.0: 2287; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 2288; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 2289; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 2290; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} 2291; AVX512F-NEXT: vpsubd %xmm0, %xmm1, %xmm0 2292; AVX512F-NEXT: vzeroupper 2293; AVX512F-NEXT: retq 2294; 2295; AVX512VL-LABEL: sub_v4i32: 2296; AVX512VL: # %bb.0: 2297; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 2298; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 2299; AVX512VL-NEXT: vpsubd %xmm2, %xmm1, %xmm1 {%k1} 2300; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 2301; AVX512VL-NEXT: retq 2302 %s = select <4 x i1> %b, <4 x i32> %y, <4 x i32> zeroinitializer 2303 %r = sub <4 x i32> %x, %s 2304 ret <4 x i32> %r 2305} 2306 2307; negative test - sub is not commutative; there is no identity constant for operand 0 2308 2309define <8 x i32> @sub_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> noundef %y) { 2310; SSE2-LABEL: sub_v8i32_commute: 2311; SSE2: # %bb.0: 2312; SSE2-NEXT: movdqa %xmm0, %xmm5 2313; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 2314; SSE2-NEXT: pslld $31, %xmm5 2315; SSE2-NEXT: psrad $31, %xmm5 2316; SSE2-NEXT: pand %xmm4, %xmm5 2317; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2318; SSE2-NEXT: pslld $31, %xmm0 2319; SSE2-NEXT: psrad $31, %xmm0 2320; SSE2-NEXT: pand %xmm3, %xmm0 2321; SSE2-NEXT: psubd %xmm1, %xmm0 2322; SSE2-NEXT: psubd %xmm2, %xmm5 2323; SSE2-NEXT: movdqa %xmm5, %xmm1 2324; SSE2-NEXT: retq 2325; 2326; SSE42-LABEL: sub_v8i32_commute: 2327; SSE42: # %bb.0: 2328; SSE42-NEXT: movdqa %xmm0, %xmm5 2329; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 2330; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 2331; SSE42-NEXT: pslld $31, %xmm5 2332; SSE42-NEXT: psrad $31, %xmm5 2333; SSE42-NEXT: pand %xmm4, %xmm5 2334; SSE42-NEXT: pslld $31, %xmm0 2335; SSE42-NEXT: psrad $31, %xmm0 2336; SSE42-NEXT: pand %xmm3, %xmm0 2337; SSE42-NEXT: psubd %xmm1, %xmm0 2338; SSE42-NEXT: psubd %xmm2, %xmm5 2339; SSE42-NEXT: movdqa %xmm5, %xmm1 2340; SSE42-NEXT: retq 2341; 2342; AVX2-LABEL: sub_v8i32_commute: 2343; AVX2: # %bb.0: 2344; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2345; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 2346; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 2347; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2348; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 2349; AVX2-NEXT: retq 2350; 2351; AVX512F-LABEL: sub_v8i32_commute: 2352; AVX512F: # %bb.0: 2353; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 2354; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 2355; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 2356; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 2357; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} 2358; AVX512F-NEXT: vpsubd %ymm1, %ymm0, %ymm0 2359; AVX512F-NEXT: retq 2360; 2361; AVX512VL-LABEL: sub_v8i32_commute: 2362; AVX512VL: # %bb.0: 2363; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 2364; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 2365; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 2366; AVX512VL-NEXT: vmovdqa32 %ymm2, %ymm0 {%k1} {z} 2367; AVX512VL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 2368; AVX512VL-NEXT: retq 2369 %s = select <8 x i1> %b, <8 x i32> %y, <8 x i32> zeroinitializer 2370 %r = sub <8 x i32> %s, %x 2371 ret <8 x i32> %r 2372} 2373 2374define <16 x i32> @sub_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) { 2375; SSE2-LABEL: sub_v16i32_swap: 2376; SSE2: # %bb.0: 2377; SSE2-NEXT: movdqa %xmm0, %xmm9 2378; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2379; SSE2-NEXT: movdqa %xmm9, %xmm8 2380; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] 2381; SSE2-NEXT: pslld $31, %xmm8 2382; SSE2-NEXT: psrad $31, %xmm8 2383; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 2384; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] 2385; SSE2-NEXT: pslld $31, %xmm9 2386; SSE2-NEXT: psrad $31, %xmm9 2387; SSE2-NEXT: pandn %xmm7, %xmm9 2388; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2389; SSE2-NEXT: movdqa %xmm0, %xmm7 2390; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] 2391; SSE2-NEXT: pslld $31, %xmm7 2392; SSE2-NEXT: psrad $31, %xmm7 2393; SSE2-NEXT: pandn %xmm6, %xmm7 2394; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2395; SSE2-NEXT: pslld $31, %xmm0 2396; SSE2-NEXT: psrad $31, %xmm0 2397; SSE2-NEXT: pandn %xmm5, %xmm0 2398; SSE2-NEXT: psubd %xmm0, %xmm1 2399; SSE2-NEXT: psubd %xmm7, %xmm2 2400; SSE2-NEXT: psubd %xmm9, %xmm3 2401; SSE2-NEXT: psubd %xmm8, %xmm4 2402; SSE2-NEXT: movdqa %xmm1, %xmm0 2403; SSE2-NEXT: movdqa %xmm2, %xmm1 2404; SSE2-NEXT: movdqa %xmm3, %xmm2 2405; SSE2-NEXT: movdqa %xmm4, %xmm3 2406; SSE2-NEXT: retq 2407; 2408; SSE42-LABEL: sub_v16i32_swap: 2409; SSE42: # %bb.0: 2410; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] 2411; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero 2412; SSE42-NEXT: pslld $31, %xmm8 2413; SSE42-NEXT: psrad $31, %xmm8 2414; SSE42-NEXT: pandn %xmm7, %xmm8 2415; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] 2416; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero 2417; SSE42-NEXT: pslld $31, %xmm7 2418; SSE42-NEXT: psrad $31, %xmm7 2419; SSE42-NEXT: pandn %xmm6, %xmm7 2420; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2421; SSE42-NEXT: pslld $31, %xmm6 2422; SSE42-NEXT: psrad $31, %xmm6 2423; SSE42-NEXT: pandn %xmm5, %xmm6 2424; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 2425; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2426; SSE42-NEXT: pslld $31, %xmm0 2427; SSE42-NEXT: psrad $31, %xmm0 2428; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0 2429; SSE42-NEXT: psubd %xmm6, %xmm1 2430; SSE42-NEXT: psubd %xmm7, %xmm2 2431; SSE42-NEXT: psubd %xmm8, %xmm3 2432; SSE42-NEXT: psubd %xmm0, %xmm4 2433; SSE42-NEXT: movdqa %xmm1, %xmm0 2434; SSE42-NEXT: movdqa %xmm2, %xmm1 2435; SSE42-NEXT: movdqa %xmm3, %xmm2 2436; SSE42-NEXT: movdqa %xmm4, %xmm3 2437; SSE42-NEXT: retq 2438; 2439; AVX2-LABEL: sub_v16i32_swap: 2440; AVX2: # %bb.0: 2441; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2442; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 2443; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 2444; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 2445; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 2446; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 2447; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 2448; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 2449; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 2450; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0 2451; AVX2-NEXT: vpsubd %ymm4, %ymm2, %ymm1 2452; AVX2-NEXT: retq 2453; 2454; AVX512-LABEL: sub_v16i32_swap: 2455; AVX512: # %bb.0: 2456; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 2457; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 2458; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 2459; AVX512-NEXT: vpsubd %zmm2, %zmm1, %zmm1 {%k1} 2460; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 2461; AVX512-NEXT: retq 2462 %s = select <16 x i1> %b, <16 x i32> zeroinitializer, <16 x i32> %y 2463 %r = sub <16 x i32> %x, %s 2464 ret <16 x i32> %r 2465} 2466 2467; negative test - sub is not commutative; there is no identity constant for operand 0 2468 2469define <16 x i32> @sub_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) { 2470; SSE2-LABEL: sub_v16i32_commute_swap: 2471; SSE2: # %bb.0: 2472; SSE2-NEXT: movdqa %xmm2, %xmm8 2473; SSE2-NEXT: movdqa %xmm0, %xmm2 2474; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2475; SSE2-NEXT: movdqa %xmm2, %xmm9 2476; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] 2477; SSE2-NEXT: pslld $31, %xmm9 2478; SSE2-NEXT: psrad $31, %xmm9 2479; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 2480; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 2481; SSE2-NEXT: pslld $31, %xmm2 2482; SSE2-NEXT: psrad $31, %xmm2 2483; SSE2-NEXT: pandn %xmm7, %xmm2 2484; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2485; SSE2-NEXT: movdqa %xmm0, %xmm7 2486; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] 2487; SSE2-NEXT: pslld $31, %xmm7 2488; SSE2-NEXT: psrad $31, %xmm7 2489; SSE2-NEXT: pandn %xmm6, %xmm7 2490; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2491; SSE2-NEXT: pslld $31, %xmm0 2492; SSE2-NEXT: psrad $31, %xmm0 2493; SSE2-NEXT: pandn %xmm5, %xmm0 2494; SSE2-NEXT: psubd %xmm1, %xmm0 2495; SSE2-NEXT: psubd %xmm8, %xmm7 2496; SSE2-NEXT: psubd %xmm3, %xmm2 2497; SSE2-NEXT: psubd %xmm4, %xmm9 2498; SSE2-NEXT: movdqa %xmm7, %xmm1 2499; SSE2-NEXT: movdqa %xmm9, %xmm3 2500; SSE2-NEXT: retq 2501; 2502; SSE42-LABEL: sub_v16i32_commute_swap: 2503; SSE42: # %bb.0: 2504; SSE42-NEXT: movdqa %xmm2, %xmm8 2505; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 2506; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 2507; SSE42-NEXT: pslld $31, %xmm2 2508; SSE42-NEXT: psrad $31, %xmm2 2509; SSE42-NEXT: pandn %xmm7, %xmm2 2510; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] 2511; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero 2512; SSE42-NEXT: pslld $31, %xmm7 2513; SSE42-NEXT: psrad $31, %xmm7 2514; SSE42-NEXT: pandn %xmm6, %xmm7 2515; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2516; SSE42-NEXT: pslld $31, %xmm6 2517; SSE42-NEXT: psrad $31, %xmm6 2518; SSE42-NEXT: pandn %xmm5, %xmm6 2519; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 2520; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2521; SSE42-NEXT: pslld $31, %xmm5 2522; SSE42-NEXT: psrad $31, %xmm5 2523; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 2524; SSE42-NEXT: psubd %xmm1, %xmm6 2525; SSE42-NEXT: psubd %xmm8, %xmm7 2526; SSE42-NEXT: psubd %xmm3, %xmm2 2527; SSE42-NEXT: psubd %xmm4, %xmm5 2528; SSE42-NEXT: movdqa %xmm6, %xmm0 2529; SSE42-NEXT: movdqa %xmm7, %xmm1 2530; SSE42-NEXT: movdqa %xmm5, %xmm3 2531; SSE42-NEXT: retq 2532; 2533; AVX2-LABEL: sub_v16i32_commute_swap: 2534; AVX2: # %bb.0: 2535; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2536; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 2537; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 2538; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 2539; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 2540; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 2541; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 2542; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 2543; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 2544; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 2545; AVX2-NEXT: vpsubd %ymm2, %ymm4, %ymm1 2546; AVX2-NEXT: retq 2547; 2548; AVX512-LABEL: sub_v16i32_commute_swap: 2549; AVX512: # %bb.0: 2550; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 2551; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 2552; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 2553; AVX512-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} 2554; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0 2555; AVX512-NEXT: retq 2556 %s = select <16 x i1> %b, <16 x i32> zeroinitializer, <16 x i32> %y 2557 %r = sub <16 x i32> %s, %x 2558 ret <16 x i32> %r 2559} 2560 2561define <8 x i32> @sub_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef %x, <8 x i32> noundef %y) { 2562; SSE2-LABEL: sub_v8i32_cast_cond: 2563; SSE2: # %bb.0: 2564; SSE2-NEXT: movd %edi, %xmm4 2565; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 2566; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] 2567; SSE2-NEXT: movdqa %xmm4, %xmm6 2568; SSE2-NEXT: pand %xmm5, %xmm6 2569; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 2570; SSE2-NEXT: pand %xmm3, %xmm6 2571; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] 2572; SSE2-NEXT: pand %xmm3, %xmm4 2573; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 2574; SSE2-NEXT: pand %xmm2, %xmm4 2575; SSE2-NEXT: psubd %xmm4, %xmm0 2576; SSE2-NEXT: psubd %xmm6, %xmm1 2577; SSE2-NEXT: retq 2578; 2579; SSE42-LABEL: sub_v8i32_cast_cond: 2580; SSE42: # %bb.0: 2581; SSE42-NEXT: movd %edi, %xmm4 2582; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 2583; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = [16,32,64,128] 2584; SSE42-NEXT: movdqa %xmm4, %xmm6 2585; SSE42-NEXT: pand %xmm5, %xmm6 2586; SSE42-NEXT: pcmpeqd %xmm5, %xmm6 2587; SSE42-NEXT: pand %xmm3, %xmm6 2588; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] 2589; SSE42-NEXT: pand %xmm3, %xmm4 2590; SSE42-NEXT: pcmpeqd %xmm3, %xmm4 2591; SSE42-NEXT: pand %xmm2, %xmm4 2592; SSE42-NEXT: psubd %xmm4, %xmm0 2593; SSE42-NEXT: psubd %xmm6, %xmm1 2594; SSE42-NEXT: retq 2595; 2596; AVX2-LABEL: sub_v8i32_cast_cond: 2597; AVX2: # %bb.0: 2598; AVX2-NEXT: vmovd %edi, %xmm2 2599; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 2600; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] 2601; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 2602; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 2603; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 2604; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 2605; AVX2-NEXT: retq 2606; 2607; AVX512F-LABEL: sub_v8i32_cast_cond: 2608; AVX512F: # %bb.0: 2609; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2610; AVX512F-NEXT: kmovw %edi, %k1 2611; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z} 2612; AVX512F-NEXT: vpsubd %ymm1, %ymm0, %ymm0 2613; AVX512F-NEXT: retq 2614; 2615; AVX512VL-LABEL: sub_v8i32_cast_cond: 2616; AVX512VL: # %bb.0: 2617; AVX512VL-NEXT: kmovw %edi, %k1 2618; AVX512VL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 {%k1} 2619; AVX512VL-NEXT: retq 2620 %b = bitcast i8 %pb to <8 x i1> 2621 %s = select <8 x i1> %b, <8 x i32> %y, <8 x i32> zeroinitializer 2622 %r = sub <8 x i32> %x, %s 2623 ret <8 x i32> %r 2624} 2625 2626define <8 x i64> @sub_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) { 2627; SSE2-LABEL: sub_v8i64_cast_cond: 2628; SSE2: # %bb.0: 2629; SSE2-NEXT: movd %edi, %xmm8 2630; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] 2631; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,128] 2632; SSE2-NEXT: movdqa %xmm9, %xmm8 2633; SSE2-NEXT: pand %xmm10, %xmm8 2634; SSE2-NEXT: pcmpeqd %xmm10, %xmm8 2635; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,0,3,2] 2636; SSE2-NEXT: pand %xmm7, %xmm8 2637; SSE2-NEXT: pand %xmm10, %xmm8 2638; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32] 2639; SSE2-NEXT: movdqa %xmm9, %xmm7 2640; SSE2-NEXT: pand %xmm10, %xmm7 2641; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 2642; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2] 2643; SSE2-NEXT: pand %xmm6, %xmm7 2644; SSE2-NEXT: pand %xmm10, %xmm7 2645; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] 2646; SSE2-NEXT: movdqa %xmm9, %xmm10 2647; SSE2-NEXT: pand %xmm6, %xmm10 2648; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 2649; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2] 2650; SSE2-NEXT: pand %xmm5, %xmm10 2651; SSE2-NEXT: pand %xmm6, %xmm10 2652; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] 2653; SSE2-NEXT: pand %xmm5, %xmm9 2654; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 2655; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2] 2656; SSE2-NEXT: pand %xmm4, %xmm9 2657; SSE2-NEXT: pand %xmm5, %xmm9 2658; SSE2-NEXT: psubq %xmm9, %xmm0 2659; SSE2-NEXT: psubq %xmm10, %xmm1 2660; SSE2-NEXT: psubq %xmm7, %xmm2 2661; SSE2-NEXT: psubq %xmm8, %xmm3 2662; SSE2-NEXT: retq 2663; 2664; SSE42-LABEL: sub_v8i64_cast_cond: 2665; SSE42: # %bb.0: 2666; SSE42-NEXT: movd %edi, %xmm8 2667; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] 2668; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] 2669; SSE42-NEXT: movdqa %xmm9, %xmm8 2670; SSE42-NEXT: pand %xmm10, %xmm8 2671; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 2672; SSE42-NEXT: pand %xmm7, %xmm8 2673; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] 2674; SSE42-NEXT: movdqa %xmm9, %xmm10 2675; SSE42-NEXT: pand %xmm7, %xmm10 2676; SSE42-NEXT: pcmpeqq %xmm7, %xmm10 2677; SSE42-NEXT: pand %xmm6, %xmm10 2678; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] 2679; SSE42-NEXT: movdqa %xmm9, %xmm7 2680; SSE42-NEXT: pand %xmm6, %xmm7 2681; SSE42-NEXT: pcmpeqq %xmm6, %xmm7 2682; SSE42-NEXT: pand %xmm5, %xmm7 2683; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] 2684; SSE42-NEXT: pand %xmm5, %xmm9 2685; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 2686; SSE42-NEXT: pand %xmm4, %xmm9 2687; SSE42-NEXT: psubq %xmm9, %xmm0 2688; SSE42-NEXT: psubq %xmm7, %xmm1 2689; SSE42-NEXT: psubq %xmm10, %xmm2 2690; SSE42-NEXT: psubq %xmm8, %xmm3 2691; SSE42-NEXT: retq 2692; 2693; AVX2-LABEL: sub_v8i64_cast_cond: 2694; AVX2: # %bb.0: 2695; AVX2-NEXT: vmovd %edi, %xmm4 2696; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 2697; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] 2698; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 2699; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 2700; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 2701; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] 2702; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 2703; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 2704; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 2705; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 2706; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1 2707; AVX2-NEXT: retq 2708; 2709; AVX512-LABEL: sub_v8i64_cast_cond: 2710; AVX512: # %bb.0: 2711; AVX512-NEXT: kmovw %edi, %k1 2712; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} 2713; AVX512-NEXT: retq 2714 %b = bitcast i8 %pb to <8 x i1> 2715 %s = select <8 x i1> %b, <8 x i64> %y, <8 x i64> zeroinitializer 2716 %r = sub <8 x i64> %x, %s 2717 ret <8 x i64> %r 2718} 2719 2720define <4 x i32> @mul_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef %y) { 2721; SSE2-LABEL: mul_v4i32: 2722; SSE2: # %bb.0: 2723; SSE2-NEXT: pslld $31, %xmm0 2724; SSE2-NEXT: psrad $31, %xmm0 2725; SSE2-NEXT: pand %xmm0, %xmm2 2726; SSE2-NEXT: paddd %xmm0, %xmm2 2727; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 2728; SSE2-NEXT: psubd %xmm0, %xmm2 2729; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 2730; SSE2-NEXT: pmuludq %xmm2, %xmm1 2731; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 2732; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 2733; SSE2-NEXT: pmuludq %xmm3, %xmm1 2734; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2735; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2736; SSE2-NEXT: retq 2737; 2738; SSE42-LABEL: mul_v4i32: 2739; SSE42: # %bb.0: 2740; SSE42-NEXT: pslld $31, %xmm0 2741; SSE42-NEXT: movaps {{.*#+}} xmm3 = [1,1,1,1] 2742; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm3 2743; SSE42-NEXT: pmulld %xmm1, %xmm3 2744; SSE42-NEXT: movdqa %xmm3, %xmm0 2745; SSE42-NEXT: retq 2746; 2747; AVX2-LABEL: mul_v4i32: 2748; AVX2: # %bb.0: 2749; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 2750; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] 2751; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 2752; AVX2-NEXT: vpmulld %xmm0, %xmm1, %xmm0 2753; AVX2-NEXT: retq 2754; 2755; AVX512F-LABEL: mul_v4i32: 2756; AVX512F: # %bb.0: 2757; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 2758; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 2759; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 2760; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1] 2761; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} 2762; AVX512F-NEXT: vpmulld %xmm0, %xmm1, %xmm0 2763; AVX512F-NEXT: vzeroupper 2764; AVX512F-NEXT: retq 2765; 2766; AVX512VL-LABEL: mul_v4i32: 2767; AVX512VL: # %bb.0: 2768; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 2769; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 2770; AVX512VL-NEXT: vpmulld %xmm2, %xmm1, %xmm1 {%k1} 2771; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 2772; AVX512VL-NEXT: retq 2773 %s = select <4 x i1> %b, <4 x i32> %y, <4 x i32> <i32 1, i32 1, i32 1, i32 1> 2774 %r = mul <4 x i32> %x, %s 2775 ret <4 x i32> %r 2776} 2777 2778define <8 x i32> @mul_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> noundef %y) { 2779; SSE2-LABEL: mul_v8i32_commute: 2780; SSE2: # %bb.0: 2781; SSE2-NEXT: movdqa %xmm0, %xmm5 2782; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 2783; SSE2-NEXT: pslld $31, %xmm5 2784; SSE2-NEXT: psrad $31, %xmm5 2785; SSE2-NEXT: pand %xmm5, %xmm4 2786; SSE2-NEXT: paddd %xmm5, %xmm4 2787; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 2788; SSE2-NEXT: psubd %xmm5, %xmm4 2789; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 2790; SSE2-NEXT: pslld $31, %xmm0 2791; SSE2-NEXT: psrad $31, %xmm0 2792; SSE2-NEXT: pand %xmm0, %xmm3 2793; SSE2-NEXT: paddd %xmm0, %xmm3 2794; SSE2-NEXT: psubd %xmm5, %xmm3 2795; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 2796; SSE2-NEXT: pmuludq %xmm1, %xmm3 2797; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 2798; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 2799; SSE2-NEXT: pmuludq %xmm5, %xmm1 2800; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2801; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2802; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] 2803; SSE2-NEXT: pmuludq %xmm2, %xmm4 2804; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] 2805; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 2806; SSE2-NEXT: pmuludq %xmm3, %xmm2 2807; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2808; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2809; SSE2-NEXT: retq 2810; 2811; SSE42-LABEL: mul_v8i32_commute: 2812; SSE42: # %bb.0: 2813; SSE42-NEXT: movdqa %xmm0, %xmm5 2814; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 2815; SSE42-NEXT: pslld $31, %xmm0 2816; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1,1,1,1] 2817; SSE42-NEXT: movaps %xmm6, %xmm7 2818; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7 2819; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 2820; SSE42-NEXT: pslld $31, %xmm5 2821; SSE42-NEXT: movdqa %xmm5, %xmm0 2822; SSE42-NEXT: blendvps %xmm0, %xmm4, %xmm6 2823; SSE42-NEXT: pmulld %xmm1, %xmm7 2824; SSE42-NEXT: pmulld %xmm2, %xmm6 2825; SSE42-NEXT: movdqa %xmm7, %xmm0 2826; SSE42-NEXT: movdqa %xmm6, %xmm1 2827; SSE42-NEXT: retq 2828; 2829; AVX2-LABEL: mul_v8i32_commute: 2830; AVX2: # %bb.0: 2831; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2832; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 2833; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] 2834; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm3, %ymm0 2835; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 2836; AVX2-NEXT: retq 2837; 2838; AVX512F-LABEL: mul_v8i32_commute: 2839; AVX512F: # %bb.0: 2840; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 2841; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 2842; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 2843; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 2844; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] 2845; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} 2846; AVX512F-NEXT: vpmulld %ymm1, %ymm0, %ymm0 2847; AVX512F-NEXT: retq 2848; 2849; AVX512VL-LABEL: mul_v8i32_commute: 2850; AVX512VL: # %bb.0: 2851; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 2852; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 2853; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 2854; AVX512VL-NEXT: vpmulld %ymm2, %ymm1, %ymm1 {%k1} 2855; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 2856; AVX512VL-NEXT: retq 2857 %s = select <8 x i1> %b, <8 x i32> %y, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2858 %r = mul <8 x i32> %s, %x 2859 ret <8 x i32> %r 2860} 2861 2862define <8 x i32> @mul_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef %x, <8 x i32> noundef %y) { 2863; SSE2-LABEL: mul_v8i32_cast_cond: 2864; SSE2: # %bb.0: 2865; SSE2-NEXT: movd %edi, %xmm4 2866; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 2867; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128] 2868; SSE2-NEXT: movdqa %xmm4, %xmm6 2869; SSE2-NEXT: pand %xmm5, %xmm6 2870; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 2871; SSE2-NEXT: pand %xmm6, %xmm3 2872; SSE2-NEXT: paddd %xmm6, %xmm3 2873; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 2874; SSE2-NEXT: psubd %xmm5, %xmm3 2875; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,2,4,8] 2876; SSE2-NEXT: pand %xmm6, %xmm4 2877; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 2878; SSE2-NEXT: pand %xmm4, %xmm2 2879; SSE2-NEXT: paddd %xmm4, %xmm2 2880; SSE2-NEXT: psubd %xmm5, %xmm2 2881; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 2882; SSE2-NEXT: pmuludq %xmm2, %xmm0 2883; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2884; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 2885; SSE2-NEXT: pmuludq %xmm4, %xmm2 2886; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2887; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2888; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 2889; SSE2-NEXT: pmuludq %xmm3, %xmm1 2890; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2891; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2892; SSE2-NEXT: pmuludq %xmm2, %xmm3 2893; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] 2894; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 2895; SSE2-NEXT: retq 2896; 2897; SSE42-LABEL: mul_v8i32_cast_cond: 2898; SSE42: # %bb.0: 2899; SSE42-NEXT: movdqa %xmm0, %xmm4 2900; SSE42-NEXT: movd %edi, %xmm0 2901; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0] 2902; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = [16,32,64,128] 2903; SSE42-NEXT: movdqa %xmm5, %xmm0 2904; SSE42-NEXT: pand %xmm6, %xmm0 2905; SSE42-NEXT: pcmpeqd %xmm6, %xmm0 2906; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1,1,1,1] 2907; SSE42-NEXT: movaps %xmm6, %xmm7 2908; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7 2909; SSE42-NEXT: pmovsxbd {{.*#+}} xmm0 = [1,2,4,8] 2910; SSE42-NEXT: pand %xmm0, %xmm5 2911; SSE42-NEXT: pcmpeqd %xmm0, %xmm5 2912; SSE42-NEXT: movdqa %xmm5, %xmm0 2913; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm6 2914; SSE42-NEXT: pmulld %xmm4, %xmm6 2915; SSE42-NEXT: pmulld %xmm7, %xmm1 2916; SSE42-NEXT: movdqa %xmm6, %xmm0 2917; SSE42-NEXT: retq 2918; 2919; AVX2-LABEL: mul_v8i32_cast_cond: 2920; AVX2: # %bb.0: 2921; AVX2-NEXT: vmovd %edi, %xmm2 2922; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 2923; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] 2924; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 2925; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 2926; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] 2927; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm3, %ymm1 2928; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 2929; AVX2-NEXT: retq 2930; 2931; AVX512F-LABEL: mul_v8i32_cast_cond: 2932; AVX512F: # %bb.0: 2933; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2934; AVX512F-NEXT: kmovw %edi, %k1 2935; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] 2936; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} 2937; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0 2938; AVX512F-NEXT: retq 2939; 2940; AVX512VL-LABEL: mul_v8i32_cast_cond: 2941; AVX512VL: # %bb.0: 2942; AVX512VL-NEXT: kmovw %edi, %k1 2943; AVX512VL-NEXT: vpmulld %ymm1, %ymm0, %ymm0 {%k1} 2944; AVX512VL-NEXT: retq 2945 %b = bitcast i8 %pb to <8 x i1> 2946 %s = select <8 x i1> %b, <8 x i32> %y, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2947 %r = mul <8 x i32> %x, %s 2948 ret <8 x i32> %r 2949} 2950 2951define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) { 2952; SSE2-LABEL: mul_v8i64_cast_cond: 2953; SSE2: # %bb.0: 2954; SSE2-NEXT: movd %edi, %xmm8 2955; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,1,0,1] 2956; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [64,128] 2957; SSE2-NEXT: movdqa %xmm10, %xmm9 2958; SSE2-NEXT: pand %xmm8, %xmm9 2959; SSE2-NEXT: pcmpeqd %xmm8, %xmm9 2960; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm9[1,0,3,2] 2961; SSE2-NEXT: pand %xmm9, %xmm8 2962; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [1,1] 2963; SSE2-NEXT: pand %xmm8, %xmm7 2964; SSE2-NEXT: pandn %xmm11, %xmm8 2965; SSE2-NEXT: por %xmm7, %xmm8 2966; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [16,32] 2967; SSE2-NEXT: movdqa %xmm10, %xmm12 2968; SSE2-NEXT: pand %xmm9, %xmm12 2969; SSE2-NEXT: pcmpeqd %xmm9, %xmm12 2970; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,0,3,2] 2971; SSE2-NEXT: pand %xmm12, %xmm9 2972; SSE2-NEXT: pand %xmm9, %xmm6 2973; SSE2-NEXT: pandn %xmm11, %xmm9 2974; SSE2-NEXT: por %xmm6, %xmm9 2975; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [4,8] 2976; SSE2-NEXT: movdqa %xmm10, %xmm13 2977; SSE2-NEXT: pand %xmm12, %xmm13 2978; SSE2-NEXT: pcmpeqd %xmm12, %xmm13 2979; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm13[1,0,3,2] 2980; SSE2-NEXT: pand %xmm13, %xmm12 2981; SSE2-NEXT: pand %xmm12, %xmm5 2982; SSE2-NEXT: pandn %xmm11, %xmm12 2983; SSE2-NEXT: por %xmm5, %xmm12 2984; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [1,2] 2985; SSE2-NEXT: pand %xmm13, %xmm10 2986; SSE2-NEXT: pcmpeqd %xmm13, %xmm10 2987; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm10[1,0,3,2] 2988; SSE2-NEXT: pand %xmm10, %xmm13 2989; SSE2-NEXT: pand %xmm13, %xmm4 2990; SSE2-NEXT: pandn %xmm11, %xmm13 2991; SSE2-NEXT: por %xmm4, %xmm13 2992; SSE2-NEXT: movdqa %xmm0, %xmm10 2993; SSE2-NEXT: pmuludq %xmm13, %xmm10 2994; SSE2-NEXT: movdqa %xmm0, %xmm11 2995; SSE2-NEXT: psrlq $32, %xmm11 2996; SSE2-NEXT: pmuludq %xmm13, %xmm11 2997; SSE2-NEXT: psrlq $32, %xmm4 2998; SSE2-NEXT: pmuludq %xmm4, %xmm0 2999; SSE2-NEXT: paddq %xmm11, %xmm0 3000; SSE2-NEXT: psllq $32, %xmm0 3001; SSE2-NEXT: paddq %xmm10, %xmm0 3002; SSE2-NEXT: movdqa %xmm1, %xmm4 3003; SSE2-NEXT: pmuludq %xmm12, %xmm4 3004; SSE2-NEXT: movdqa %xmm1, %xmm10 3005; SSE2-NEXT: psrlq $32, %xmm10 3006; SSE2-NEXT: pmuludq %xmm12, %xmm10 3007; SSE2-NEXT: psrlq $32, %xmm5 3008; SSE2-NEXT: pmuludq %xmm5, %xmm1 3009; SSE2-NEXT: paddq %xmm10, %xmm1 3010; SSE2-NEXT: psllq $32, %xmm1 3011; SSE2-NEXT: paddq %xmm4, %xmm1 3012; SSE2-NEXT: movdqa %xmm2, %xmm4 3013; SSE2-NEXT: pmuludq %xmm9, %xmm4 3014; SSE2-NEXT: movdqa %xmm2, %xmm5 3015; SSE2-NEXT: psrlq $32, %xmm5 3016; SSE2-NEXT: pmuludq %xmm9, %xmm5 3017; SSE2-NEXT: psrlq $32, %xmm6 3018; SSE2-NEXT: pmuludq %xmm6, %xmm2 3019; SSE2-NEXT: paddq %xmm5, %xmm2 3020; SSE2-NEXT: psllq $32, %xmm2 3021; SSE2-NEXT: paddq %xmm4, %xmm2 3022; SSE2-NEXT: movdqa %xmm3, %xmm4 3023; SSE2-NEXT: pmuludq %xmm8, %xmm4 3024; SSE2-NEXT: movdqa %xmm3, %xmm5 3025; SSE2-NEXT: psrlq $32, %xmm5 3026; SSE2-NEXT: pmuludq %xmm8, %xmm5 3027; SSE2-NEXT: psrlq $32, %xmm7 3028; SSE2-NEXT: pmuludq %xmm7, %xmm3 3029; SSE2-NEXT: paddq %xmm5, %xmm3 3030; SSE2-NEXT: psllq $32, %xmm3 3031; SSE2-NEXT: paddq %xmm4, %xmm3 3032; SSE2-NEXT: retq 3033; 3034; SSE42-LABEL: mul_v8i64_cast_cond: 3035; SSE42: # %bb.0: 3036; SSE42-NEXT: movdqa %xmm0, %xmm9 3037; SSE42-NEXT: movd %edi, %xmm0 3038; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,1] 3039; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] 3040; SSE42-NEXT: movdqa %xmm8, %xmm0 3041; SSE42-NEXT: pand %xmm10, %xmm0 3042; SSE42-NEXT: pcmpeqq %xmm10, %xmm0 3043; SSE42-NEXT: movapd {{.*#+}} xmm10 = [1,1] 3044; SSE42-NEXT: movapd %xmm10, %xmm11 3045; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11 3046; SSE42-NEXT: pmovsxbq {{.*#+}} xmm7 = [16,32] 3047; SSE42-NEXT: movdqa %xmm8, %xmm0 3048; SSE42-NEXT: pand %xmm7, %xmm0 3049; SSE42-NEXT: pcmpeqq %xmm7, %xmm0 3050; SSE42-NEXT: movapd %xmm10, %xmm7 3051; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 3052; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] 3053; SSE42-NEXT: movdqa %xmm8, %xmm0 3054; SSE42-NEXT: pand %xmm6, %xmm0 3055; SSE42-NEXT: pcmpeqq %xmm6, %xmm0 3056; SSE42-NEXT: movapd %xmm10, %xmm6 3057; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6 3058; SSE42-NEXT: pmovsxbq {{.*#+}} xmm0 = [1,2] 3059; SSE42-NEXT: pand %xmm0, %xmm8 3060; SSE42-NEXT: pcmpeqq %xmm0, %xmm8 3061; SSE42-NEXT: movdqa %xmm8, %xmm0 3062; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm10 3063; SSE42-NEXT: movdqa %xmm9, %xmm0 3064; SSE42-NEXT: psrlq $32, %xmm0 3065; SSE42-NEXT: pmuludq %xmm10, %xmm0 3066; SSE42-NEXT: movdqa %xmm10, %xmm4 3067; SSE42-NEXT: psrlq $32, %xmm4 3068; SSE42-NEXT: pmuludq %xmm9, %xmm4 3069; SSE42-NEXT: paddq %xmm0, %xmm4 3070; SSE42-NEXT: psllq $32, %xmm4 3071; SSE42-NEXT: pmuludq %xmm9, %xmm10 3072; SSE42-NEXT: paddq %xmm4, %xmm10 3073; SSE42-NEXT: movdqa %xmm1, %xmm0 3074; SSE42-NEXT: psrlq $32, %xmm0 3075; SSE42-NEXT: pmuludq %xmm6, %xmm0 3076; SSE42-NEXT: movdqa %xmm6, %xmm4 3077; SSE42-NEXT: psrlq $32, %xmm4 3078; SSE42-NEXT: pmuludq %xmm1, %xmm4 3079; SSE42-NEXT: paddq %xmm0, %xmm4 3080; SSE42-NEXT: psllq $32, %xmm4 3081; SSE42-NEXT: pmuludq %xmm6, %xmm1 3082; SSE42-NEXT: paddq %xmm4, %xmm1 3083; SSE42-NEXT: movdqa %xmm2, %xmm0 3084; SSE42-NEXT: psrlq $32, %xmm0 3085; SSE42-NEXT: pmuludq %xmm7, %xmm0 3086; SSE42-NEXT: movdqa %xmm7, %xmm4 3087; SSE42-NEXT: psrlq $32, %xmm4 3088; SSE42-NEXT: pmuludq %xmm2, %xmm4 3089; SSE42-NEXT: paddq %xmm0, %xmm4 3090; SSE42-NEXT: psllq $32, %xmm4 3091; SSE42-NEXT: pmuludq %xmm7, %xmm2 3092; SSE42-NEXT: paddq %xmm4, %xmm2 3093; SSE42-NEXT: movdqa %xmm3, %xmm0 3094; SSE42-NEXT: psrlq $32, %xmm0 3095; SSE42-NEXT: pmuludq %xmm11, %xmm0 3096; SSE42-NEXT: movdqa %xmm11, %xmm4 3097; SSE42-NEXT: psrlq $32, %xmm4 3098; SSE42-NEXT: pmuludq %xmm3, %xmm4 3099; SSE42-NEXT: paddq %xmm0, %xmm4 3100; SSE42-NEXT: psllq $32, %xmm4 3101; SSE42-NEXT: pmuludq %xmm11, %xmm3 3102; SSE42-NEXT: paddq %xmm4, %xmm3 3103; SSE42-NEXT: movdqa %xmm10, %xmm0 3104; SSE42-NEXT: retq 3105; 3106; AVX2-LABEL: mul_v8i64_cast_cond: 3107; AVX2: # %bb.0: 3108; AVX2-NEXT: vmovd %edi, %xmm4 3109; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 3110; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] 3111; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 3112; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 3113; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1,1,1,1] 3114; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm6, %ymm3 3115; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] 3116; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 3117; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 3118; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 3119; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm4 3120; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm4 3121; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm5 3122; AVX2-NEXT: vpmuludq %ymm5, %ymm0, %ymm5 3123; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 3124; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 3125; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 3126; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0 3127; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm2 3128; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 3129; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm4 3130; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm4 3131; AVX2-NEXT: vpaddq %ymm2, %ymm4, %ymm2 3132; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 3133; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 3134; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 3135; AVX2-NEXT: retq 3136; 3137; AVX512-LABEL: mul_v8i64_cast_cond: 3138; AVX512: # %bb.0: 3139; AVX512-NEXT: kmovw %edi, %k1 3140; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm2 3141; AVX512-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 3142; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm3 3143; AVX512-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 3144; AVX512-NEXT: vpaddq %zmm2, %zmm3, %zmm2 3145; AVX512-NEXT: vpsllq $32, %zmm2, %zmm2 3146; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm1 3147; AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm0 {%k1} 3148; AVX512-NEXT: retq 3149 %b = bitcast i8 %pb to <8 x i1> 3150 %s = select <8 x i1> %b, <8 x i64> %y, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 3151 %r = mul <8 x i64> %x, %s 3152 ret <8 x i64> %r 3153} 3154 3155define <4 x i32> @shl_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef %y) { 3156; SSE2-LABEL: shl_v4i32: 3157; SSE2: # %bb.0: 3158; SSE2-NEXT: pslld $31, %xmm0 3159; SSE2-NEXT: psrad $31, %xmm0 3160; SSE2-NEXT: pand %xmm2, %xmm0 3161; SSE2-NEXT: pslld $23, %xmm0 3162; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3163; SSE2-NEXT: cvttps2dq %xmm0, %xmm2 3164; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 3165; SSE2-NEXT: pmuludq %xmm2, %xmm1 3166; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 3167; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 3168; SSE2-NEXT: pmuludq %xmm3, %xmm1 3169; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3170; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3171; SSE2-NEXT: retq 3172; 3173; SSE42-LABEL: shl_v4i32: 3174; SSE42: # %bb.0: 3175; SSE42-NEXT: pslld $31, %xmm0 3176; SSE42-NEXT: psrad $31, %xmm0 3177; SSE42-NEXT: pand %xmm2, %xmm0 3178; SSE42-NEXT: pslld $23, %xmm0 3179; SSE42-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3180; SSE42-NEXT: cvttps2dq %xmm0, %xmm0 3181; SSE42-NEXT: pmulld %xmm1, %xmm0 3182; SSE42-NEXT: retq 3183; 3184; AVX2-LABEL: shl_v4i32: 3185; AVX2: # %bb.0: 3186; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 3187; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 3188; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 3189; AVX2-NEXT: vpsllvd %xmm0, %xmm1, %xmm0 3190; AVX2-NEXT: retq 3191; 3192; AVX512F-LABEL: shl_v4i32: 3193; AVX512F: # %bb.0: 3194; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 3195; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 3196; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 3197; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} 3198; AVX512F-NEXT: vpsllvd %xmm0, %xmm1, %xmm0 3199; AVX512F-NEXT: vzeroupper 3200; AVX512F-NEXT: retq 3201; 3202; AVX512VL-LABEL: shl_v4i32: 3203; AVX512VL: # %bb.0: 3204; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 3205; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 3206; AVX512VL-NEXT: vpsllvd %xmm2, %xmm1, %xmm1 {%k1} 3207; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 3208; AVX512VL-NEXT: retq 3209 %s = select <4 x i1> %b, <4 x i32> %y, <4 x i32> zeroinitializer 3210 %r = shl <4 x i32> %x, %s 3211 ret <4 x i32> %r 3212} 3213 3214; negative test - shl is not commutative; there is no identity constant for operand 0 3215 3216define <8 x i32> @shl_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> noundef %y) { 3217; SSE2-LABEL: shl_v8i32_commute: 3218; SSE2: # %bb.0: 3219; SSE2-NEXT: movdqa %xmm0, %xmm5 3220; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 3221; SSE2-NEXT: pslld $31, %xmm5 3222; SSE2-NEXT: psrad $31, %xmm5 3223; SSE2-NEXT: pand %xmm4, %xmm5 3224; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 3225; SSE2-NEXT: pslld $31, %xmm0 3226; SSE2-NEXT: psrad $31, %xmm0 3227; SSE2-NEXT: pand %xmm3, %xmm0 3228; SSE2-NEXT: pslld $23, %xmm1 3229; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 3230; SSE2-NEXT: paddd %xmm3, %xmm1 3231; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 3232; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 3233; SSE2-NEXT: pmuludq %xmm1, %xmm0 3234; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 3235; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 3236; SSE2-NEXT: pmuludq %xmm4, %xmm1 3237; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3238; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3239; SSE2-NEXT: pslld $23, %xmm2 3240; SSE2-NEXT: paddd %xmm3, %xmm2 3241; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 3242; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] 3243; SSE2-NEXT: pmuludq %xmm2, %xmm5 3244; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] 3245; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 3246; SSE2-NEXT: pmuludq %xmm3, %xmm2 3247; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 3248; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 3249; SSE2-NEXT: retq 3250; 3251; SSE42-LABEL: shl_v8i32_commute: 3252; SSE42: # %bb.0: 3253; SSE42-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 3254; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 3255; SSE42-NEXT: pslld $31, %xmm0 3256; SSE42-NEXT: psrad $31, %xmm0 3257; SSE42-NEXT: pand %xmm4, %xmm0 3258; SSE42-NEXT: pslld $31, %xmm5 3259; SSE42-NEXT: psrad $31, %xmm5 3260; SSE42-NEXT: pand %xmm3, %xmm5 3261; SSE42-NEXT: pslld $23, %xmm1 3262; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] 3263; SSE42-NEXT: paddd %xmm4, %xmm1 3264; SSE42-NEXT: cvttps2dq %xmm1, %xmm3 3265; SSE42-NEXT: pmulld %xmm5, %xmm3 3266; SSE42-NEXT: pslld $23, %xmm2 3267; SSE42-NEXT: paddd %xmm4, %xmm2 3268; SSE42-NEXT: cvttps2dq %xmm2, %xmm1 3269; SSE42-NEXT: pmulld %xmm0, %xmm1 3270; SSE42-NEXT: movdqa %xmm3, %xmm0 3271; SSE42-NEXT: retq 3272; 3273; AVX2-LABEL: shl_v8i32_commute: 3274; AVX2: # %bb.0: 3275; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 3276; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 3277; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 3278; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 3279; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 3280; AVX2-NEXT: retq 3281; 3282; AVX512F-LABEL: shl_v8i32_commute: 3283; AVX512F: # %bb.0: 3284; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 3285; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 3286; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 3287; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 3288; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} 3289; AVX512F-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 3290; AVX512F-NEXT: retq 3291; 3292; AVX512VL-LABEL: shl_v8i32_commute: 3293; AVX512VL: # %bb.0: 3294; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 3295; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 3296; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 3297; AVX512VL-NEXT: vmovdqa32 %ymm2, %ymm0 {%k1} {z} 3298; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 3299; AVX512VL-NEXT: retq 3300 %s = select <8 x i1> %b, <8 x i32> %y, <8 x i32> zeroinitializer 3301 %r = shl <8 x i32> %s, %x 3302 ret <8 x i32> %r 3303} 3304 3305define <16 x i32> @shl_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) { 3306; SSE2-LABEL: shl_v16i32_swap: 3307; SSE2: # %bb.0: 3308; SSE2-NEXT: movdqa %xmm0, %xmm9 3309; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3310; SSE2-NEXT: movdqa %xmm9, %xmm8 3311; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] 3312; SSE2-NEXT: pslld $31, %xmm8 3313; SSE2-NEXT: psrad $31, %xmm8 3314; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 3315; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] 3316; SSE2-NEXT: pslld $31, %xmm9 3317; SSE2-NEXT: psrad $31, %xmm9 3318; SSE2-NEXT: pandn %xmm7, %xmm9 3319; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 3320; SSE2-NEXT: movdqa %xmm0, %xmm7 3321; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] 3322; SSE2-NEXT: pslld $31, %xmm7 3323; SSE2-NEXT: psrad $31, %xmm7 3324; SSE2-NEXT: pandn %xmm6, %xmm7 3325; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 3326; SSE2-NEXT: pslld $31, %xmm0 3327; SSE2-NEXT: psrad $31, %xmm0 3328; SSE2-NEXT: pandn %xmm5, %xmm0 3329; SSE2-NEXT: pslld $23, %xmm0 3330; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] 3331; SSE2-NEXT: paddd %xmm5, %xmm0 3332; SSE2-NEXT: cvttps2dq %xmm0, %xmm6 3333; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] 3334; SSE2-NEXT: pmuludq %xmm6, %xmm1 3335; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 3336; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] 3337; SSE2-NEXT: pmuludq %xmm10, %xmm1 3338; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3339; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3340; SSE2-NEXT: pslld $23, %xmm7 3341; SSE2-NEXT: paddd %xmm5, %xmm7 3342; SSE2-NEXT: cvttps2dq %xmm7, %xmm6 3343; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] 3344; SSE2-NEXT: pmuludq %xmm6, %xmm2 3345; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] 3346; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] 3347; SSE2-NEXT: pmuludq %xmm7, %xmm2 3348; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 3349; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 3350; SSE2-NEXT: pslld $23, %xmm9 3351; SSE2-NEXT: paddd %xmm5, %xmm9 3352; SSE2-NEXT: cvttps2dq %xmm9, %xmm6 3353; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] 3354; SSE2-NEXT: pmuludq %xmm6, %xmm3 3355; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] 3356; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] 3357; SSE2-NEXT: pmuludq %xmm7, %xmm3 3358; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 3359; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 3360; SSE2-NEXT: pslld $23, %xmm8 3361; SSE2-NEXT: paddd %xmm5, %xmm8 3362; SSE2-NEXT: cvttps2dq %xmm8, %xmm5 3363; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] 3364; SSE2-NEXT: pmuludq %xmm5, %xmm4 3365; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] 3366; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] 3367; SSE2-NEXT: pmuludq %xmm6, %xmm4 3368; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 3369; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 3370; SSE2-NEXT: retq 3371; 3372; SSE42-LABEL: shl_v16i32_swap: 3373; SSE42: # %bb.0: 3374; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] 3375; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero 3376; SSE42-NEXT: pslld $31, %xmm8 3377; SSE42-NEXT: psrad $31, %xmm8 3378; SSE42-NEXT: pandn %xmm7, %xmm8 3379; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] 3380; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero 3381; SSE42-NEXT: pslld $31, %xmm7 3382; SSE42-NEXT: psrad $31, %xmm7 3383; SSE42-NEXT: pandn %xmm6, %xmm7 3384; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 3385; SSE42-NEXT: pslld $31, %xmm6 3386; SSE42-NEXT: psrad $31, %xmm6 3387; SSE42-NEXT: pandn %xmm5, %xmm6 3388; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 3389; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 3390; SSE42-NEXT: pslld $31, %xmm5 3391; SSE42-NEXT: psrad $31, %xmm5 3392; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 3393; SSE42-NEXT: pslld $23, %xmm6 3394; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [1065353216,1065353216,1065353216,1065353216] 3395; SSE42-NEXT: paddd %xmm9, %xmm6 3396; SSE42-NEXT: cvttps2dq %xmm6, %xmm0 3397; SSE42-NEXT: pmulld %xmm1, %xmm0 3398; SSE42-NEXT: pslld $23, %xmm7 3399; SSE42-NEXT: paddd %xmm9, %xmm7 3400; SSE42-NEXT: cvttps2dq %xmm7, %xmm1 3401; SSE42-NEXT: pmulld %xmm2, %xmm1 3402; SSE42-NEXT: pslld $23, %xmm8 3403; SSE42-NEXT: paddd %xmm9, %xmm8 3404; SSE42-NEXT: cvttps2dq %xmm8, %xmm2 3405; SSE42-NEXT: pmulld %xmm3, %xmm2 3406; SSE42-NEXT: pslld $23, %xmm5 3407; SSE42-NEXT: paddd %xmm9, %xmm5 3408; SSE42-NEXT: cvttps2dq %xmm5, %xmm3 3409; SSE42-NEXT: pmulld %xmm4, %xmm3 3410; SSE42-NEXT: retq 3411; 3412; AVX2-LABEL: shl_v16i32_swap: 3413; AVX2: # %bb.0: 3414; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3415; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 3416; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 3417; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 3418; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 3419; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 3420; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 3421; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 3422; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 3423; AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 3424; AVX2-NEXT: vpsllvd %ymm4, %ymm2, %ymm1 3425; AVX2-NEXT: retq 3426; 3427; AVX512-LABEL: shl_v16i32_swap: 3428; AVX512: # %bb.0: 3429; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 3430; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 3431; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 3432; AVX512-NEXT: vpsllvd %zmm2, %zmm1, %zmm1 {%k1} 3433; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 3434; AVX512-NEXT: retq 3435 %s = select <16 x i1> %b, <16 x i32> zeroinitializer, <16 x i32> %y 3436 %r = shl <16 x i32> %x, %s 3437 ret <16 x i32> %r 3438} 3439 3440; negative test - shl is not commutative; there is no identity constant for operand 0 3441 3442define <16 x i32> @shl_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) { 3443; SSE2-LABEL: shl_v16i32_commute_swap: 3444; SSE2: # %bb.0: 3445; SSE2-NEXT: movdqa %xmm0, %xmm9 3446; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3447; SSE2-NEXT: movdqa %xmm9, %xmm8 3448; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7] 3449; SSE2-NEXT: pslld $31, %xmm8 3450; SSE2-NEXT: psrad $31, %xmm8 3451; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8 3452; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3] 3453; SSE2-NEXT: pslld $31, %xmm9 3454; SSE2-NEXT: psrad $31, %xmm9 3455; SSE2-NEXT: pandn %xmm7, %xmm9 3456; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 3457; SSE2-NEXT: movdqa %xmm0, %xmm7 3458; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] 3459; SSE2-NEXT: pslld $31, %xmm7 3460; SSE2-NEXT: psrad $31, %xmm7 3461; SSE2-NEXT: pandn %xmm6, %xmm7 3462; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 3463; SSE2-NEXT: pslld $31, %xmm0 3464; SSE2-NEXT: psrad $31, %xmm0 3465; SSE2-NEXT: pandn %xmm5, %xmm0 3466; SSE2-NEXT: pslld $23, %xmm1 3467; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] 3468; SSE2-NEXT: paddd %xmm5, %xmm1 3469; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 3470; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] 3471; SSE2-NEXT: pmuludq %xmm1, %xmm0 3472; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 3473; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 3474; SSE2-NEXT: pmuludq %xmm6, %xmm1 3475; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3476; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3477; SSE2-NEXT: pslld $23, %xmm2 3478; SSE2-NEXT: paddd %xmm5, %xmm2 3479; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 3480; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] 3481; SSE2-NEXT: pmuludq %xmm2, %xmm7 3482; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] 3483; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 3484; SSE2-NEXT: pmuludq %xmm6, %xmm2 3485; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 3486; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 3487; SSE2-NEXT: pslld $23, %xmm3 3488; SSE2-NEXT: paddd %xmm5, %xmm3 3489; SSE2-NEXT: cvttps2dq %xmm3, %xmm3 3490; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3] 3491; SSE2-NEXT: pmuludq %xmm3, %xmm9 3492; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3] 3493; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 3494; SSE2-NEXT: pmuludq %xmm6, %xmm3 3495; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 3496; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 3497; SSE2-NEXT: pslld $23, %xmm4 3498; SSE2-NEXT: paddd %xmm5, %xmm4 3499; SSE2-NEXT: cvttps2dq %xmm4, %xmm4 3500; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] 3501; SSE2-NEXT: pmuludq %xmm4, %xmm8 3502; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] 3503; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 3504; SSE2-NEXT: pmuludq %xmm5, %xmm4 3505; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 3506; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 3507; SSE2-NEXT: retq 3508; 3509; SSE42-LABEL: shl_v16i32_commute_swap: 3510; SSE42: # %bb.0: 3511; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] 3512; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero 3513; SSE42-NEXT: pslld $31, %xmm8 3514; SSE42-NEXT: psrad $31, %xmm8 3515; SSE42-NEXT: pandn %xmm7, %xmm8 3516; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] 3517; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero 3518; SSE42-NEXT: pslld $31, %xmm7 3519; SSE42-NEXT: psrad $31, %xmm7 3520; SSE42-NEXT: pandn %xmm6, %xmm7 3521; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 3522; SSE42-NEXT: pslld $31, %xmm6 3523; SSE42-NEXT: psrad $31, %xmm6 3524; SSE42-NEXT: pandn %xmm5, %xmm6 3525; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 3526; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 3527; SSE42-NEXT: pslld $31, %xmm5 3528; SSE42-NEXT: psrad $31, %xmm5 3529; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 3530; SSE42-NEXT: pslld $23, %xmm1 3531; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [1065353216,1065353216,1065353216,1065353216] 3532; SSE42-NEXT: paddd %xmm9, %xmm1 3533; SSE42-NEXT: cvttps2dq %xmm1, %xmm0 3534; SSE42-NEXT: pmulld %xmm6, %xmm0 3535; SSE42-NEXT: pslld $23, %xmm2 3536; SSE42-NEXT: paddd %xmm9, %xmm2 3537; SSE42-NEXT: cvttps2dq %xmm2, %xmm1 3538; SSE42-NEXT: pmulld %xmm7, %xmm1 3539; SSE42-NEXT: pslld $23, %xmm3 3540; SSE42-NEXT: paddd %xmm9, %xmm3 3541; SSE42-NEXT: cvttps2dq %xmm3, %xmm2 3542; SSE42-NEXT: pmulld %xmm8, %xmm2 3543; SSE42-NEXT: pslld $23, %xmm4 3544; SSE42-NEXT: paddd %xmm9, %xmm4 3545; SSE42-NEXT: cvttps2dq %xmm4, %xmm3 3546; SSE42-NEXT: pmulld %xmm5, %xmm3 3547; SSE42-NEXT: retq 3548; 3549; AVX2-LABEL: shl_v16i32_commute_swap: 3550; AVX2: # %bb.0: 3551; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3552; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 3553; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 3554; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 3555; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 3556; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 3557; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 3558; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 3559; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 3560; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 3561; AVX2-NEXT: vpsllvd %ymm2, %ymm4, %ymm1 3562; AVX2-NEXT: retq 3563; 3564; AVX512-LABEL: shl_v16i32_commute_swap: 3565; AVX512: # %bb.0: 3566; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 3567; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 3568; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 3569; AVX512-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} 3570; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 3571; AVX512-NEXT: retq 3572 %s = select <16 x i1> %b, <16 x i32> zeroinitializer, <16 x i32> %y 3573 %r = shl <16 x i32> %s, %x 3574 ret <16 x i32> %r 3575} 3576 3577define <8 x i32> @shl_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef %x, <8 x i32> noundef %y) { 3578; SSE2-LABEL: shl_v8i32_cast_cond: 3579; SSE2: # %bb.0: 3580; SSE2-NEXT: movd %edi, %xmm4 3581; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] 3582; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] 3583; SSE2-NEXT: movdqa %xmm5, %xmm4 3584; SSE2-NEXT: pand %xmm6, %xmm4 3585; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 3586; SSE2-NEXT: pand %xmm3, %xmm4 3587; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] 3588; SSE2-NEXT: pand %xmm3, %xmm5 3589; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 3590; SSE2-NEXT: pand %xmm2, %xmm5 3591; SSE2-NEXT: pslld $23, %xmm5 3592; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] 3593; SSE2-NEXT: paddd %xmm2, %xmm5 3594; SSE2-NEXT: cvttps2dq %xmm5, %xmm3 3595; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 3596; SSE2-NEXT: pmuludq %xmm3, %xmm0 3597; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 3598; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 3599; SSE2-NEXT: pmuludq %xmm5, %xmm3 3600; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 3601; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 3602; SSE2-NEXT: pslld $23, %xmm4 3603; SSE2-NEXT: paddd %xmm2, %xmm4 3604; SSE2-NEXT: cvttps2dq %xmm4, %xmm2 3605; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 3606; SSE2-NEXT: pmuludq %xmm2, %xmm1 3607; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3608; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 3609; SSE2-NEXT: pmuludq %xmm3, %xmm2 3610; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 3611; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 3612; SSE2-NEXT: retq 3613; 3614; SSE42-LABEL: shl_v8i32_cast_cond: 3615; SSE42: # %bb.0: 3616; SSE42-NEXT: movd %edi, %xmm4 3617; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] 3618; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = [16,32,64,128] 3619; SSE42-NEXT: movdqa %xmm4, %xmm6 3620; SSE42-NEXT: pand %xmm5, %xmm6 3621; SSE42-NEXT: pcmpeqd %xmm5, %xmm6 3622; SSE42-NEXT: pand %xmm3, %xmm6 3623; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] 3624; SSE42-NEXT: pand %xmm3, %xmm4 3625; SSE42-NEXT: pcmpeqd %xmm3, %xmm4 3626; SSE42-NEXT: pand %xmm2, %xmm4 3627; SSE42-NEXT: pslld $23, %xmm4 3628; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216] 3629; SSE42-NEXT: paddd %xmm2, %xmm4 3630; SSE42-NEXT: cvttps2dq %xmm4, %xmm3 3631; SSE42-NEXT: pmulld %xmm3, %xmm0 3632; SSE42-NEXT: pslld $23, %xmm6 3633; SSE42-NEXT: paddd %xmm2, %xmm6 3634; SSE42-NEXT: cvttps2dq %xmm6, %xmm2 3635; SSE42-NEXT: pmulld %xmm2, %xmm1 3636; SSE42-NEXT: retq 3637; 3638; AVX2-LABEL: shl_v8i32_cast_cond: 3639; AVX2: # %bb.0: 3640; AVX2-NEXT: vmovd %edi, %xmm2 3641; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 3642; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] 3643; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 3644; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 3645; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 3646; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 3647; AVX2-NEXT: retq 3648; 3649; AVX512F-LABEL: shl_v8i32_cast_cond: 3650; AVX512F: # %bb.0: 3651; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 3652; AVX512F-NEXT: kmovw %edi, %k1 3653; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z} 3654; AVX512F-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 3655; AVX512F-NEXT: retq 3656; 3657; AVX512VL-LABEL: shl_v8i32_cast_cond: 3658; AVX512VL: # %bb.0: 3659; AVX512VL-NEXT: kmovw %edi, %k1 3660; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 {%k1} 3661; AVX512VL-NEXT: retq 3662 %b = bitcast i8 %pb to <8 x i1> 3663 %s = select <8 x i1> %b, <8 x i32> %y, <8 x i32> zeroinitializer 3664 %r = shl <8 x i32> %x, %s 3665 ret <8 x i32> %r 3666} 3667 3668define <8 x i64> @shl_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) { 3669; SSE2-LABEL: shl_v8i64_cast_cond: 3670; SSE2: # %bb.0: 3671; SSE2-NEXT: movd %edi, %xmm8 3672; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] 3673; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,128] 3674; SSE2-NEXT: movdqa %xmm9, %xmm8 3675; SSE2-NEXT: pand %xmm10, %xmm8 3676; SSE2-NEXT: pcmpeqd %xmm10, %xmm8 3677; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,0,3,2] 3678; SSE2-NEXT: pand %xmm7, %xmm8 3679; SSE2-NEXT: pand %xmm10, %xmm8 3680; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32] 3681; SSE2-NEXT: movdqa %xmm9, %xmm7 3682; SSE2-NEXT: pand %xmm10, %xmm7 3683; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 3684; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2] 3685; SSE2-NEXT: pand %xmm6, %xmm7 3686; SSE2-NEXT: pand %xmm10, %xmm7 3687; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] 3688; SSE2-NEXT: movdqa %xmm9, %xmm10 3689; SSE2-NEXT: pand %xmm6, %xmm10 3690; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 3691; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2] 3692; SSE2-NEXT: pand %xmm5, %xmm10 3693; SSE2-NEXT: pand %xmm6, %xmm10 3694; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] 3695; SSE2-NEXT: pand %xmm5, %xmm9 3696; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 3697; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2] 3698; SSE2-NEXT: pand %xmm4, %xmm9 3699; SSE2-NEXT: pand %xmm5, %xmm9 3700; SSE2-NEXT: movdqa %xmm0, %xmm4 3701; SSE2-NEXT: psllq %xmm9, %xmm4 3702; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] 3703; SSE2-NEXT: psllq %xmm5, %xmm0 3704; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 3705; SSE2-NEXT: movdqa %xmm1, %xmm4 3706; SSE2-NEXT: psllq %xmm10, %xmm4 3707; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] 3708; SSE2-NEXT: psllq %xmm5, %xmm1 3709; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] 3710; SSE2-NEXT: movdqa %xmm2, %xmm4 3711; SSE2-NEXT: psllq %xmm7, %xmm4 3712; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] 3713; SSE2-NEXT: psllq %xmm5, %xmm2 3714; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] 3715; SSE2-NEXT: movdqa %xmm3, %xmm4 3716; SSE2-NEXT: psllq %xmm8, %xmm4 3717; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] 3718; SSE2-NEXT: psllq %xmm5, %xmm3 3719; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] 3720; SSE2-NEXT: retq 3721; 3722; SSE42-LABEL: shl_v8i64_cast_cond: 3723; SSE42: # %bb.0: 3724; SSE42-NEXT: movd %edi, %xmm8 3725; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] 3726; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] 3727; SSE42-NEXT: movdqa %xmm9, %xmm8 3728; SSE42-NEXT: pand %xmm10, %xmm8 3729; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 3730; SSE42-NEXT: pand %xmm7, %xmm8 3731; SSE42-NEXT: pmovsxbq {{.*#+}} xmm10 = [16,32] 3732; SSE42-NEXT: movdqa %xmm9, %xmm7 3733; SSE42-NEXT: pand %xmm10, %xmm7 3734; SSE42-NEXT: pcmpeqq %xmm10, %xmm7 3735; SSE42-NEXT: pand %xmm6, %xmm7 3736; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] 3737; SSE42-NEXT: movdqa %xmm9, %xmm10 3738; SSE42-NEXT: pand %xmm6, %xmm10 3739; SSE42-NEXT: pcmpeqq %xmm6, %xmm10 3740; SSE42-NEXT: pand %xmm5, %xmm10 3741; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] 3742; SSE42-NEXT: pand %xmm5, %xmm9 3743; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 3744; SSE42-NEXT: pand %xmm4, %xmm9 3745; SSE42-NEXT: movdqa %xmm0, %xmm4 3746; SSE42-NEXT: psllq %xmm9, %xmm4 3747; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] 3748; SSE42-NEXT: psllq %xmm5, %xmm0 3749; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] 3750; SSE42-NEXT: movdqa %xmm1, %xmm4 3751; SSE42-NEXT: psllq %xmm10, %xmm4 3752; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] 3753; SSE42-NEXT: psllq %xmm5, %xmm1 3754; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] 3755; SSE42-NEXT: movdqa %xmm2, %xmm4 3756; SSE42-NEXT: psllq %xmm7, %xmm4 3757; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] 3758; SSE42-NEXT: psllq %xmm5, %xmm2 3759; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] 3760; SSE42-NEXT: movdqa %xmm3, %xmm4 3761; SSE42-NEXT: psllq %xmm8, %xmm4 3762; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] 3763; SSE42-NEXT: psllq %xmm5, %xmm3 3764; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 3765; SSE42-NEXT: retq 3766; 3767; AVX2-LABEL: shl_v8i64_cast_cond: 3768; AVX2: # %bb.0: 3769; AVX2-NEXT: vmovd %edi, %xmm4 3770; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 3771; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] 3772; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 3773; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 3774; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 3775; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] 3776; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 3777; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 3778; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 3779; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 3780; AVX2-NEXT: vpsllvq %ymm3, %ymm1, %ymm1 3781; AVX2-NEXT: retq 3782; 3783; AVX512-LABEL: shl_v8i64_cast_cond: 3784; AVX512: # %bb.0: 3785; AVX512-NEXT: kmovw %edi, %k1 3786; AVX512-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 {%k1} 3787; AVX512-NEXT: retq 3788 %b = bitcast i8 %pb to <8 x i1> 3789 %s = select <8 x i1> %b, <8 x i64> %y, <8 x i64> zeroinitializer 3790 %r = shl <8 x i64> %x, %s 3791 ret <8 x i64> %r 3792} 3793 3794define <4 x i32> @lshr_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef %y) { 3795; SSE2-LABEL: lshr_v4i32: 3796; SSE2: # %bb.0: 3797; SSE2-NEXT: pslld $31, %xmm0 3798; SSE2-NEXT: psrad $31, %xmm0 3799; SSE2-NEXT: pand %xmm2, %xmm0 3800; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,3,3,3,4,5,6,7] 3801; SSE2-NEXT: movdqa %xmm1, %xmm3 3802; SSE2-NEXT: psrld %xmm2, %xmm3 3803; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,1,1,4,5,6,7] 3804; SSE2-NEXT: movdqa %xmm1, %xmm2 3805; SSE2-NEXT: psrld %xmm4, %xmm2 3806; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 3807; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 3808; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,3,3,3,4,5,6,7] 3809; SSE2-NEXT: movdqa %xmm1, %xmm4 3810; SSE2-NEXT: psrld %xmm3, %xmm4 3811; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7] 3812; SSE2-NEXT: psrld %xmm0, %xmm1 3813; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm4[1] 3814; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3] 3815; SSE2-NEXT: movaps %xmm2, %xmm0 3816; SSE2-NEXT: retq 3817; 3818; SSE42-LABEL: lshr_v4i32: 3819; SSE42: # %bb.0: 3820; SSE42-NEXT: pslld $31, %xmm0 3821; SSE42-NEXT: psrad $31, %xmm0 3822; SSE42-NEXT: pand %xmm2, %xmm0 3823; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,3,3,3,4,5,6,7] 3824; SSE42-NEXT: movdqa %xmm1, %xmm3 3825; SSE42-NEXT: psrld %xmm2, %xmm3 3826; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 3827; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] 3828; SSE42-NEXT: movdqa %xmm1, %xmm5 3829; SSE42-NEXT: psrld %xmm4, %xmm5 3830; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] 3831; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,1,1,4,5,6,7] 3832; SSE42-NEXT: movdqa %xmm1, %xmm0 3833; SSE42-NEXT: psrld %xmm3, %xmm0 3834; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] 3835; SSE42-NEXT: psrld %xmm2, %xmm1 3836; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 3837; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] 3838; SSE42-NEXT: retq 3839; 3840; AVX2-LABEL: lshr_v4i32: 3841; AVX2: # %bb.0: 3842; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 3843; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 3844; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 3845; AVX2-NEXT: vpsrlvd %xmm0, %xmm1, %xmm0 3846; AVX2-NEXT: retq 3847; 3848; AVX512F-LABEL: lshr_v4i32: 3849; AVX512F: # %bb.0: 3850; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 3851; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 3852; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 3853; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} 3854; AVX512F-NEXT: vpsrlvd %xmm0, %xmm1, %xmm0 3855; AVX512F-NEXT: vzeroupper 3856; AVX512F-NEXT: retq 3857; 3858; AVX512VL-LABEL: lshr_v4i32: 3859; AVX512VL: # %bb.0: 3860; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 3861; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 3862; AVX512VL-NEXT: vpsrlvd %xmm2, %xmm1, %xmm1 {%k1} 3863; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 3864; AVX512VL-NEXT: retq 3865 %s = select <4 x i1> %b, <4 x i32> %y, <4 x i32> zeroinitializer 3866 %r = lshr <4 x i32> %x, %s 3867 ret <4 x i32> %r 3868} 3869 3870; negative test - lshr is not commutative; there is no identity constant for operand 0 3871 3872define <8 x i32> @lshr_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> noundef %y) { 3873; SSE2-LABEL: lshr_v8i32_commute: 3874; SSE2: # %bb.0: 3875; SSE2-NEXT: movdqa %xmm1, %xmm5 3876; SSE2-NEXT: movdqa %xmm0, %xmm1 3877; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 3878; SSE2-NEXT: pslld $31, %xmm1 3879; SSE2-NEXT: psrad $31, %xmm1 3880; SSE2-NEXT: pand %xmm4, %xmm1 3881; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 3882; SSE2-NEXT: pslld $31, %xmm0 3883; SSE2-NEXT: psrad $31, %xmm0 3884; SSE2-NEXT: pand %xmm3, %xmm0 3885; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] 3886; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] 3887; SSE2-NEXT: movdqa %xmm0, %xmm6 3888; SSE2-NEXT: psrld %xmm4, %xmm6 3889; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] 3890; SSE2-NEXT: movdqa %xmm0, %xmm4 3891; SSE2-NEXT: psrld %xmm3, %xmm4 3892; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm6[1] 3893; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] 3894; SSE2-NEXT: movdqa %xmm0, %xmm6 3895; SSE2-NEXT: psrld %xmm3, %xmm6 3896; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7] 3897; SSE2-NEXT: psrld %xmm3, %xmm0 3898; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] 3899; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[0,3] 3900; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] 3901; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] 3902; SSE2-NEXT: movdqa %xmm1, %xmm5 3903; SSE2-NEXT: psrld %xmm4, %xmm5 3904; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] 3905; SSE2-NEXT: movdqa %xmm1, %xmm4 3906; SSE2-NEXT: psrld %xmm3, %xmm4 3907; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] 3908; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] 3909; SSE2-NEXT: movdqa %xmm1, %xmm5 3910; SSE2-NEXT: psrld %xmm3, %xmm5 3911; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] 3912; SSE2-NEXT: psrld %xmm2, %xmm1 3913; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] 3914; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm4[0,3] 3915; SSE2-NEXT: retq 3916; 3917; SSE42-LABEL: lshr_v8i32_commute: 3918; SSE42: # %bb.0: 3919; SSE42-NEXT: movdqa %xmm0, %xmm5 3920; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 3921; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 3922; SSE42-NEXT: pslld $31, %xmm5 3923; SSE42-NEXT: psrad $31, %xmm5 3924; SSE42-NEXT: pand %xmm4, %xmm5 3925; SSE42-NEXT: pslld $31, %xmm0 3926; SSE42-NEXT: psrad $31, %xmm0 3927; SSE42-NEXT: pand %xmm3, %xmm0 3928; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 3929; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] 3930; SSE42-NEXT: movdqa %xmm0, %xmm6 3931; SSE42-NEXT: psrld %xmm4, %xmm6 3932; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] 3933; SSE42-NEXT: movdqa %xmm0, %xmm7 3934; SSE42-NEXT: psrld %xmm4, %xmm7 3935; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] 3936; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] 3937; SSE42-NEXT: movdqa %xmm0, %xmm4 3938; SSE42-NEXT: psrld %xmm3, %xmm4 3939; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 3940; SSE42-NEXT: psrld %xmm1, %xmm0 3941; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] 3942; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7] 3943; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] 3944; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] 3945; SSE42-NEXT: movdqa %xmm5, %xmm4 3946; SSE42-NEXT: psrld %xmm3, %xmm4 3947; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] 3948; SSE42-NEXT: movdqa %xmm5, %xmm6 3949; SSE42-NEXT: psrld %xmm3, %xmm6 3950; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm4[4,5,6,7] 3951; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7] 3952; SSE42-NEXT: movdqa %xmm5, %xmm1 3953; SSE42-NEXT: psrld %xmm3, %xmm1 3954; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] 3955; SSE42-NEXT: psrld %xmm2, %xmm5 3956; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] 3957; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] 3958; SSE42-NEXT: retq 3959; 3960; AVX2-LABEL: lshr_v8i32_commute: 3961; AVX2: # %bb.0: 3962; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 3963; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 3964; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 3965; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 3966; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 3967; AVX2-NEXT: retq 3968; 3969; AVX512F-LABEL: lshr_v8i32_commute: 3970; AVX512F: # %bb.0: 3971; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 3972; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 3973; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 3974; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 3975; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} 3976; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 3977; AVX512F-NEXT: retq 3978; 3979; AVX512VL-LABEL: lshr_v8i32_commute: 3980; AVX512VL: # %bb.0: 3981; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 3982; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 3983; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 3984; AVX512VL-NEXT: vmovdqa32 %ymm2, %ymm0 {%k1} {z} 3985; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 3986; AVX512VL-NEXT: retq 3987 %s = select <8 x i1> %b, <8 x i32> %y, <8 x i32> zeroinitializer 3988 %r = lshr <8 x i32> %s, %x 3989 ret <8 x i32> %r 3990} 3991 3992define <16 x i32> @lshr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) { 3993; SSE2-LABEL: lshr_v16i32_swap: 3994; SSE2: # %bb.0: 3995; SSE2-NEXT: movdqa %xmm0, %xmm8 3996; SSE2-NEXT: movdqa %xmm0, %xmm10 3997; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 3998; SSE2-NEXT: movdqa %xmm10, %xmm9 3999; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] 4000; SSE2-NEXT: pslld $31, %xmm9 4001; SSE2-NEXT: psrad $31, %xmm9 4002; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 4003; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3] 4004; SSE2-NEXT: pslld $31, %xmm10 4005; SSE2-NEXT: psrad $31, %xmm10 4006; SSE2-NEXT: pandn %xmm7, %xmm10 4007; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 4008; SSE2-NEXT: movdqa %xmm8, %xmm7 4009; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] 4010; SSE2-NEXT: pslld $31, %xmm7 4011; SSE2-NEXT: psrad $31, %xmm7 4012; SSE2-NEXT: pandn %xmm6, %xmm7 4013; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] 4014; SSE2-NEXT: pslld $31, %xmm8 4015; SSE2-NEXT: psrad $31, %xmm8 4016; SSE2-NEXT: pandn %xmm5, %xmm8 4017; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,3,3,3,4,5,6,7] 4018; SSE2-NEXT: movdqa %xmm1, %xmm5 4019; SSE2-NEXT: psrld %xmm0, %xmm5 4020; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,1,1,4,5,6,7] 4021; SSE2-NEXT: movdqa %xmm1, %xmm0 4022; SSE2-NEXT: psrld %xmm6, %xmm0 4023; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] 4024; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] 4025; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 4026; SSE2-NEXT: movdqa %xmm1, %xmm8 4027; SSE2-NEXT: psrld %xmm6, %xmm8 4028; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4029; SSE2-NEXT: psrld %xmm5, %xmm1 4030; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm8[1] 4031; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,3] 4032; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,3,3,3,4,5,6,7] 4033; SSE2-NEXT: movdqa %xmm2, %xmm5 4034; SSE2-NEXT: psrld %xmm1, %xmm5 4035; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] 4036; SSE2-NEXT: movdqa %xmm2, %xmm1 4037; SSE2-NEXT: psrld %xmm6, %xmm1 4038; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] 4039; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] 4040; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 4041; SSE2-NEXT: movdqa %xmm2, %xmm7 4042; SSE2-NEXT: psrld %xmm6, %xmm7 4043; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4044; SSE2-NEXT: psrld %xmm5, %xmm2 4045; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1] 4046; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[0,3] 4047; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[2,3,3,3,4,5,6,7] 4048; SSE2-NEXT: movdqa %xmm3, %xmm5 4049; SSE2-NEXT: psrld %xmm2, %xmm5 4050; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,1,1,4,5,6,7] 4051; SSE2-NEXT: movdqa %xmm3, %xmm2 4052; SSE2-NEXT: psrld %xmm6, %xmm2 4053; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] 4054; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] 4055; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 4056; SSE2-NEXT: movdqa %xmm3, %xmm7 4057; SSE2-NEXT: psrld %xmm6, %xmm7 4058; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4059; SSE2-NEXT: psrld %xmm5, %xmm3 4060; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm7[1] 4061; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] 4062; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[2,3,3,3,4,5,6,7] 4063; SSE2-NEXT: movdqa %xmm4, %xmm5 4064; SSE2-NEXT: psrld %xmm3, %xmm5 4065; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7] 4066; SSE2-NEXT: movdqa %xmm4, %xmm3 4067; SSE2-NEXT: psrld %xmm6, %xmm3 4068; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] 4069; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] 4070; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 4071; SSE2-NEXT: movdqa %xmm4, %xmm7 4072; SSE2-NEXT: psrld %xmm6, %xmm7 4073; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4074; SSE2-NEXT: psrld %xmm5, %xmm4 4075; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm7[1] 4076; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] 4077; SSE2-NEXT: retq 4078; 4079; SSE42-LABEL: lshr_v16i32_swap: 4080; SSE42: # %bb.0: 4081; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] 4082; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero 4083; SSE42-NEXT: pslld $31, %xmm8 4084; SSE42-NEXT: psrad $31, %xmm8 4085; SSE42-NEXT: pandn %xmm7, %xmm8 4086; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] 4087; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero 4088; SSE42-NEXT: pslld $31, %xmm7 4089; SSE42-NEXT: psrad $31, %xmm7 4090; SSE42-NEXT: pandn %xmm6, %xmm7 4091; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 4092; SSE42-NEXT: pslld $31, %xmm6 4093; SSE42-NEXT: psrad $31, %xmm6 4094; SSE42-NEXT: pandn %xmm5, %xmm6 4095; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 4096; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 4097; SSE42-NEXT: pslld $31, %xmm5 4098; SSE42-NEXT: psrad $31, %xmm5 4099; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 4100; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,3,3,3,4,5,6,7] 4101; SSE42-NEXT: movdqa %xmm1, %xmm9 4102; SSE42-NEXT: psrld %xmm0, %xmm9 4103; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,3,2,3] 4104; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,3,3,3,4,5,6,7] 4105; SSE42-NEXT: movdqa %xmm1, %xmm11 4106; SSE42-NEXT: psrld %xmm0, %xmm11 4107; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm9[0,1,2,3],xmm11[4,5,6,7] 4108; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,1,4,5,6,7] 4109; SSE42-NEXT: movdqa %xmm1, %xmm0 4110; SSE42-NEXT: psrld %xmm6, %xmm0 4111; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,1,1,4,5,6,7] 4112; SSE42-NEXT: psrld %xmm6, %xmm1 4113; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 4114; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7] 4115; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,3,3,3,4,5,6,7] 4116; SSE42-NEXT: movdqa %xmm2, %xmm6 4117; SSE42-NEXT: psrld %xmm1, %xmm6 4118; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] 4119; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[2,3,3,3,4,5,6,7] 4120; SSE42-NEXT: movdqa %xmm2, %xmm10 4121; SSE42-NEXT: psrld %xmm1, %xmm10 4122; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm10[4,5,6,7] 4123; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] 4124; SSE42-NEXT: movdqa %xmm2, %xmm1 4125; SSE42-NEXT: psrld %xmm6, %xmm1 4126; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7] 4127; SSE42-NEXT: psrld %xmm6, %xmm2 4128; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] 4129; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3],xmm1[4,5],xmm10[6,7] 4130; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,3,3,3,4,5,6,7] 4131; SSE42-NEXT: movdqa %xmm3, %xmm6 4132; SSE42-NEXT: psrld %xmm2, %xmm6 4133; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] 4134; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[2,3,3,3,4,5,6,7] 4135; SSE42-NEXT: movdqa %xmm3, %xmm9 4136; SSE42-NEXT: psrld %xmm2, %xmm9 4137; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm6[0,1,2,3],xmm9[4,5,6,7] 4138; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,1,1,4,5,6,7] 4139; SSE42-NEXT: movdqa %xmm3, %xmm2 4140; SSE42-NEXT: psrld %xmm6, %xmm2 4141; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] 4142; SSE42-NEXT: psrld %xmm6, %xmm3 4143; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] 4144; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5],xmm9[6,7] 4145; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] 4146; SSE42-NEXT: movdqa %xmm4, %xmm6 4147; SSE42-NEXT: psrld %xmm3, %xmm6 4148; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] 4149; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,3,3,3,4,5,6,7] 4150; SSE42-NEXT: movdqa %xmm4, %xmm8 4151; SSE42-NEXT: psrld %xmm3, %xmm8 4152; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] 4153; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4154; SSE42-NEXT: movdqa %xmm4, %xmm3 4155; SSE42-NEXT: psrld %xmm5, %xmm3 4156; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,1,1,1,4,5,6,7] 4157; SSE42-NEXT: psrld %xmm5, %xmm4 4158; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] 4159; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3],xmm3[4,5],xmm8[6,7] 4160; SSE42-NEXT: retq 4161; 4162; AVX2-LABEL: lshr_v16i32_swap: 4163; AVX2: # %bb.0: 4164; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 4165; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 4166; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 4167; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 4168; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 4169; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 4170; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 4171; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 4172; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 4173; AVX2-NEXT: vpsrlvd %ymm0, %ymm1, %ymm0 4174; AVX2-NEXT: vpsrlvd %ymm4, %ymm2, %ymm1 4175; AVX2-NEXT: retq 4176; 4177; AVX512-LABEL: lshr_v16i32_swap: 4178; AVX512: # %bb.0: 4179; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 4180; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 4181; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 4182; AVX512-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1 {%k1} 4183; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 4184; AVX512-NEXT: retq 4185 %s = select <16 x i1> %b, <16 x i32> zeroinitializer, <16 x i32> %y 4186 %r = lshr <16 x i32> %x, %s 4187 ret <16 x i32> %r 4188} 4189 4190; negative test - lshr is not commutative; there is no identity constant for operand 0 4191 4192define <16 x i32> @lshr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) { 4193; SSE2-LABEL: lshr_v16i32_commute_swap: 4194; SSE2: # %bb.0: 4195; SSE2-NEXT: movdqa %xmm3, %xmm8 4196; SSE2-NEXT: movdqa %xmm2, %xmm9 4197; SSE2-NEXT: movdqa %xmm1, %xmm10 4198; SSE2-NEXT: movdqa %xmm0, %xmm2 4199; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 4200; SSE2-NEXT: movdqa %xmm2, %xmm3 4201; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 4202; SSE2-NEXT: pslld $31, %xmm3 4203; SSE2-NEXT: psrad $31, %xmm3 4204; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3 4205; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 4206; SSE2-NEXT: pslld $31, %xmm2 4207; SSE2-NEXT: psrad $31, %xmm2 4208; SSE2-NEXT: pandn %xmm7, %xmm2 4209; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 4210; SSE2-NEXT: movdqa %xmm0, %xmm1 4211; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 4212; SSE2-NEXT: pslld $31, %xmm1 4213; SSE2-NEXT: psrad $31, %xmm1 4214; SSE2-NEXT: pandn %xmm6, %xmm1 4215; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 4216; SSE2-NEXT: pslld $31, %xmm0 4217; SSE2-NEXT: psrad $31, %xmm0 4218; SSE2-NEXT: pandn %xmm5, %xmm0 4219; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] 4220; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 4221; SSE2-NEXT: movdqa %xmm0, %xmm7 4222; SSE2-NEXT: psrld %xmm6, %xmm7 4223; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4224; SSE2-NEXT: movdqa %xmm0, %xmm6 4225; SSE2-NEXT: psrld %xmm5, %xmm6 4226; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] 4227; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[2,3,3,3,4,5,6,7] 4228; SSE2-NEXT: movdqa %xmm0, %xmm7 4229; SSE2-NEXT: psrld %xmm5, %xmm7 4230; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] 4231; SSE2-NEXT: psrld %xmm5, %xmm0 4232; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] 4233; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3] 4234; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] 4235; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 4236; SSE2-NEXT: movdqa %xmm1, %xmm7 4237; SSE2-NEXT: psrld %xmm6, %xmm7 4238; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4239; SSE2-NEXT: movdqa %xmm1, %xmm6 4240; SSE2-NEXT: psrld %xmm5, %xmm6 4241; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] 4242; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[2,3,3,3,4,5,6,7] 4243; SSE2-NEXT: movdqa %xmm1, %xmm7 4244; SSE2-NEXT: psrld %xmm5, %xmm7 4245; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] 4246; SSE2-NEXT: psrld %xmm5, %xmm1 4247; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm7[0] 4248; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] 4249; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] 4250; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 4251; SSE2-NEXT: movdqa %xmm2, %xmm7 4252; SSE2-NEXT: psrld %xmm6, %xmm7 4253; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4254; SSE2-NEXT: movdqa %xmm2, %xmm6 4255; SSE2-NEXT: psrld %xmm5, %xmm6 4256; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] 4257; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,3,3,3,4,5,6,7] 4258; SSE2-NEXT: movdqa %xmm2, %xmm7 4259; SSE2-NEXT: psrld %xmm5, %xmm7 4260; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] 4261; SSE2-NEXT: psrld %xmm5, %xmm2 4262; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] 4263; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm6[0,3] 4264; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] 4265; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 4266; SSE2-NEXT: movdqa %xmm3, %xmm7 4267; SSE2-NEXT: psrld %xmm6, %xmm7 4268; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4269; SSE2-NEXT: movdqa %xmm3, %xmm6 4270; SSE2-NEXT: psrld %xmm5, %xmm6 4271; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] 4272; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] 4273; SSE2-NEXT: movdqa %xmm3, %xmm7 4274; SSE2-NEXT: psrld %xmm5, %xmm7 4275; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] 4276; SSE2-NEXT: psrld %xmm4, %xmm3 4277; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0] 4278; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm6[0,3] 4279; SSE2-NEXT: retq 4280; 4281; SSE42-LABEL: lshr_v16i32_commute_swap: 4282; SSE42: # %bb.0: 4283; SSE42-NEXT: movdqa %xmm3, %xmm10 4284; SSE42-NEXT: movdqa %xmm2, %xmm9 4285; SSE42-NEXT: movdqa %xmm1, %xmm8 4286; SSE42-NEXT: movdqa %xmm0, %xmm3 4287; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 4288; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 4289; SSE42-NEXT: pslld $31, %xmm2 4290; SSE42-NEXT: psrad $31, %xmm2 4291; SSE42-NEXT: pandn %xmm7, %xmm2 4292; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] 4293; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 4294; SSE42-NEXT: pslld $31, %xmm1 4295; SSE42-NEXT: psrad $31, %xmm1 4296; SSE42-NEXT: pandn %xmm6, %xmm1 4297; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 4298; SSE42-NEXT: pslld $31, %xmm0 4299; SSE42-NEXT: psrad $31, %xmm0 4300; SSE42-NEXT: pandn %xmm5, %xmm0 4301; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] 4302; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 4303; SSE42-NEXT: pslld $31, %xmm3 4304; SSE42-NEXT: psrad $31, %xmm3 4305; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3 4306; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] 4307; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 4308; SSE42-NEXT: movdqa %xmm0, %xmm7 4309; SSE42-NEXT: psrld %xmm6, %xmm7 4310; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,3,3,3,4,5,6,7] 4311; SSE42-NEXT: movdqa %xmm0, %xmm11 4312; SSE42-NEXT: psrld %xmm6, %xmm11 4313; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm7[4,5,6,7] 4314; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4315; SSE42-NEXT: movdqa %xmm0, %xmm6 4316; SSE42-NEXT: psrld %xmm5, %xmm6 4317; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] 4318; SSE42-NEXT: psrld %xmm5, %xmm0 4319; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7] 4320; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7] 4321; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] 4322; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 4323; SSE42-NEXT: movdqa %xmm1, %xmm7 4324; SSE42-NEXT: psrld %xmm6, %xmm7 4325; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,3,3,3,4,5,6,7] 4326; SSE42-NEXT: movdqa %xmm1, %xmm8 4327; SSE42-NEXT: psrld %xmm6, %xmm8 4328; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] 4329; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4330; SSE42-NEXT: movdqa %xmm1, %xmm6 4331; SSE42-NEXT: psrld %xmm5, %xmm6 4332; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] 4333; SSE42-NEXT: psrld %xmm5, %xmm1 4334; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] 4335; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7] 4336; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] 4337; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 4338; SSE42-NEXT: movdqa %xmm2, %xmm7 4339; SSE42-NEXT: psrld %xmm6, %xmm7 4340; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,3,3,3,4,5,6,7] 4341; SSE42-NEXT: movdqa %xmm2, %xmm8 4342; SSE42-NEXT: psrld %xmm6, %xmm8 4343; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] 4344; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4345; SSE42-NEXT: movdqa %xmm2, %xmm6 4346; SSE42-NEXT: psrld %xmm5, %xmm6 4347; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] 4348; SSE42-NEXT: psrld %xmm5, %xmm2 4349; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] 4350; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5],xmm8[6,7] 4351; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] 4352; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 4353; SSE42-NEXT: movdqa %xmm3, %xmm7 4354; SSE42-NEXT: psrld %xmm6, %xmm7 4355; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[2,3,3,3,4,5,6,7] 4356; SSE42-NEXT: movdqa %xmm3, %xmm8 4357; SSE42-NEXT: psrld %xmm6, %xmm8 4358; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] 4359; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4360; SSE42-NEXT: movdqa %xmm3, %xmm6 4361; SSE42-NEXT: psrld %xmm5, %xmm6 4362; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] 4363; SSE42-NEXT: psrld %xmm4, %xmm3 4364; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5,6,7] 4365; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3],xmm3[4,5],xmm8[6,7] 4366; SSE42-NEXT: retq 4367; 4368; AVX2-LABEL: lshr_v16i32_commute_swap: 4369; AVX2: # %bb.0: 4370; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 4371; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 4372; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 4373; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 4374; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 4375; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 4376; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 4377; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 4378; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 4379; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 4380; AVX2-NEXT: vpsrlvd %ymm2, %ymm4, %ymm1 4381; AVX2-NEXT: retq 4382; 4383; AVX512-LABEL: lshr_v16i32_commute_swap: 4384; AVX512: # %bb.0: 4385; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 4386; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 4387; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 4388; AVX512-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} 4389; AVX512-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 4390; AVX512-NEXT: retq 4391 %s = select <16 x i1> %b, <16 x i32> zeroinitializer, <16 x i32> %y 4392 %r = lshr <16 x i32> %s, %x 4393 ret <16 x i32> %r 4394} 4395 4396define <8 x i32> @lshr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef %x, <8 x i32> noundef %y) { 4397; SSE2-LABEL: lshr_v8i32_cast_cond: 4398; SSE2: # %bb.0: 4399; SSE2-NEXT: movd %edi, %xmm4 4400; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] 4401; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] 4402; SSE2-NEXT: movdqa %xmm5, %xmm4 4403; SSE2-NEXT: pand %xmm6, %xmm4 4404; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 4405; SSE2-NEXT: pand %xmm3, %xmm4 4406; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] 4407; SSE2-NEXT: pand %xmm3, %xmm5 4408; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 4409; SSE2-NEXT: pand %xmm2, %xmm5 4410; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] 4411; SSE2-NEXT: movdqa %xmm0, %xmm3 4412; SSE2-NEXT: psrld %xmm2, %xmm3 4413; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,1,1,1,4,5,6,7] 4414; SSE2-NEXT: movdqa %xmm0, %xmm2 4415; SSE2-NEXT: psrld %xmm6, %xmm2 4416; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 4417; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] 4418; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7] 4419; SSE2-NEXT: movdqa %xmm0, %xmm6 4420; SSE2-NEXT: psrld %xmm5, %xmm6 4421; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] 4422; SSE2-NEXT: psrld %xmm3, %xmm0 4423; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1] 4424; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] 4425; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,3,3,3,4,5,6,7] 4426; SSE2-NEXT: movdqa %xmm1, %xmm5 4427; SSE2-NEXT: psrld %xmm0, %xmm5 4428; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,1,4,5,6,7] 4429; SSE2-NEXT: movdqa %xmm1, %xmm3 4430; SSE2-NEXT: psrld %xmm0, %xmm3 4431; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] 4432; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] 4433; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,3,3,3,4,5,6,7] 4434; SSE2-NEXT: movdqa %xmm1, %xmm5 4435; SSE2-NEXT: psrld %xmm4, %xmm5 4436; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7] 4437; SSE2-NEXT: psrld %xmm0, %xmm1 4438; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] 4439; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] 4440; SSE2-NEXT: movaps %xmm2, %xmm0 4441; SSE2-NEXT: movaps %xmm3, %xmm1 4442; SSE2-NEXT: retq 4443; 4444; SSE42-LABEL: lshr_v8i32_cast_cond: 4445; SSE42: # %bb.0: 4446; SSE42-NEXT: movd %edi, %xmm4 4447; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] 4448; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = [16,32,64,128] 4449; SSE42-NEXT: movdqa %xmm5, %xmm4 4450; SSE42-NEXT: pand %xmm6, %xmm4 4451; SSE42-NEXT: pcmpeqd %xmm6, %xmm4 4452; SSE42-NEXT: pand %xmm3, %xmm4 4453; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] 4454; SSE42-NEXT: pand %xmm3, %xmm5 4455; SSE42-NEXT: pcmpeqd %xmm3, %xmm5 4456; SSE42-NEXT: pand %xmm2, %xmm5 4457; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] 4458; SSE42-NEXT: movdqa %xmm0, %xmm3 4459; SSE42-NEXT: psrld %xmm2, %xmm3 4460; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] 4461; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,3,3,3,4,5,6,7] 4462; SSE42-NEXT: movdqa %xmm0, %xmm7 4463; SSE42-NEXT: psrld %xmm6, %xmm7 4464; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm3[0,1,2,3],xmm7[4,5,6,7] 4465; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7] 4466; SSE42-NEXT: movdqa %xmm0, %xmm5 4467; SSE42-NEXT: psrld %xmm3, %xmm5 4468; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] 4469; SSE42-NEXT: psrld %xmm2, %xmm0 4470; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] 4471; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7] 4472; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,3,3,3,4,5,6,7] 4473; SSE42-NEXT: movdqa %xmm1, %xmm3 4474; SSE42-NEXT: psrld %xmm2, %xmm3 4475; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] 4476; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] 4477; SSE42-NEXT: movdqa %xmm1, %xmm6 4478; SSE42-NEXT: psrld %xmm5, %xmm6 4479; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm3[0,1,2,3],xmm6[4,5,6,7] 4480; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,1,1,4,5,6,7] 4481; SSE42-NEXT: movdqa %xmm1, %xmm4 4482; SSE42-NEXT: psrld %xmm3, %xmm4 4483; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] 4484; SSE42-NEXT: psrld %xmm2, %xmm1 4485; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] 4486; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] 4487; SSE42-NEXT: retq 4488; 4489; AVX2-LABEL: lshr_v8i32_cast_cond: 4490; AVX2: # %bb.0: 4491; AVX2-NEXT: vmovd %edi, %xmm2 4492; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 4493; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] 4494; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 4495; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 4496; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 4497; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 4498; AVX2-NEXT: retq 4499; 4500; AVX512F-LABEL: lshr_v8i32_cast_cond: 4501; AVX512F: # %bb.0: 4502; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 4503; AVX512F-NEXT: kmovw %edi, %k1 4504; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z} 4505; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 4506; AVX512F-NEXT: retq 4507; 4508; AVX512VL-LABEL: lshr_v8i32_cast_cond: 4509; AVX512VL: # %bb.0: 4510; AVX512VL-NEXT: kmovw %edi, %k1 4511; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 {%k1} 4512; AVX512VL-NEXT: retq 4513 %b = bitcast i8 %pb to <8 x i1> 4514 %s = select <8 x i1> %b, <8 x i32> %y, <8 x i32> zeroinitializer 4515 %r = lshr <8 x i32> %x, %s 4516 ret <8 x i32> %r 4517} 4518 4519define <8 x i64> @lshr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) { 4520; SSE2-LABEL: lshr_v8i64_cast_cond: 4521; SSE2: # %bb.0: 4522; SSE2-NEXT: movd %edi, %xmm8 4523; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] 4524; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,128] 4525; SSE2-NEXT: movdqa %xmm9, %xmm8 4526; SSE2-NEXT: pand %xmm10, %xmm8 4527; SSE2-NEXT: pcmpeqd %xmm10, %xmm8 4528; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,0,3,2] 4529; SSE2-NEXT: pand %xmm7, %xmm8 4530; SSE2-NEXT: pand %xmm10, %xmm8 4531; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32] 4532; SSE2-NEXT: movdqa %xmm9, %xmm7 4533; SSE2-NEXT: pand %xmm10, %xmm7 4534; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 4535; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2] 4536; SSE2-NEXT: pand %xmm6, %xmm7 4537; SSE2-NEXT: pand %xmm10, %xmm7 4538; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8] 4539; SSE2-NEXT: movdqa %xmm9, %xmm10 4540; SSE2-NEXT: pand %xmm6, %xmm10 4541; SSE2-NEXT: pcmpeqd %xmm6, %xmm10 4542; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2] 4543; SSE2-NEXT: pand %xmm5, %xmm10 4544; SSE2-NEXT: pand %xmm6, %xmm10 4545; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] 4546; SSE2-NEXT: pand %xmm5, %xmm9 4547; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 4548; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2] 4549; SSE2-NEXT: pand %xmm4, %xmm9 4550; SSE2-NEXT: pand %xmm5, %xmm9 4551; SSE2-NEXT: movdqa %xmm0, %xmm4 4552; SSE2-NEXT: psrlq %xmm9, %xmm4 4553; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] 4554; SSE2-NEXT: psrlq %xmm5, %xmm0 4555; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 4556; SSE2-NEXT: movdqa %xmm1, %xmm4 4557; SSE2-NEXT: psrlq %xmm10, %xmm4 4558; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] 4559; SSE2-NEXT: psrlq %xmm5, %xmm1 4560; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1] 4561; SSE2-NEXT: movdqa %xmm2, %xmm4 4562; SSE2-NEXT: psrlq %xmm7, %xmm4 4563; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] 4564; SSE2-NEXT: psrlq %xmm5, %xmm2 4565; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1] 4566; SSE2-NEXT: movdqa %xmm3, %xmm4 4567; SSE2-NEXT: psrlq %xmm8, %xmm4 4568; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] 4569; SSE2-NEXT: psrlq %xmm5, %xmm3 4570; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1] 4571; SSE2-NEXT: retq 4572; 4573; SSE42-LABEL: lshr_v8i64_cast_cond: 4574; SSE42: # %bb.0: 4575; SSE42-NEXT: movd %edi, %xmm8 4576; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] 4577; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] 4578; SSE42-NEXT: movdqa %xmm9, %xmm8 4579; SSE42-NEXT: pand %xmm10, %xmm8 4580; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 4581; SSE42-NEXT: pand %xmm7, %xmm8 4582; SSE42-NEXT: pmovsxbq {{.*#+}} xmm10 = [16,32] 4583; SSE42-NEXT: movdqa %xmm9, %xmm7 4584; SSE42-NEXT: pand %xmm10, %xmm7 4585; SSE42-NEXT: pcmpeqq %xmm10, %xmm7 4586; SSE42-NEXT: pand %xmm6, %xmm7 4587; SSE42-NEXT: pmovsxbq {{.*#+}} xmm6 = [4,8] 4588; SSE42-NEXT: movdqa %xmm9, %xmm10 4589; SSE42-NEXT: pand %xmm6, %xmm10 4590; SSE42-NEXT: pcmpeqq %xmm6, %xmm10 4591; SSE42-NEXT: pand %xmm5, %xmm10 4592; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] 4593; SSE42-NEXT: pand %xmm5, %xmm9 4594; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 4595; SSE42-NEXT: pand %xmm4, %xmm9 4596; SSE42-NEXT: movdqa %xmm0, %xmm4 4597; SSE42-NEXT: psrlq %xmm9, %xmm4 4598; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] 4599; SSE42-NEXT: psrlq %xmm5, %xmm0 4600; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] 4601; SSE42-NEXT: movdqa %xmm1, %xmm4 4602; SSE42-NEXT: psrlq %xmm10, %xmm4 4603; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] 4604; SSE42-NEXT: psrlq %xmm5, %xmm1 4605; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] 4606; SSE42-NEXT: movdqa %xmm2, %xmm4 4607; SSE42-NEXT: psrlq %xmm7, %xmm4 4608; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] 4609; SSE42-NEXT: psrlq %xmm5, %xmm2 4610; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] 4611; SSE42-NEXT: movdqa %xmm3, %xmm4 4612; SSE42-NEXT: psrlq %xmm8, %xmm4 4613; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] 4614; SSE42-NEXT: psrlq %xmm5, %xmm3 4615; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 4616; SSE42-NEXT: retq 4617; 4618; AVX2-LABEL: lshr_v8i64_cast_cond: 4619; AVX2: # %bb.0: 4620; AVX2-NEXT: vmovd %edi, %xmm4 4621; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 4622; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] 4623; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 4624; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 4625; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 4626; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] 4627; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 4628; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 4629; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 4630; AVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0 4631; AVX2-NEXT: vpsrlvq %ymm3, %ymm1, %ymm1 4632; AVX2-NEXT: retq 4633; 4634; AVX512-LABEL: lshr_v8i64_cast_cond: 4635; AVX512: # %bb.0: 4636; AVX512-NEXT: kmovw %edi, %k1 4637; AVX512-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} 4638; AVX512-NEXT: retq 4639 %b = bitcast i8 %pb to <8 x i1> 4640 %s = select <8 x i1> %b, <8 x i64> %y, <8 x i64> zeroinitializer 4641 %r = lshr <8 x i64> %x, %s 4642 ret <8 x i64> %r 4643} 4644 4645define <4 x i32> @ashr_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef %y) { 4646; SSE2-LABEL: ashr_v4i32: 4647; SSE2: # %bb.0: 4648; SSE2-NEXT: pslld $31, %xmm0 4649; SSE2-NEXT: psrad $31, %xmm0 4650; SSE2-NEXT: pand %xmm2, %xmm0 4651; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,3,3,3,4,5,6,7] 4652; SSE2-NEXT: movdqa %xmm1, %xmm3 4653; SSE2-NEXT: psrad %xmm2, %xmm3 4654; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,1,1,4,5,6,7] 4655; SSE2-NEXT: movdqa %xmm1, %xmm2 4656; SSE2-NEXT: psrad %xmm4, %xmm2 4657; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 4658; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 4659; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,3,3,3,4,5,6,7] 4660; SSE2-NEXT: movdqa %xmm1, %xmm4 4661; SSE2-NEXT: psrad %xmm3, %xmm4 4662; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7] 4663; SSE2-NEXT: psrad %xmm0, %xmm1 4664; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm4[1] 4665; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3] 4666; SSE2-NEXT: movaps %xmm2, %xmm0 4667; SSE2-NEXT: retq 4668; 4669; SSE42-LABEL: ashr_v4i32: 4670; SSE42: # %bb.0: 4671; SSE42-NEXT: pslld $31, %xmm0 4672; SSE42-NEXT: psrad $31, %xmm0 4673; SSE42-NEXT: pand %xmm2, %xmm0 4674; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,3,3,3,4,5,6,7] 4675; SSE42-NEXT: movdqa %xmm1, %xmm3 4676; SSE42-NEXT: psrad %xmm2, %xmm3 4677; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 4678; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] 4679; SSE42-NEXT: movdqa %xmm1, %xmm5 4680; SSE42-NEXT: psrad %xmm4, %xmm5 4681; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] 4682; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,1,1,4,5,6,7] 4683; SSE42-NEXT: movdqa %xmm1, %xmm0 4684; SSE42-NEXT: psrad %xmm3, %xmm0 4685; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] 4686; SSE42-NEXT: psrad %xmm2, %xmm1 4687; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 4688; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] 4689; SSE42-NEXT: retq 4690; 4691; AVX2-LABEL: ashr_v4i32: 4692; AVX2: # %bb.0: 4693; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 4694; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 4695; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 4696; AVX2-NEXT: vpsravd %xmm0, %xmm1, %xmm0 4697; AVX2-NEXT: retq 4698; 4699; AVX512F-LABEL: ashr_v4i32: 4700; AVX512F: # %bb.0: 4701; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 4702; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 4703; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 4704; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} 4705; AVX512F-NEXT: vpsravd %xmm0, %xmm1, %xmm0 4706; AVX512F-NEXT: vzeroupper 4707; AVX512F-NEXT: retq 4708; 4709; AVX512VL-LABEL: ashr_v4i32: 4710; AVX512VL: # %bb.0: 4711; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 4712; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 4713; AVX512VL-NEXT: vpsravd %xmm2, %xmm1, %xmm1 {%k1} 4714; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 4715; AVX512VL-NEXT: retq 4716 %s = select <4 x i1> %b, <4 x i32> %y, <4 x i32> zeroinitializer 4717 %r = ashr <4 x i32> %x, %s 4718 ret <4 x i32> %r 4719} 4720 4721; negative test - ashr is not commutative; there is no identity constant for operand 0 4722 4723define <8 x i32> @ashr_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> noundef %y) { 4724; SSE2-LABEL: ashr_v8i32_commute: 4725; SSE2: # %bb.0: 4726; SSE2-NEXT: movdqa %xmm1, %xmm5 4727; SSE2-NEXT: movdqa %xmm0, %xmm1 4728; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 4729; SSE2-NEXT: pslld $31, %xmm1 4730; SSE2-NEXT: psrad $31, %xmm1 4731; SSE2-NEXT: pand %xmm4, %xmm1 4732; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 4733; SSE2-NEXT: pslld $31, %xmm0 4734; SSE2-NEXT: psrad $31, %xmm0 4735; SSE2-NEXT: pand %xmm3, %xmm0 4736; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] 4737; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] 4738; SSE2-NEXT: movdqa %xmm0, %xmm6 4739; SSE2-NEXT: psrad %xmm4, %xmm6 4740; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] 4741; SSE2-NEXT: movdqa %xmm0, %xmm4 4742; SSE2-NEXT: psrad %xmm3, %xmm4 4743; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm6[1] 4744; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] 4745; SSE2-NEXT: movdqa %xmm0, %xmm6 4746; SSE2-NEXT: psrad %xmm3, %xmm6 4747; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7] 4748; SSE2-NEXT: psrad %xmm3, %xmm0 4749; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] 4750; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[0,3] 4751; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] 4752; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] 4753; SSE2-NEXT: movdqa %xmm1, %xmm5 4754; SSE2-NEXT: psrad %xmm4, %xmm5 4755; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] 4756; SSE2-NEXT: movdqa %xmm1, %xmm4 4757; SSE2-NEXT: psrad %xmm3, %xmm4 4758; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1] 4759; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] 4760; SSE2-NEXT: movdqa %xmm1, %xmm5 4761; SSE2-NEXT: psrad %xmm3, %xmm5 4762; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] 4763; SSE2-NEXT: psrad %xmm2, %xmm1 4764; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] 4765; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm4[0,3] 4766; SSE2-NEXT: retq 4767; 4768; SSE42-LABEL: ashr_v8i32_commute: 4769; SSE42: # %bb.0: 4770; SSE42-NEXT: movdqa %xmm0, %xmm5 4771; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 4772; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 4773; SSE42-NEXT: pslld $31, %xmm5 4774; SSE42-NEXT: psrad $31, %xmm5 4775; SSE42-NEXT: pand %xmm4, %xmm5 4776; SSE42-NEXT: pslld $31, %xmm0 4777; SSE42-NEXT: psrad $31, %xmm0 4778; SSE42-NEXT: pand %xmm3, %xmm0 4779; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 4780; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7] 4781; SSE42-NEXT: movdqa %xmm0, %xmm6 4782; SSE42-NEXT: psrad %xmm4, %xmm6 4783; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7] 4784; SSE42-NEXT: movdqa %xmm0, %xmm7 4785; SSE42-NEXT: psrad %xmm4, %xmm7 4786; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] 4787; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] 4788; SSE42-NEXT: movdqa %xmm0, %xmm4 4789; SSE42-NEXT: psrad %xmm3, %xmm4 4790; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] 4791; SSE42-NEXT: psrad %xmm1, %xmm0 4792; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] 4793; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7] 4794; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] 4795; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] 4796; SSE42-NEXT: movdqa %xmm5, %xmm4 4797; SSE42-NEXT: psrad %xmm3, %xmm4 4798; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7] 4799; SSE42-NEXT: movdqa %xmm5, %xmm6 4800; SSE42-NEXT: psrad %xmm3, %xmm6 4801; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm4[4,5,6,7] 4802; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7] 4803; SSE42-NEXT: movdqa %xmm5, %xmm1 4804; SSE42-NEXT: psrad %xmm3, %xmm1 4805; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] 4806; SSE42-NEXT: psrad %xmm2, %xmm5 4807; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] 4808; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] 4809; SSE42-NEXT: retq 4810; 4811; AVX2-LABEL: ashr_v8i32_commute: 4812; AVX2: # %bb.0: 4813; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 4814; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 4815; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 4816; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 4817; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 4818; AVX2-NEXT: retq 4819; 4820; AVX512F-LABEL: ashr_v8i32_commute: 4821; AVX512F: # %bb.0: 4822; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 4823; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 4824; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 4825; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 4826; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} 4827; AVX512F-NEXT: vpsravd %ymm1, %ymm0, %ymm0 4828; AVX512F-NEXT: retq 4829; 4830; AVX512VL-LABEL: ashr_v8i32_commute: 4831; AVX512VL: # %bb.0: 4832; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 4833; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 4834; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 4835; AVX512VL-NEXT: vmovdqa32 %ymm2, %ymm0 {%k1} {z} 4836; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 4837; AVX512VL-NEXT: retq 4838 %s = select <8 x i1> %b, <8 x i32> %y, <8 x i32> zeroinitializer 4839 %r = ashr <8 x i32> %s, %x 4840 ret <8 x i32> %r 4841} 4842 4843define <16 x i32> @ashr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) { 4844; SSE2-LABEL: ashr_v16i32_swap: 4845; SSE2: # %bb.0: 4846; SSE2-NEXT: movdqa %xmm0, %xmm8 4847; SSE2-NEXT: movdqa %xmm0, %xmm10 4848; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 4849; SSE2-NEXT: movdqa %xmm10, %xmm9 4850; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7] 4851; SSE2-NEXT: pslld $31, %xmm9 4852; SSE2-NEXT: psrad $31, %xmm9 4853; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9 4854; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3] 4855; SSE2-NEXT: pslld $31, %xmm10 4856; SSE2-NEXT: psrad $31, %xmm10 4857; SSE2-NEXT: pandn %xmm7, %xmm10 4858; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 4859; SSE2-NEXT: movdqa %xmm8, %xmm7 4860; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] 4861; SSE2-NEXT: pslld $31, %xmm7 4862; SSE2-NEXT: psrad $31, %xmm7 4863; SSE2-NEXT: pandn %xmm6, %xmm7 4864; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] 4865; SSE2-NEXT: pslld $31, %xmm8 4866; SSE2-NEXT: psrad $31, %xmm8 4867; SSE2-NEXT: pandn %xmm5, %xmm8 4868; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,3,3,3,4,5,6,7] 4869; SSE2-NEXT: movdqa %xmm1, %xmm5 4870; SSE2-NEXT: psrad %xmm0, %xmm5 4871; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,1,1,4,5,6,7] 4872; SSE2-NEXT: movdqa %xmm1, %xmm0 4873; SSE2-NEXT: psrad %xmm6, %xmm0 4874; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] 4875; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] 4876; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 4877; SSE2-NEXT: movdqa %xmm1, %xmm8 4878; SSE2-NEXT: psrad %xmm6, %xmm8 4879; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4880; SSE2-NEXT: psrad %xmm5, %xmm1 4881; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm8[1] 4882; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,3] 4883; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,3,3,3,4,5,6,7] 4884; SSE2-NEXT: movdqa %xmm2, %xmm5 4885; SSE2-NEXT: psrad %xmm1, %xmm5 4886; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] 4887; SSE2-NEXT: movdqa %xmm2, %xmm1 4888; SSE2-NEXT: psrad %xmm6, %xmm1 4889; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] 4890; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] 4891; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 4892; SSE2-NEXT: movdqa %xmm2, %xmm7 4893; SSE2-NEXT: psrad %xmm6, %xmm7 4894; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4895; SSE2-NEXT: psrad %xmm5, %xmm2 4896; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1] 4897; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[0,3] 4898; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[2,3,3,3,4,5,6,7] 4899; SSE2-NEXT: movdqa %xmm3, %xmm5 4900; SSE2-NEXT: psrad %xmm2, %xmm5 4901; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,1,1,4,5,6,7] 4902; SSE2-NEXT: movdqa %xmm3, %xmm2 4903; SSE2-NEXT: psrad %xmm6, %xmm2 4904; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] 4905; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] 4906; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 4907; SSE2-NEXT: movdqa %xmm3, %xmm7 4908; SSE2-NEXT: psrad %xmm6, %xmm7 4909; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4910; SSE2-NEXT: psrad %xmm5, %xmm3 4911; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm7[1] 4912; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] 4913; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[2,3,3,3,4,5,6,7] 4914; SSE2-NEXT: movdqa %xmm4, %xmm5 4915; SSE2-NEXT: psrad %xmm3, %xmm5 4916; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7] 4917; SSE2-NEXT: movdqa %xmm4, %xmm3 4918; SSE2-NEXT: psrad %xmm6, %xmm3 4919; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] 4920; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] 4921; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 4922; SSE2-NEXT: movdqa %xmm4, %xmm7 4923; SSE2-NEXT: psrad %xmm6, %xmm7 4924; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 4925; SSE2-NEXT: psrad %xmm5, %xmm4 4926; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm7[1] 4927; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] 4928; SSE2-NEXT: retq 4929; 4930; SSE42-LABEL: ashr_v16i32_swap: 4931; SSE42: # %bb.0: 4932; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3] 4933; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero 4934; SSE42-NEXT: pslld $31, %xmm8 4935; SSE42-NEXT: psrad $31, %xmm8 4936; SSE42-NEXT: pandn %xmm7, %xmm8 4937; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] 4938; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero 4939; SSE42-NEXT: pslld $31, %xmm7 4940; SSE42-NEXT: psrad $31, %xmm7 4941; SSE42-NEXT: pandn %xmm6, %xmm7 4942; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 4943; SSE42-NEXT: pslld $31, %xmm6 4944; SSE42-NEXT: psrad $31, %xmm6 4945; SSE42-NEXT: pandn %xmm5, %xmm6 4946; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 4947; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 4948; SSE42-NEXT: pslld $31, %xmm5 4949; SSE42-NEXT: psrad $31, %xmm5 4950; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 4951; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,3,3,3,4,5,6,7] 4952; SSE42-NEXT: movdqa %xmm1, %xmm9 4953; SSE42-NEXT: psrad %xmm0, %xmm9 4954; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,3,2,3] 4955; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,3,3,3,4,5,6,7] 4956; SSE42-NEXT: movdqa %xmm1, %xmm11 4957; SSE42-NEXT: psrad %xmm0, %xmm11 4958; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm9[0,1,2,3],xmm11[4,5,6,7] 4959; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,1,4,5,6,7] 4960; SSE42-NEXT: movdqa %xmm1, %xmm0 4961; SSE42-NEXT: psrad %xmm6, %xmm0 4962; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,1,1,4,5,6,7] 4963; SSE42-NEXT: psrad %xmm6, %xmm1 4964; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 4965; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7] 4966; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,3,3,3,4,5,6,7] 4967; SSE42-NEXT: movdqa %xmm2, %xmm6 4968; SSE42-NEXT: psrad %xmm1, %xmm6 4969; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3] 4970; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[2,3,3,3,4,5,6,7] 4971; SSE42-NEXT: movdqa %xmm2, %xmm10 4972; SSE42-NEXT: psrad %xmm1, %xmm10 4973; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm10[4,5,6,7] 4974; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] 4975; SSE42-NEXT: movdqa %xmm2, %xmm1 4976; SSE42-NEXT: psrad %xmm6, %xmm1 4977; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7] 4978; SSE42-NEXT: psrad %xmm6, %xmm2 4979; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] 4980; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3],xmm1[4,5],xmm10[6,7] 4981; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,3,3,3,4,5,6,7] 4982; SSE42-NEXT: movdqa %xmm3, %xmm6 4983; SSE42-NEXT: psrad %xmm2, %xmm6 4984; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] 4985; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[2,3,3,3,4,5,6,7] 4986; SSE42-NEXT: movdqa %xmm3, %xmm9 4987; SSE42-NEXT: psrad %xmm2, %xmm9 4988; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm6[0,1,2,3],xmm9[4,5,6,7] 4989; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,1,1,4,5,6,7] 4990; SSE42-NEXT: movdqa %xmm3, %xmm2 4991; SSE42-NEXT: psrad %xmm6, %xmm2 4992; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7] 4993; SSE42-NEXT: psrad %xmm6, %xmm3 4994; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] 4995; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5],xmm9[6,7] 4996; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7] 4997; SSE42-NEXT: movdqa %xmm4, %xmm6 4998; SSE42-NEXT: psrad %xmm3, %xmm6 4999; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3] 5000; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,3,3,3,4,5,6,7] 5001; SSE42-NEXT: movdqa %xmm4, %xmm8 5002; SSE42-NEXT: psrad %xmm3, %xmm8 5003; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7] 5004; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 5005; SSE42-NEXT: movdqa %xmm4, %xmm3 5006; SSE42-NEXT: psrad %xmm5, %xmm3 5007; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,1,1,1,4,5,6,7] 5008; SSE42-NEXT: psrad %xmm5, %xmm4 5009; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] 5010; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3],xmm3[4,5],xmm8[6,7] 5011; SSE42-NEXT: retq 5012; 5013; AVX2-LABEL: ashr_v16i32_swap: 5014; AVX2: # %bb.0: 5015; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5016; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 5017; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 5018; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 5019; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 5020; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 5021; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 5022; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 5023; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 5024; AVX2-NEXT: vpsravd %ymm0, %ymm1, %ymm0 5025; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm1 5026; AVX2-NEXT: retq 5027; 5028; AVX512-LABEL: ashr_v16i32_swap: 5029; AVX512: # %bb.0: 5030; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 5031; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 5032; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 5033; AVX512-NEXT: vpsravd %zmm2, %zmm1, %zmm1 {%k1} 5034; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 5035; AVX512-NEXT: retq 5036 %s = select <16 x i1> %b, <16 x i32> zeroinitializer, <16 x i32> %y 5037 %r = ashr <16 x i32> %x, %s 5038 ret <16 x i32> %r 5039} 5040 5041; negative test - ashr is not commutative; there is no identity constant for operand 0 5042 5043define <16 x i32> @ashr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) { 5044; SSE2-LABEL: ashr_v16i32_commute_swap: 5045; SSE2: # %bb.0: 5046; SSE2-NEXT: movdqa %xmm3, %xmm8 5047; SSE2-NEXT: movdqa %xmm2, %xmm9 5048; SSE2-NEXT: movdqa %xmm1, %xmm10 5049; SSE2-NEXT: movdqa %xmm0, %xmm2 5050; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5051; SSE2-NEXT: movdqa %xmm2, %xmm3 5052; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 5053; SSE2-NEXT: pslld $31, %xmm3 5054; SSE2-NEXT: psrad $31, %xmm3 5055; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3 5056; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 5057; SSE2-NEXT: pslld $31, %xmm2 5058; SSE2-NEXT: psrad $31, %xmm2 5059; SSE2-NEXT: pandn %xmm7, %xmm2 5060; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 5061; SSE2-NEXT: movdqa %xmm0, %xmm1 5062; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 5063; SSE2-NEXT: pslld $31, %xmm1 5064; SSE2-NEXT: psrad $31, %xmm1 5065; SSE2-NEXT: pandn %xmm6, %xmm1 5066; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 5067; SSE2-NEXT: pslld $31, %xmm0 5068; SSE2-NEXT: psrad $31, %xmm0 5069; SSE2-NEXT: pandn %xmm5, %xmm0 5070; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] 5071; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 5072; SSE2-NEXT: movdqa %xmm0, %xmm7 5073; SSE2-NEXT: psrad %xmm6, %xmm7 5074; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 5075; SSE2-NEXT: movdqa %xmm0, %xmm6 5076; SSE2-NEXT: psrad %xmm5, %xmm6 5077; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] 5078; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[2,3,3,3,4,5,6,7] 5079; SSE2-NEXT: movdqa %xmm0, %xmm7 5080; SSE2-NEXT: psrad %xmm5, %xmm7 5081; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] 5082; SSE2-NEXT: psrad %xmm5, %xmm0 5083; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] 5084; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3] 5085; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] 5086; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 5087; SSE2-NEXT: movdqa %xmm1, %xmm7 5088; SSE2-NEXT: psrad %xmm6, %xmm7 5089; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 5090; SSE2-NEXT: movdqa %xmm1, %xmm6 5091; SSE2-NEXT: psrad %xmm5, %xmm6 5092; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] 5093; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[2,3,3,3,4,5,6,7] 5094; SSE2-NEXT: movdqa %xmm1, %xmm7 5095; SSE2-NEXT: psrad %xmm5, %xmm7 5096; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] 5097; SSE2-NEXT: psrad %xmm5, %xmm1 5098; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm7[0] 5099; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] 5100; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] 5101; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 5102; SSE2-NEXT: movdqa %xmm2, %xmm7 5103; SSE2-NEXT: psrad %xmm6, %xmm7 5104; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 5105; SSE2-NEXT: movdqa %xmm2, %xmm6 5106; SSE2-NEXT: psrad %xmm5, %xmm6 5107; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] 5108; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,3,3,3,4,5,6,7] 5109; SSE2-NEXT: movdqa %xmm2, %xmm7 5110; SSE2-NEXT: psrad %xmm5, %xmm7 5111; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] 5112; SSE2-NEXT: psrad %xmm5, %xmm2 5113; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0] 5114; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm6[0,3] 5115; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] 5116; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 5117; SSE2-NEXT: movdqa %xmm3, %xmm7 5118; SSE2-NEXT: psrad %xmm6, %xmm7 5119; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 5120; SSE2-NEXT: movdqa %xmm3, %xmm6 5121; SSE2-NEXT: psrad %xmm5, %xmm6 5122; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1] 5123; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7] 5124; SSE2-NEXT: movdqa %xmm3, %xmm7 5125; SSE2-NEXT: psrad %xmm5, %xmm7 5126; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] 5127; SSE2-NEXT: psrad %xmm4, %xmm3 5128; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0] 5129; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm6[0,3] 5130; SSE2-NEXT: retq 5131; 5132; SSE42-LABEL: ashr_v16i32_commute_swap: 5133; SSE42: # %bb.0: 5134; SSE42-NEXT: movdqa %xmm3, %xmm10 5135; SSE42-NEXT: movdqa %xmm2, %xmm9 5136; SSE42-NEXT: movdqa %xmm1, %xmm8 5137; SSE42-NEXT: movdqa %xmm0, %xmm3 5138; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 5139; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 5140; SSE42-NEXT: pslld $31, %xmm2 5141; SSE42-NEXT: psrad $31, %xmm2 5142; SSE42-NEXT: pandn %xmm7, %xmm2 5143; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] 5144; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 5145; SSE42-NEXT: pslld $31, %xmm1 5146; SSE42-NEXT: psrad $31, %xmm1 5147; SSE42-NEXT: pandn %xmm6, %xmm1 5148; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 5149; SSE42-NEXT: pslld $31, %xmm0 5150; SSE42-NEXT: psrad $31, %xmm0 5151; SSE42-NEXT: pandn %xmm5, %xmm0 5152; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] 5153; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 5154; SSE42-NEXT: pslld $31, %xmm3 5155; SSE42-NEXT: psrad $31, %xmm3 5156; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3 5157; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3] 5158; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 5159; SSE42-NEXT: movdqa %xmm0, %xmm7 5160; SSE42-NEXT: psrad %xmm6, %xmm7 5161; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,3,3,3,4,5,6,7] 5162; SSE42-NEXT: movdqa %xmm0, %xmm11 5163; SSE42-NEXT: psrad %xmm6, %xmm11 5164; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm7[4,5,6,7] 5165; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 5166; SSE42-NEXT: movdqa %xmm0, %xmm6 5167; SSE42-NEXT: psrad %xmm5, %xmm6 5168; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7] 5169; SSE42-NEXT: psrad %xmm5, %xmm0 5170; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7] 5171; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7] 5172; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3] 5173; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 5174; SSE42-NEXT: movdqa %xmm1, %xmm7 5175; SSE42-NEXT: psrad %xmm6, %xmm7 5176; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,3,3,3,4,5,6,7] 5177; SSE42-NEXT: movdqa %xmm1, %xmm8 5178; SSE42-NEXT: psrad %xmm6, %xmm8 5179; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] 5180; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 5181; SSE42-NEXT: movdqa %xmm1, %xmm6 5182; SSE42-NEXT: psrad %xmm5, %xmm6 5183; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7] 5184; SSE42-NEXT: psrad %xmm5, %xmm1 5185; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] 5186; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7] 5187; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3] 5188; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 5189; SSE42-NEXT: movdqa %xmm2, %xmm7 5190; SSE42-NEXT: psrad %xmm6, %xmm7 5191; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,3,3,3,4,5,6,7] 5192; SSE42-NEXT: movdqa %xmm2, %xmm8 5193; SSE42-NEXT: psrad %xmm6, %xmm8 5194; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] 5195; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 5196; SSE42-NEXT: movdqa %xmm2, %xmm6 5197; SSE42-NEXT: psrad %xmm5, %xmm6 5198; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7] 5199; SSE42-NEXT: psrad %xmm5, %xmm2 5200; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] 5201; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5],xmm8[6,7] 5202; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] 5203; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7] 5204; SSE42-NEXT: movdqa %xmm3, %xmm7 5205; SSE42-NEXT: psrad %xmm6, %xmm7 5206; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[2,3,3,3,4,5,6,7] 5207; SSE42-NEXT: movdqa %xmm3, %xmm8 5208; SSE42-NEXT: psrad %xmm6, %xmm8 5209; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7] 5210; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7] 5211; SSE42-NEXT: movdqa %xmm3, %xmm6 5212; SSE42-NEXT: psrad %xmm5, %xmm6 5213; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7] 5214; SSE42-NEXT: psrad %xmm4, %xmm3 5215; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5,6,7] 5216; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3],xmm3[4,5],xmm8[6,7] 5217; SSE42-NEXT: retq 5218; 5219; AVX2-LABEL: ashr_v16i32_commute_swap: 5220; AVX2: # %bb.0: 5221; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 5222; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 5223; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 5224; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 5225; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 5226; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 5227; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 5228; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 5229; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 5230; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 5231; AVX2-NEXT: vpsravd %ymm2, %ymm4, %ymm1 5232; AVX2-NEXT: retq 5233; 5234; AVX512-LABEL: ashr_v16i32_commute_swap: 5235; AVX512: # %bb.0: 5236; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 5237; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 5238; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 5239; AVX512-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} {z} 5240; AVX512-NEXT: vpsravd %zmm1, %zmm0, %zmm0 5241; AVX512-NEXT: retq 5242 %s = select <16 x i1> %b, <16 x i32> zeroinitializer, <16 x i32> %y 5243 %r = ashr <16 x i32> %s, %x 5244 ret <16 x i32> %r 5245} 5246 5247define <8 x i32> @ashr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef %x, <8 x i32> noundef %y) { 5248; SSE2-LABEL: ashr_v8i32_cast_cond: 5249; SSE2: # %bb.0: 5250; SSE2-NEXT: movd %edi, %xmm4 5251; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] 5252; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128] 5253; SSE2-NEXT: movdqa %xmm5, %xmm4 5254; SSE2-NEXT: pand %xmm6, %xmm4 5255; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 5256; SSE2-NEXT: pand %xmm3, %xmm4 5257; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8] 5258; SSE2-NEXT: pand %xmm3, %xmm5 5259; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 5260; SSE2-NEXT: pand %xmm2, %xmm5 5261; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] 5262; SSE2-NEXT: movdqa %xmm0, %xmm3 5263; SSE2-NEXT: psrad %xmm2, %xmm3 5264; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,1,1,1,4,5,6,7] 5265; SSE2-NEXT: movdqa %xmm0, %xmm2 5266; SSE2-NEXT: psrad %xmm6, %xmm2 5267; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 5268; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] 5269; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7] 5270; SSE2-NEXT: movdqa %xmm0, %xmm6 5271; SSE2-NEXT: psrad %xmm5, %xmm6 5272; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7] 5273; SSE2-NEXT: psrad %xmm3, %xmm0 5274; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1] 5275; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] 5276; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,3,3,3,4,5,6,7] 5277; SSE2-NEXT: movdqa %xmm1, %xmm5 5278; SSE2-NEXT: psrad %xmm0, %xmm5 5279; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,1,4,5,6,7] 5280; SSE2-NEXT: movdqa %xmm1, %xmm3 5281; SSE2-NEXT: psrad %xmm0, %xmm3 5282; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] 5283; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] 5284; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,3,3,3,4,5,6,7] 5285; SSE2-NEXT: movdqa %xmm1, %xmm5 5286; SSE2-NEXT: psrad %xmm4, %xmm5 5287; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7] 5288; SSE2-NEXT: psrad %xmm0, %xmm1 5289; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1] 5290; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3] 5291; SSE2-NEXT: movaps %xmm2, %xmm0 5292; SSE2-NEXT: movaps %xmm3, %xmm1 5293; SSE2-NEXT: retq 5294; 5295; SSE42-LABEL: ashr_v8i32_cast_cond: 5296; SSE42: # %bb.0: 5297; SSE42-NEXT: movd %edi, %xmm4 5298; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0] 5299; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = [16,32,64,128] 5300; SSE42-NEXT: movdqa %xmm5, %xmm4 5301; SSE42-NEXT: pand %xmm6, %xmm4 5302; SSE42-NEXT: pcmpeqd %xmm6, %xmm4 5303; SSE42-NEXT: pand %xmm3, %xmm4 5304; SSE42-NEXT: pmovsxbd {{.*#+}} xmm3 = [1,2,4,8] 5305; SSE42-NEXT: pand %xmm3, %xmm5 5306; SSE42-NEXT: pcmpeqd %xmm3, %xmm5 5307; SSE42-NEXT: pand %xmm2, %xmm5 5308; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7] 5309; SSE42-NEXT: movdqa %xmm0, %xmm3 5310; SSE42-NEXT: psrad %xmm2, %xmm3 5311; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] 5312; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,3,3,3,4,5,6,7] 5313; SSE42-NEXT: movdqa %xmm0, %xmm7 5314; SSE42-NEXT: psrad %xmm6, %xmm7 5315; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm3[0,1,2,3],xmm7[4,5,6,7] 5316; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7] 5317; SSE42-NEXT: movdqa %xmm0, %xmm5 5318; SSE42-NEXT: psrad %xmm3, %xmm5 5319; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] 5320; SSE42-NEXT: psrad %xmm2, %xmm0 5321; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] 5322; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7] 5323; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,3,3,3,4,5,6,7] 5324; SSE42-NEXT: movdqa %xmm1, %xmm3 5325; SSE42-NEXT: psrad %xmm2, %xmm3 5326; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3] 5327; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7] 5328; SSE42-NEXT: movdqa %xmm1, %xmm6 5329; SSE42-NEXT: psrad %xmm5, %xmm6 5330; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm3[0,1,2,3],xmm6[4,5,6,7] 5331; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,1,1,4,5,6,7] 5332; SSE42-NEXT: movdqa %xmm1, %xmm4 5333; SSE42-NEXT: psrad %xmm3, %xmm4 5334; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7] 5335; SSE42-NEXT: psrad %xmm2, %xmm1 5336; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] 5337; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] 5338; SSE42-NEXT: retq 5339; 5340; AVX2-LABEL: ashr_v8i32_cast_cond: 5341; AVX2: # %bb.0: 5342; AVX2-NEXT: vmovd %edi, %xmm2 5343; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 5344; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] 5345; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 5346; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 5347; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 5348; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 5349; AVX2-NEXT: retq 5350; 5351; AVX512F-LABEL: ashr_v8i32_cast_cond: 5352; AVX512F: # %bb.0: 5353; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 5354; AVX512F-NEXT: kmovw %edi, %k1 5355; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z} 5356; AVX512F-NEXT: vpsravd %ymm1, %ymm0, %ymm0 5357; AVX512F-NEXT: retq 5358; 5359; AVX512VL-LABEL: ashr_v8i32_cast_cond: 5360; AVX512VL: # %bb.0: 5361; AVX512VL-NEXT: kmovw %edi, %k1 5362; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 {%k1} 5363; AVX512VL-NEXT: retq 5364 %b = bitcast i8 %pb to <8 x i1> 5365 %s = select <8 x i1> %b, <8 x i32> %y, <8 x i32> zeroinitializer 5366 %r = ashr <8 x i32> %x, %s 5367 ret <8 x i32> %r 5368} 5369 5370define <8 x i64> @ashr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) { 5371; SSE2-LABEL: ashr_v8i64_cast_cond: 5372; SSE2: # %bb.0: 5373; SSE2-NEXT: movd %edi, %xmm8 5374; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] 5375; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,128] 5376; SSE2-NEXT: movdqa %xmm9, %xmm8 5377; SSE2-NEXT: pand %xmm10, %xmm8 5378; SSE2-NEXT: pcmpeqd %xmm10, %xmm8 5379; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,0,3,2] 5380; SSE2-NEXT: pand %xmm7, %xmm8 5381; SSE2-NEXT: pand %xmm10, %xmm8 5382; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32] 5383; SSE2-NEXT: movdqa %xmm9, %xmm7 5384; SSE2-NEXT: pand %xmm10, %xmm7 5385; SSE2-NEXT: pcmpeqd %xmm10, %xmm7 5386; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2] 5387; SSE2-NEXT: pand %xmm6, %xmm7 5388; SSE2-NEXT: pand %xmm10, %xmm7 5389; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [4,8] 5390; SSE2-NEXT: movdqa %xmm9, %xmm6 5391; SSE2-NEXT: pand %xmm10, %xmm6 5392; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 5393; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,0,3,2] 5394; SSE2-NEXT: pand %xmm5, %xmm6 5395; SSE2-NEXT: pand %xmm10, %xmm6 5396; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2] 5397; SSE2-NEXT: pand %xmm5, %xmm9 5398; SSE2-NEXT: pcmpeqd %xmm5, %xmm9 5399; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2] 5400; SSE2-NEXT: pand %xmm4, %xmm9 5401; SSE2-NEXT: pand %xmm5, %xmm9 5402; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] 5403; SSE2-NEXT: movdqa %xmm4, %xmm5 5404; SSE2-NEXT: psrlq %xmm9, %xmm5 5405; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] 5406; SSE2-NEXT: movdqa %xmm4, %xmm11 5407; SSE2-NEXT: psrlq %xmm10, %xmm11 5408; SSE2-NEXT: movsd {{.*#+}} xmm11 = xmm5[0],xmm11[1] 5409; SSE2-NEXT: movdqa %xmm0, %xmm5 5410; SSE2-NEXT: psrlq %xmm9, %xmm5 5411; SSE2-NEXT: psrlq %xmm10, %xmm0 5412; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1] 5413; SSE2-NEXT: xorpd %xmm11, %xmm0 5414; SSE2-NEXT: psubq %xmm11, %xmm0 5415; SSE2-NEXT: movdqa %xmm4, %xmm5 5416; SSE2-NEXT: psrlq %xmm6, %xmm5 5417; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,3,2,3] 5418; SSE2-NEXT: movdqa %xmm4, %xmm10 5419; SSE2-NEXT: psrlq %xmm9, %xmm10 5420; SSE2-NEXT: movsd {{.*#+}} xmm10 = xmm5[0],xmm10[1] 5421; SSE2-NEXT: movdqa %xmm1, %xmm5 5422; SSE2-NEXT: psrlq %xmm6, %xmm5 5423; SSE2-NEXT: psrlq %xmm9, %xmm1 5424; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1] 5425; SSE2-NEXT: xorpd %xmm10, %xmm1 5426; SSE2-NEXT: psubq %xmm10, %xmm1 5427; SSE2-NEXT: movdqa %xmm4, %xmm5 5428; SSE2-NEXT: psrlq %xmm7, %xmm5 5429; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,3,2,3] 5430; SSE2-NEXT: movdqa %xmm4, %xmm9 5431; SSE2-NEXT: psrlq %xmm6, %xmm9 5432; SSE2-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1] 5433; SSE2-NEXT: movdqa %xmm2, %xmm5 5434; SSE2-NEXT: psrlq %xmm7, %xmm5 5435; SSE2-NEXT: psrlq %xmm6, %xmm2 5436; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1] 5437; SSE2-NEXT: xorpd %xmm9, %xmm2 5438; SSE2-NEXT: psubq %xmm9, %xmm2 5439; SSE2-NEXT: movdqa %xmm4, %xmm5 5440; SSE2-NEXT: psrlq %xmm8, %xmm5 5441; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[2,3,2,3] 5442; SSE2-NEXT: psrlq %xmm6, %xmm4 5443; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1] 5444; SSE2-NEXT: movdqa %xmm3, %xmm5 5445; SSE2-NEXT: psrlq %xmm8, %xmm5 5446; SSE2-NEXT: psrlq %xmm6, %xmm3 5447; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1] 5448; SSE2-NEXT: xorpd %xmm4, %xmm3 5449; SSE2-NEXT: psubq %xmm4, %xmm3 5450; SSE2-NEXT: retq 5451; 5452; SSE42-LABEL: ashr_v8i64_cast_cond: 5453; SSE42: # %bb.0: 5454; SSE42-NEXT: movd %edi, %xmm8 5455; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1] 5456; SSE42-NEXT: pmovzxbq {{.*#+}} xmm10 = [64,128] 5457; SSE42-NEXT: movdqa %xmm9, %xmm8 5458; SSE42-NEXT: pand %xmm10, %xmm8 5459; SSE42-NEXT: pcmpeqq %xmm10, %xmm8 5460; SSE42-NEXT: pand %xmm7, %xmm8 5461; SSE42-NEXT: pmovsxbq {{.*#+}} xmm10 = [16,32] 5462; SSE42-NEXT: movdqa %xmm9, %xmm7 5463; SSE42-NEXT: pand %xmm10, %xmm7 5464; SSE42-NEXT: pcmpeqq %xmm10, %xmm7 5465; SSE42-NEXT: pand %xmm6, %xmm7 5466; SSE42-NEXT: pmovsxbq {{.*#+}} xmm10 = [4,8] 5467; SSE42-NEXT: movdqa %xmm9, %xmm6 5468; SSE42-NEXT: pand %xmm10, %xmm6 5469; SSE42-NEXT: pcmpeqq %xmm10, %xmm6 5470; SSE42-NEXT: pand %xmm5, %xmm6 5471; SSE42-NEXT: pmovsxbq {{.*#+}} xmm5 = [1,2] 5472; SSE42-NEXT: pand %xmm5, %xmm9 5473; SSE42-NEXT: pcmpeqq %xmm5, %xmm9 5474; SSE42-NEXT: pand %xmm4, %xmm9 5475; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] 5476; SSE42-NEXT: movdqa %xmm4, %xmm5 5477; SSE42-NEXT: psrlq %xmm9, %xmm5 5478; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] 5479; SSE42-NEXT: movdqa %xmm4, %xmm11 5480; SSE42-NEXT: psrlq %xmm10, %xmm11 5481; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm5[0,1,2,3],xmm11[4,5,6,7] 5482; SSE42-NEXT: movdqa %xmm0, %xmm5 5483; SSE42-NEXT: psrlq %xmm9, %xmm5 5484; SSE42-NEXT: psrlq %xmm10, %xmm0 5485; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] 5486; SSE42-NEXT: pxor %xmm11, %xmm0 5487; SSE42-NEXT: psubq %xmm11, %xmm0 5488; SSE42-NEXT: movdqa %xmm4, %xmm5 5489; SSE42-NEXT: psrlq %xmm6, %xmm5 5490; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,3,2,3] 5491; SSE42-NEXT: movdqa %xmm4, %xmm10 5492; SSE42-NEXT: psrlq %xmm9, %xmm10 5493; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm5[0,1,2,3],xmm10[4,5,6,7] 5494; SSE42-NEXT: movdqa %xmm1, %xmm5 5495; SSE42-NEXT: psrlq %xmm6, %xmm5 5496; SSE42-NEXT: psrlq %xmm9, %xmm1 5497; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7] 5498; SSE42-NEXT: pxor %xmm10, %xmm1 5499; SSE42-NEXT: psubq %xmm10, %xmm1 5500; SSE42-NEXT: movdqa %xmm4, %xmm5 5501; SSE42-NEXT: psrlq %xmm7, %xmm5 5502; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,3,2,3] 5503; SSE42-NEXT: movdqa %xmm4, %xmm9 5504; SSE42-NEXT: psrlq %xmm6, %xmm9 5505; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm5[0,1,2,3],xmm9[4,5,6,7] 5506; SSE42-NEXT: movdqa %xmm2, %xmm5 5507; SSE42-NEXT: psrlq %xmm7, %xmm5 5508; SSE42-NEXT: psrlq %xmm6, %xmm2 5509; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] 5510; SSE42-NEXT: pxor %xmm9, %xmm2 5511; SSE42-NEXT: psubq %xmm9, %xmm2 5512; SSE42-NEXT: movdqa %xmm4, %xmm5 5513; SSE42-NEXT: psrlq %xmm8, %xmm5 5514; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm8[2,3,2,3] 5515; SSE42-NEXT: psrlq %xmm6, %xmm4 5516; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 5517; SSE42-NEXT: movdqa %xmm3, %xmm5 5518; SSE42-NEXT: psrlq %xmm8, %xmm5 5519; SSE42-NEXT: psrlq %xmm6, %xmm3 5520; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] 5521; SSE42-NEXT: pxor %xmm4, %xmm3 5522; SSE42-NEXT: psubq %xmm4, %xmm3 5523; SSE42-NEXT: retq 5524; 5525; AVX2-LABEL: ashr_v8i64_cast_cond: 5526; AVX2: # %bb.0: 5527; AVX2-NEXT: vmovd %edi, %xmm4 5528; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 5529; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm5 = [16,32,64,128] 5530; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 5531; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 5532; AVX2-NEXT: vpand %ymm3, %ymm5, %ymm3 5533; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,4,8] 5534; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 5535; AVX2-NEXT: vpcmpeqq %ymm5, %ymm4, %ymm4 5536; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm2 5537; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] 5538; AVX2-NEXT: vpsrlvq %ymm2, %ymm4, %ymm5 5539; AVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0 5540; AVX2-NEXT: vpxor %ymm5, %ymm0, %ymm0 5541; AVX2-NEXT: vpsubq %ymm5, %ymm0, %ymm0 5542; AVX2-NEXT: vpsrlvq %ymm3, %ymm4, %ymm2 5543; AVX2-NEXT: vpsrlvq %ymm3, %ymm1, %ymm1 5544; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 5545; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm1 5546; AVX2-NEXT: retq 5547; 5548; AVX512-LABEL: ashr_v8i64_cast_cond: 5549; AVX512: # %bb.0: 5550; AVX512-NEXT: kmovw %edi, %k1 5551; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0 {%k1} 5552; AVX512-NEXT: retq 5553 %b = bitcast i8 %pb to <8 x i1> 5554 %s = select <8 x i1> %b, <8 x i64> %y, <8 x i64> zeroinitializer 5555 %r = ashr <8 x i64> %x, %s 5556 ret <8 x i64> %r 5557} 5558 5559define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x, <8 x i64> %y) { 5560; SSE2-LABEL: select_sdiv_neutral_constant_v8i64: 5561; SSE2: # %bb.0: 5562; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,2,2,2] 5563; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] 5564; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,2,2] 5565; SSE2-NEXT: pslld $31, %xmm8 5566; SSE2-NEXT: psrad $31, %xmm8 5567; SSE2-NEXT: movdqa %xmm8, %xmm10 5568; SSE2-NEXT: pandn %xmm7, %xmm10 5569; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,1] 5570; SSE2-NEXT: pand %xmm9, %xmm8 5571; SSE2-NEXT: por %xmm10, %xmm8 5572; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] 5573; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5] 5574; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,2,2] 5575; SSE2-NEXT: pslld $31, %xmm7 5576; SSE2-NEXT: psrad $31, %xmm7 5577; SSE2-NEXT: movdqa %xmm7, %xmm10 5578; SSE2-NEXT: pandn %xmm6, %xmm10 5579; SSE2-NEXT: pand %xmm9, %xmm7 5580; SSE2-NEXT: por %xmm10, %xmm7 5581; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0] 5582; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] 5583; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,2] 5584; SSE2-NEXT: pslld $31, %xmm6 5585; SSE2-NEXT: psrad $31, %xmm6 5586; SSE2-NEXT: movdqa %xmm6, %xmm10 5587; SSE2-NEXT: pandn %xmm5, %xmm10 5588; SSE2-NEXT: pand %xmm9, %xmm6 5589; SSE2-NEXT: por %xmm10, %xmm6 5590; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 5591; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 5592; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] 5593; SSE2-NEXT: pslld $31, %xmm5 5594; SSE2-NEXT: psrad $31, %xmm5 5595; SSE2-NEXT: pand %xmm5, %xmm9 5596; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5 5597; SSE2-NEXT: por %xmm9, %xmm5 5598; SSE2-NEXT: movq %xmm6, %rcx 5599; SSE2-NEXT: movq %xmm1, %rax 5600; SSE2-NEXT: cqto 5601; SSE2-NEXT: idivq %rcx 5602; SSE2-NEXT: movq %rax, %xmm0 5603; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] 5604; SSE2-NEXT: movq %xmm6, %rcx 5605; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 5606; SSE2-NEXT: movq %xmm1, %rax 5607; SSE2-NEXT: cqto 5608; SSE2-NEXT: idivq %rcx 5609; SSE2-NEXT: movq %rax, %xmm1 5610; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 5611; SSE2-NEXT: movq %xmm7, %rcx 5612; SSE2-NEXT: movq %xmm2, %rax 5613; SSE2-NEXT: cqto 5614; SSE2-NEXT: idivq %rcx 5615; SSE2-NEXT: movq %rax, %xmm1 5616; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,3,2,3] 5617; SSE2-NEXT: movq %xmm6, %rcx 5618; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 5619; SSE2-NEXT: movq %xmm2, %rax 5620; SSE2-NEXT: cqto 5621; SSE2-NEXT: idivq %rcx 5622; SSE2-NEXT: movq %rax, %xmm2 5623; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 5624; SSE2-NEXT: movq %xmm8, %rcx 5625; SSE2-NEXT: movq %xmm3, %rax 5626; SSE2-NEXT: cqto 5627; SSE2-NEXT: idivq %rcx 5628; SSE2-NEXT: movq %rax, %xmm2 5629; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[2,3,2,3] 5630; SSE2-NEXT: movq %xmm6, %rcx 5631; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] 5632; SSE2-NEXT: movq %xmm3, %rax 5633; SSE2-NEXT: cqto 5634; SSE2-NEXT: idivq %rcx 5635; SSE2-NEXT: movq %rax, %xmm3 5636; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 5637; SSE2-NEXT: movq %xmm5, %rcx 5638; SSE2-NEXT: movq %xmm4, %rax 5639; SSE2-NEXT: cqto 5640; SSE2-NEXT: idivq %rcx 5641; SSE2-NEXT: movq %rax, %xmm3 5642; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] 5643; SSE2-NEXT: movq %xmm5, %rcx 5644; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] 5645; SSE2-NEXT: movq %xmm4, %rax 5646; SSE2-NEXT: cqto 5647; SSE2-NEXT: idivq %rcx 5648; SSE2-NEXT: movq %rax, %xmm4 5649; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] 5650; SSE2-NEXT: retq 5651; 5652; SSE42-LABEL: select_sdiv_neutral_constant_v8i64: 5653; SSE42: # %bb.0: 5654; SSE42-NEXT: movdqa %xmm0, %xmm8 5655; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9 5656; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 5657; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 5658; SSE42-NEXT: psllq $63, %xmm0 5659; SSE42-NEXT: movapd {{.*#+}} xmm10 = [1,1] 5660; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm9 5661; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] 5662; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 5663; SSE42-NEXT: psllq $63, %xmm0 5664; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm7 5665; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] 5666; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 5667; SSE42-NEXT: psllq $63, %xmm0 5668; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm6 5669; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero 5670; SSE42-NEXT: psllq $63, %xmm0 5671; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm5 5672; SSE42-NEXT: pextrq $1, %xmm5, %rcx 5673; SSE42-NEXT: pextrq $1, %xmm1, %rax 5674; SSE42-NEXT: cqto 5675; SSE42-NEXT: idivq %rcx 5676; SSE42-NEXT: movq %rax, %xmm8 5677; SSE42-NEXT: movq %xmm5, %rcx 5678; SSE42-NEXT: movq %xmm1, %rax 5679; SSE42-NEXT: cqto 5680; SSE42-NEXT: idivq %rcx 5681; SSE42-NEXT: movq %rax, %xmm0 5682; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] 5683; SSE42-NEXT: pextrq $1, %xmm6, %rcx 5684; SSE42-NEXT: pextrq $1, %xmm2, %rax 5685; SSE42-NEXT: cqto 5686; SSE42-NEXT: idivq %rcx 5687; SSE42-NEXT: movq %rax, %xmm5 5688; SSE42-NEXT: movq %xmm6, %rcx 5689; SSE42-NEXT: movq %xmm2, %rax 5690; SSE42-NEXT: cqto 5691; SSE42-NEXT: idivq %rcx 5692; SSE42-NEXT: movq %rax, %xmm1 5693; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] 5694; SSE42-NEXT: pextrq $1, %xmm7, %rcx 5695; SSE42-NEXT: pextrq $1, %xmm3, %rax 5696; SSE42-NEXT: cqto 5697; SSE42-NEXT: idivq %rcx 5698; SSE42-NEXT: movq %rax, %xmm5 5699; SSE42-NEXT: movq %xmm7, %rcx 5700; SSE42-NEXT: movq %xmm3, %rax 5701; SSE42-NEXT: cqto 5702; SSE42-NEXT: idivq %rcx 5703; SSE42-NEXT: movq %rax, %xmm2 5704; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0] 5705; SSE42-NEXT: pextrq $1, %xmm9, %rcx 5706; SSE42-NEXT: pextrq $1, %xmm4, %rax 5707; SSE42-NEXT: cqto 5708; SSE42-NEXT: idivq %rcx 5709; SSE42-NEXT: movq %rax, %xmm5 5710; SSE42-NEXT: movq %xmm9, %rcx 5711; SSE42-NEXT: movq %xmm4, %rax 5712; SSE42-NEXT: cqto 5713; SSE42-NEXT: idivq %rcx 5714; SSE42-NEXT: movq %rax, %xmm3 5715; SSE42-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] 5716; SSE42-NEXT: retq 5717; 5718; AVX2-LABEL: select_sdiv_neutral_constant_v8i64: 5719; AVX2: # %bb.0: 5720; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero 5721; AVX2-NEXT: vpslld $31, %xmm5, %xmm5 5722; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5 5723; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1,1,1,1] 5724; AVX2-NEXT: vblendvpd %ymm5, %ymm6, %ymm3, %ymm5 5725; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 5726; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 5727; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 5728; AVX2-NEXT: vblendvpd %ymm0, %ymm6, %ymm4, %ymm3 5729; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm0 5730; AVX2-NEXT: vpextrq $1, %xmm0, %rcx 5731; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 5732; AVX2-NEXT: vpextrq $1, %xmm4, %rax 5733; AVX2-NEXT: cqto 5734; AVX2-NEXT: idivq %rcx 5735; AVX2-NEXT: vmovq %rax, %xmm6 5736; AVX2-NEXT: vmovq %xmm0, %rcx 5737; AVX2-NEXT: vmovq %xmm4, %rax 5738; AVX2-NEXT: cqto 5739; AVX2-NEXT: idivq %rcx 5740; AVX2-NEXT: vmovq %rax, %xmm0 5741; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] 5742; AVX2-NEXT: vpextrq $1, %xmm5, %rcx 5743; AVX2-NEXT: vpextrq $1, %xmm1, %rax 5744; AVX2-NEXT: cqto 5745; AVX2-NEXT: idivq %rcx 5746; AVX2-NEXT: vmovq %rax, %xmm4 5747; AVX2-NEXT: vmovq %xmm5, %rcx 5748; AVX2-NEXT: vmovq %xmm1, %rax 5749; AVX2-NEXT: cqto 5750; AVX2-NEXT: idivq %rcx 5751; AVX2-NEXT: vmovq %rax, %xmm1 5752; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] 5753; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 5754; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm1 5755; AVX2-NEXT: vpextrq $1, %xmm1, %rcx 5756; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 5757; AVX2-NEXT: vpextrq $1, %xmm4, %rax 5758; AVX2-NEXT: cqto 5759; AVX2-NEXT: idivq %rcx 5760; AVX2-NEXT: vmovq %rax, %xmm5 5761; AVX2-NEXT: vmovq %xmm1, %rcx 5762; AVX2-NEXT: vmovq %xmm4, %rax 5763; AVX2-NEXT: cqto 5764; AVX2-NEXT: idivq %rcx 5765; AVX2-NEXT: vmovq %rax, %xmm1 5766; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] 5767; AVX2-NEXT: vpextrq $1, %xmm3, %rcx 5768; AVX2-NEXT: vpextrq $1, %xmm2, %rax 5769; AVX2-NEXT: cqto 5770; AVX2-NEXT: idivq %rcx 5771; AVX2-NEXT: vmovq %rax, %xmm4 5772; AVX2-NEXT: vmovq %xmm3, %rcx 5773; AVX2-NEXT: vmovq %xmm2, %rax 5774; AVX2-NEXT: cqto 5775; AVX2-NEXT: idivq %rcx 5776; AVX2-NEXT: vmovq %rax, %xmm2 5777; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] 5778; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 5779; AVX2-NEXT: retq 5780; 5781; AVX512F-LABEL: select_sdiv_neutral_constant_v8i64: 5782; AVX512F: # %bb.0: 5783; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 5784; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 5785; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 5786; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 {%k1} = [1,1,1,1,1,1,1,1] 5787; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm0 5788; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx 5789; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm3 5790; AVX512F-NEXT: vpextrq $1, %xmm3, %rax 5791; AVX512F-NEXT: cqto 5792; AVX512F-NEXT: idivq %rcx 5793; AVX512F-NEXT: vmovq %rax, %xmm4 5794; AVX512F-NEXT: vmovq %xmm0, %rcx 5795; AVX512F-NEXT: vmovq %xmm3, %rax 5796; AVX512F-NEXT: cqto 5797; AVX512F-NEXT: idivq %rcx 5798; AVX512F-NEXT: vmovq %rax, %xmm0 5799; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] 5800; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm3 5801; AVX512F-NEXT: vpextrq $1, %xmm3, %rcx 5802; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm4 5803; AVX512F-NEXT: vpextrq $1, %xmm4, %rax 5804; AVX512F-NEXT: cqto 5805; AVX512F-NEXT: idivq %rcx 5806; AVX512F-NEXT: vmovq %rax, %xmm5 5807; AVX512F-NEXT: vmovq %xmm3, %rcx 5808; AVX512F-NEXT: vmovq %xmm4, %rax 5809; AVX512F-NEXT: cqto 5810; AVX512F-NEXT: idivq %rcx 5811; AVX512F-NEXT: vmovq %rax, %xmm3 5812; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] 5813; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 5814; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 5815; AVX512F-NEXT: vpextrq $1, %xmm3, %rcx 5816; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4 5817; AVX512F-NEXT: vpextrq $1, %xmm4, %rax 5818; AVX512F-NEXT: cqto 5819; AVX512F-NEXT: idivq %rcx 5820; AVX512F-NEXT: vmovq %rax, %xmm5 5821; AVX512F-NEXT: vmovq %xmm3, %rcx 5822; AVX512F-NEXT: vmovq %xmm4, %rax 5823; AVX512F-NEXT: cqto 5824; AVX512F-NEXT: idivq %rcx 5825; AVX512F-NEXT: vmovq %rax, %xmm3 5826; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] 5827; AVX512F-NEXT: vpextrq $1, %xmm2, %rcx 5828; AVX512F-NEXT: vpextrq $1, %xmm1, %rax 5829; AVX512F-NEXT: cqto 5830; AVX512F-NEXT: idivq %rcx 5831; AVX512F-NEXT: vmovq %rax, %xmm4 5832; AVX512F-NEXT: vmovq %xmm2, %rcx 5833; AVX512F-NEXT: vmovq %xmm1, %rax 5834; AVX512F-NEXT: cqto 5835; AVX512F-NEXT: idivq %rcx 5836; AVX512F-NEXT: vmovq %rax, %xmm1 5837; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] 5838; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 5839; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 5840; AVX512F-NEXT: retq 5841; 5842; AVX512VL-LABEL: select_sdiv_neutral_constant_v8i64: 5843; AVX512VL: # %bb.0: 5844; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 5845; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 5846; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 5847; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm2 {%k1} = [1,1,1,1,1,1,1,1] 5848; AVX512VL-NEXT: vextracti32x4 $3, %zmm2, %xmm0 5849; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx 5850; AVX512VL-NEXT: vextracti32x4 $3, %zmm1, %xmm3 5851; AVX512VL-NEXT: vpextrq $1, %xmm3, %rax 5852; AVX512VL-NEXT: cqto 5853; AVX512VL-NEXT: idivq %rcx 5854; AVX512VL-NEXT: vmovq %rax, %xmm4 5855; AVX512VL-NEXT: vmovq %xmm0, %rcx 5856; AVX512VL-NEXT: vmovq %xmm3, %rax 5857; AVX512VL-NEXT: cqto 5858; AVX512VL-NEXT: idivq %rcx 5859; AVX512VL-NEXT: vmovq %rax, %xmm0 5860; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] 5861; AVX512VL-NEXT: vextracti32x4 $2, %zmm2, %xmm3 5862; AVX512VL-NEXT: vpextrq $1, %xmm3, %rcx 5863; AVX512VL-NEXT: vextracti32x4 $2, %zmm1, %xmm4 5864; AVX512VL-NEXT: vpextrq $1, %xmm4, %rax 5865; AVX512VL-NEXT: cqto 5866; AVX512VL-NEXT: idivq %rcx 5867; AVX512VL-NEXT: vmovq %rax, %xmm5 5868; AVX512VL-NEXT: vmovq %xmm3, %rcx 5869; AVX512VL-NEXT: vmovq %xmm4, %rax 5870; AVX512VL-NEXT: cqto 5871; AVX512VL-NEXT: idivq %rcx 5872; AVX512VL-NEXT: vmovq %rax, %xmm3 5873; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] 5874; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 5875; AVX512VL-NEXT: vextracti128 $1, %ymm2, %xmm3 5876; AVX512VL-NEXT: vpextrq $1, %xmm3, %rcx 5877; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm4 5878; AVX512VL-NEXT: vpextrq $1, %xmm4, %rax 5879; AVX512VL-NEXT: cqto 5880; AVX512VL-NEXT: idivq %rcx 5881; AVX512VL-NEXT: vmovq %rax, %xmm5 5882; AVX512VL-NEXT: vmovq %xmm3, %rcx 5883; AVX512VL-NEXT: vmovq %xmm4, %rax 5884; AVX512VL-NEXT: cqto 5885; AVX512VL-NEXT: idivq %rcx 5886; AVX512VL-NEXT: vmovq %rax, %xmm3 5887; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] 5888; AVX512VL-NEXT: vpextrq $1, %xmm2, %rcx 5889; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax 5890; AVX512VL-NEXT: cqto 5891; AVX512VL-NEXT: idivq %rcx 5892; AVX512VL-NEXT: vmovq %rax, %xmm4 5893; AVX512VL-NEXT: vmovq %xmm2, %rcx 5894; AVX512VL-NEXT: vmovq %xmm1, %rax 5895; AVX512VL-NEXT: cqto 5896; AVX512VL-NEXT: idivq %rcx 5897; AVX512VL-NEXT: vmovq %rax, %xmm1 5898; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] 5899; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 5900; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 5901; AVX512VL-NEXT: retq 5902 %sel = select <8 x i1> %b, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64> %y 5903 %r = sdiv <8 x i64> %x, %sel 5904 ret <8 x i64> %r 5905} 5906