1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle 2; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle %s -o - | FileCheck --check-prefixes=CHECK,CHECK-FAST %s 3; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-perlane-shuffle %s -o - | FileCheck --check-prefixes=CHECK,CHECK-FAST-PERLANE %s 4 5; FIXME: All cases here should be fixed by PR34380 6 7define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) { 8; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0: 9; CHECK: # %bb.0: 10; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [8,6,12,4,7,9,14,8] 11; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 12; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 13; CHECK-NEXT: vzeroupper 14; CHECK-NEXT: retq 15 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8> 16 ret <8 x i16> %res 17} 18define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { 19; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0: 20; CHECK: # %bb.0: 21; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 22; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [8,6,12,4,7,9,14,8] 23; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 24; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} 25; CHECK-NEXT: vmovdqa %xmm1, %xmm0 26; CHECK-NEXT: vzeroupper 27; CHECK-NEXT: retq 28 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8> 29 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 30 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 31 ret <8 x i16> %res 32} 33 34define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) { 35; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0: 36; CHECK: # %bb.0: 37; 
CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [8,6,12,4,7,9,14,8] 38; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 39; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} 40; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 41; CHECK-NEXT: vzeroupper 42; CHECK-NEXT: retq 43 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8> 44 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 45 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 46 ret <8 x i16> %res 47} 48define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { 49; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1: 50; CHECK: # %bb.0: 51; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 52; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [4,12,9,4,14,15,12,14] 53; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 54; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} 55; CHECK-NEXT: vmovdqa %xmm1, %xmm0 56; CHECK-NEXT: vzeroupper 57; CHECK-NEXT: retq 58 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14> 59 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 60 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 61 ret <8 x i16> %res 62} 63 64define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) { 65; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1: 66; CHECK: # %bb.0: 67; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [4,12,9,4,14,15,12,14] 68; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 69; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} 70; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 71; CHECK-NEXT: vzeroupper 72; CHECK-NEXT: retq 73 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14> 74 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 75 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> 
zeroinitializer 76 ret <8 x i16> %res 77} 78define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { 79; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2: 80; CHECK: # %bb.0: 81; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 82; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [4,11,14,10,7,1,6,9] 83; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 84; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} 85; CHECK-NEXT: vmovdqa %xmm1, %xmm0 86; CHECK-NEXT: vzeroupper 87; CHECK-NEXT: retq 88 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9> 89 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 90 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 91 ret <8 x i16> %res 92} 93 94define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) { 95; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2: 96; CHECK: # %bb.0: 97; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [4,11,14,10,7,1,6,9] 98; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 99; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} 100; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 101; CHECK-NEXT: vzeroupper 102; CHECK-NEXT: retq 103 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9> 104 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 105 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 106 ret <8 x i16> %res 107} 108define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) { 109; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3: 110; CHECK: # %bb.0: 111; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [14,15,7,13,4,12,8,0] 112; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 113; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 114; CHECK-NEXT: vzeroupper 115; CHECK-NEXT: retq 116 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 
13, i32 4, i32 12, i32 8, i32 0> 117 ret <8 x i16> %res 118} 119define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { 120; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3: 121; CHECK: # %bb.0: 122; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 123; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [14,15,7,13,4,12,8,0] 124; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 125; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} 126; CHECK-NEXT: vmovdqa %xmm1, %xmm0 127; CHECK-NEXT: vzeroupper 128; CHECK-NEXT: retq 129 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0> 130 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 131 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 132 ret <8 x i16> %res 133} 134 135define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) { 136; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3: 137; CHECK: # %bb.0: 138; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [14,15,7,13,4,12,8,0] 139; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 140; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} 141; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 142; CHECK-NEXT: vzeroupper 143; CHECK-NEXT: retq 144 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0> 145 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 146 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 147 ret <8 x i16> %res 148} 149define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(ptr %vp) { 150; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0: 151; CHECK: # %bb.0: 152; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,7,13,3,5,13,3,9] 153; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0 154; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 155; CHECK-NEXT: vzeroupper 156; CHECK-NEXT: retq 157 %vec = load <16 x i16>, ptr %vp 158 %res = 
shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9> 159 ret <8 x i16> %res 160} 161define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { 162; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0: 163; CHECK: # %bb.0: 164; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 165; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,13,3,5,13,3,9] 166; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 167; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} 168; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 169; CHECK-NEXT: vzeroupper 170; CHECK-NEXT: retq 171 %vec = load <16 x i16>, ptr %vp 172 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9> 173 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 174 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 175 ret <8 x i16> %res 176} 177 178define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) { 179; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0: 180; CHECK: # %bb.0: 181; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,7,13,3,5,13,3,9] 182; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 183; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} 184; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 185; CHECK-NEXT: vzeroupper 186; CHECK-NEXT: retq 187 %vec = load <16 x i16>, ptr %vp 188 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9> 189 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 190 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 191 ret <8 x i16> %res 192} 193 194define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { 195; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1: 196; CHECK: # %bb.0: 197; CHECK-NEXT: # kill: def $xmm0 killed 
$xmm0 def $ymm0 198; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [3,15,12,7,1,5,8,14] 199; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 200; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} 201; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 202; CHECK-NEXT: vzeroupper 203; CHECK-NEXT: retq 204 %vec = load <16 x i16>, ptr %vp 205 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14> 206 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 207 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 208 ret <8 x i16> %res 209} 210 211define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) { 212; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1: 213; CHECK: # %bb.0: 214; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [3,15,12,7,1,5,8,14] 215; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 216; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} 217; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 218; CHECK-NEXT: vzeroupper 219; CHECK-NEXT: retq 220 %vec = load <16 x i16>, ptr %vp 221 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14> 222 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 223 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 224 ret <8 x i16> %res 225} 226 227define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { 228; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2: 229; CHECK: # %bb.0: 230; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 231; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1] 232; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm3 233; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 234; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} 235; CHECK-NEXT: retq 236 %vec = load <16 x i16>, ptr %vp 237 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, 
i32 9> 238 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 239 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 240 ret <8 x i16> %res 241} 242 243define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) { 244; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2: 245; CHECK: # %bb.0: 246; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 247; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1] 248; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 249; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z} 250; CHECK-NEXT: vmovdqa %xmm1, %xmm0 251; CHECK-NEXT: retq 252 %vec = load <16 x i16>, ptr %vp 253 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9> 254 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 255 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 256 ret <8 x i16> %res 257} 258 259define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(ptr %vp) { 260; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3: 261; CHECK: # %bb.0: 262; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [9,7,9,6,9,4,3,2] 263; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0 264; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 265; CHECK-NEXT: vzeroupper 266; CHECK-NEXT: retq 267 %vec = load <16 x i16>, ptr %vp 268 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2> 269 ret <8 x i16> %res 270} 271define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { 272; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3: 273; CHECK: # %bb.0: 274; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 275; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [9,7,9,6,9,4,3,2] 276; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 277; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} 278; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 279; CHECK-NEXT: vzeroupper 280; CHECK-NEXT: retq 
281 %vec = load <16 x i16>, ptr %vp 282 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2> 283 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 284 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 285 ret <8 x i16> %res 286} 287 288define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) { 289; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3: 290; CHECK: # %bb.0: 291; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [9,7,9,6,9,4,3,2] 292; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 293; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} 294; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 295; CHECK-NEXT: vzeroupper 296; CHECK-NEXT: retq 297 %vec = load <16 x i16>, ptr %vp 298 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2> 299 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 300 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 301 ret <8 x i16> %res 302} 303 304define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) { 305; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0: 306; CHECK: # %bb.0: 307; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 308; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] 309; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1 310; CHECK-NEXT: vmovdqa %ymm1, %ymm0 311; CHECK-NEXT: retq 312 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18> 313 ret <16 x i16> %res 314} 315define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { 316; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0: 317; CHECK: # %bb.0: 318; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 319; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = 
[8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] 320; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4 321; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 322; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} 323; CHECK-NEXT: retq 324 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18> 325 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 326 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 327 ret <16 x i16> %res 328} 329 330define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) { 331; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0: 332; CHECK: # %bb.0: 333; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 334; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2] 335; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 336; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z} 337; CHECK-NEXT: vmovdqa %ymm2, %ymm0 338; CHECK-NEXT: retq 339 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18> 340 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 341 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer 342 ret <16 x i16> %res 343} 344define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { 345; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1: 346; CHECK: # %bb.0: 347; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 348; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26] 349; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4 350; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 351; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} 352; CHECK-NEXT: retq 353 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, 
i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10> 354 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 355 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 356 ret <16 x i16> %res 357} 358 359define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) { 360; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1: 361; CHECK: # %bb.0: 362; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 363; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26] 364; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 365; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z} 366; CHECK-NEXT: vmovdqa %ymm2, %ymm0 367; CHECK-NEXT: retq 368 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10> 369 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 370 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer 371 ret <16 x i16> %res 372} 373define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { 374; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2: 375; CHECK: # %bb.0: 376; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 377; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15] 378; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4 379; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 380; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1} 381; CHECK-NEXT: retq 382 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31> 383 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 384 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 385 ret <16 x i16> %res 386} 387 388define <16 x i16> 
@test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) { 389; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2: 390; CHECK: # %bb.0: 391; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 392; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15] 393; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 394; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z} 395; CHECK-NEXT: vmovdqa %ymm2, %ymm0 396; CHECK-NEXT: retq 397 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31> 398 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 399 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer 400 ret <16 x i16> %res 401} 402define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) { 403; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3: 404; CHECK: # %bb.0: 405; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] 406; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 407; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 408; CHECK-NEXT: retq 409 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5> 410 ret <16 x i16> %res 411} 412define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { 413; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3: 414; CHECK: # %bb.0: 415; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 416; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] 417; CHECK-NEXT: vptestnmw %ymm2, %ymm2, %k1 418; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} 419; CHECK-NEXT: vmovdqa %ymm1, %ymm0 420; CHECK-NEXT: retq 421 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, 
i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5> 422 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 423 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 424 ret <16 x i16> %res 425} 426 427define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) { 428; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3: 429; CHECK: # %bb.0: 430; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5] 431; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 432; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} 433; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 434; CHECK-NEXT: retq 435 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5> 436 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 437 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer 438 ret <16 x i16> %res 439} 440define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) { 441; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0: 442; CHECK: # %bb.0: 443; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14] 444; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 445; CHECK-NEXT: vpermt2w %ymm0, %ymm2, %ymm1 446; CHECK-NEXT: vmovdqa %xmm1, %xmm0 447; CHECK-NEXT: vzeroupper 448; CHECK-NEXT: retq 449 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30> 450 ret <8 x i16> %res 451} 452define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { 453; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0: 454; CHECK: # %bb.0: 455; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14] 456; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4 457; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm4 458; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 459; CHECK-NEXT: 
vpblendmw %xmm4, %xmm1, %xmm0 {%k1} 460; CHECK-NEXT: vzeroupper 461; CHECK-NEXT: retq 462 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30> 463 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 464 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 465 ret <8 x i16> %res 466} 467 468define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) { 469; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0: 470; CHECK: # %bb.0: 471; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14] 472; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 473; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 474; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm2 {%k1} {z} 475; CHECK-NEXT: vmovdqa %xmm2, %xmm0 476; CHECK-NEXT: vzeroupper 477; CHECK-NEXT: retq 478 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30> 479 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 480 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 481 ret <8 x i16> %res 482} 483define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { 484; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1: 485; CHECK: # %bb.0: 486; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 487; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5] 488; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 489; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} 490; CHECK-NEXT: vmovdqa %xmm1, %xmm0 491; CHECK-NEXT: vzeroupper 492; CHECK-NEXT: retq 493 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5> 494 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 495 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 496 ret <8 x i16> %res 497} 498 499define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> 
%vec, <8 x i16> %mask) { 500; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1: 501; CHECK: # %bb.0: 502; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5] 503; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 504; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} 505; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 506; CHECK-NEXT: vzeroupper 507; CHECK-NEXT: retq 508 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5> 509 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 510 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 511 ret <8 x i16> %res 512} 513define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { 514; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2: 515; CHECK: # %bb.0: 516; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 517; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8] 518; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 519; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} 520; CHECK-NEXT: vmovdqa %xmm1, %xmm0 521; CHECK-NEXT: vzeroupper 522; CHECK-NEXT: retq 523 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8> 524 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 525 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 526 ret <8 x i16> %res 527} 528 529define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) { 530; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2: 531; CHECK: # %bb.0: 532; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8] 533; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 534; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} 535; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 536; CHECK-NEXT: vzeroupper 537; CHECK-NEXT: retq 538 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, 
i32 18, i32 16, i32 9, i32 11, i32 26, i32 8> 539 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 540 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 541 ret <8 x i16> %res 542} 543define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) { 544; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3: 545; CHECK: # %bb.0: 546; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [17,0,23,10,1,8,7,30] 547; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 548; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 549; CHECK-NEXT: vzeroupper 550; CHECK-NEXT: retq 551 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30> 552 ret <8 x i16> %res 553} 554define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { 555; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3: 556; CHECK: # %bb.0: 557; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 558; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30] 559; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1 560; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} 561; CHECK-NEXT: vmovdqa %xmm1, %xmm0 562; CHECK-NEXT: vzeroupper 563; CHECK-NEXT: retq 564 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30> 565 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 566 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 567 ret <8 x i16> %res 568} 569 570define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) { 571; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3: 572; CHECK: # %bb.0: 573; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30] 574; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 575; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} 576; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 577; CHECK-NEXT: vzeroupper 578; CHECK-NEXT: retq 579 %shuf = shufflevector <32 x i16> 
%vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30> 580 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 581 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 582 ret <8 x i16> %res 583} 584define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(ptr %vp) { 585; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0: 586; CHECK: # %bb.0: 587; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] 588; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 589; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 590; CHECK-NEXT: retq 591 %vec = load <32 x i16>, ptr %vp 592 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12> 593 ret <16 x i16> %res 594} 595define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) { 596; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0: 597; CHECK: # %bb.0: 598; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 599; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] 600; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 601; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} 602; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 603; CHECK-NEXT: retq 604 %vec = load <32 x i16>, ptr %vp 605 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12> 606 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 607 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 608 ret <16 x i16> %res 609} 610 611define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(ptr %vp, <16 x i16> %mask) { 612; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0: 613; CHECK: # %bb.0: 614; CHECK-NEXT: 
vpmovsxbw {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12] 615; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 616; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} 617; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 618; CHECK-NEXT: retq 619 %vec = load <32 x i16>, ptr %vp 620 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12> 621 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 622 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer 623 ret <16 x i16> %res 624} 625 626define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) { 627; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1: 628; CHECK: # %bb.0: 629; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 630; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25] 631; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 632; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} 633; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 634; CHECK-NEXT: retq 635 %vec = load <32 x i16>, ptr %vp 636 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25> 637 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 638 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 639 ret <16 x i16> %res 640} 641 642define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(ptr %vp, <16 x i16> %mask) { 643; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1: 644; CHECK: # %bb.0: 645; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25] 646; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 647; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} 648; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 
649; CHECK-NEXT: retq 650 %vec = load <32 x i16>, ptr %vp 651 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25> 652 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 653 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer 654 ret <16 x i16> %res 655} 656 657define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) { 658; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2: 659; CHECK: # %bb.0: 660; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 661; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm3 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0] 662; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3 663; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 664; CHECK-NEXT: vmovdqu16 %ymm3, %ymm0 {%k1} 665; CHECK-NEXT: retq 666 %vec = load <32 x i16>, ptr %vp 667 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16> 668 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 669 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 670 ret <16 x i16> %res 671} 672 673define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(ptr %vp, <16 x i16> %mask) { 674; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2: 675; CHECK: # %bb.0: 676; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 677; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0] 678; CHECK-NEXT: vptestnmw %ymm0, %ymm0, %k1 679; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm1 {%k1} {z} 680; CHECK-NEXT: vmovdqa %ymm1, %ymm0 681; CHECK-NEXT: retq 682 %vec = load <32 x i16>, ptr %vp 683 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16> 
684 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 685 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer 686 ret <16 x i16> %res 687} 688 689define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(ptr %vp) { 690; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3: 691; CHECK: # %bb.0: 692; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] 693; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 694; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 695; CHECK-NEXT: retq 696 %vec = load <32 x i16>, ptr %vp 697 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16> 698 ret <16 x i16> %res 699} 700define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %vec2, <16 x i16> %mask) { 701; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3: 702; CHECK: # %bb.0: 703; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 704; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm2 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] 705; CHECK-NEXT: vptestnmw %ymm1, %ymm1, %k1 706; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} 707; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 708; CHECK-NEXT: retq 709 %vec = load <32 x i16>, ptr %vp 710 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16> 711 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 712 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 713 ret <16 x i16> %res 714} 715 716define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(ptr %vp, <16 x i16> %mask) { 717; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3: 718; CHECK: # %bb.0: 719; CHECK-NEXT: vpmovsxbw {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16] 720; CHECK-NEXT: vptestnmw %ymm0, %ymm0, 
%k1 721; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} 722; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 723; CHECK-NEXT: retq 724 %vec = load <32 x i16>, ptr %vp 725 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16> 726 %cmp = icmp eq <16 x i16> %mask, zeroinitializer 727 %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer 728 ret <16 x i16> %res 729} 730 731define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(ptr %vp) { 732; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0: 733; CHECK: # %bb.0: 734; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17] 735; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0 736; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm0 737; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 738; CHECK-NEXT: vzeroupper 739; CHECK-NEXT: retq 740 %vec = load <32 x i16>, ptr %vp 741 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1> 742 ret <8 x i16> %res 743} 744define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { 745; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0: 746; CHECK: # %bb.0: 747; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17] 748; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3 749; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3 750; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 751; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} 752; CHECK-NEXT: vzeroupper 753; CHECK-NEXT: retq 754 %vec = load <32 x i16>, ptr %vp 755 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1> 756 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 757 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 758 ret <8 x i16> %res 759} 760 761define <8 x i16> 
@test_masked_z_32xi16_to_8xi16_perm_mem_mask0(ptr %vp, <8 x i16> %mask) { 762; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0: 763; CHECK: # %bb.0: 764; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17] 765; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 766; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 767; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z} 768; CHECK-NEXT: vmovdqa %xmm1, %xmm0 769; CHECK-NEXT: vzeroupper 770; CHECK-NEXT: retq 771 %vec = load <32 x i16>, ptr %vp 772 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1> 773 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 774 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 775 ret <8 x i16> %res 776} 777 778define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { 779; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1: 780; CHECK: # %bb.0: 781; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] 782; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3 783; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3 784; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 785; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} 786; CHECK-NEXT: vzeroupper 787; CHECK-NEXT: retq 788 %vec = load <32 x i16>, ptr %vp 789 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17> 790 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 791 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 792 ret <8 x i16> %res 793} 794 795define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(ptr %vp, <8 x i16> %mask) { 796; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1: 797; CHECK: # %bb.0: 798; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1] 799; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 800; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 801; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z} 802; CHECK-NEXT: 
vmovdqa %xmm1, %xmm0 803; CHECK-NEXT: vzeroupper 804; CHECK-NEXT: retq 805 %vec = load <32 x i16>, ptr %vp 806 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17> 807 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 808 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 809 ret <8 x i16> %res 810} 811 812define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { 813; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2: 814; CHECK: # %bb.0: 815; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 816; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10] 817; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 818; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} 819; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 820; CHECK-NEXT: vzeroupper 821; CHECK-NEXT: retq 822 %vec = load <32 x i16>, ptr %vp 823 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10> 824 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 825 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 826 ret <8 x i16> %res 827} 828 829define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(ptr %vp, <8 x i16> %mask) { 830; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2: 831; CHECK: # %bb.0: 832; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [6,18,0,4,10,25,22,10] 833; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 834; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} 835; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 836; CHECK-NEXT: vzeroupper 837; CHECK-NEXT: retq 838 %vec = load <32 x i16>, ptr %vp 839 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10> 840 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 841 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 
842 ret <8 x i16> %res 843} 844 845define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(ptr %vp) { 846; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3: 847; CHECK: # %bb.0: 848; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm0 = [19,1,5,31,9,12,17,9] 849; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 850; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 851; CHECK-NEXT: vzeroupper 852; CHECK-NEXT: retq 853 %vec = load <32 x i16>, ptr %vp 854 %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9> 855 ret <8 x i16> %res 856} 857define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %vec2, <8 x i16> %mask) { 858; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3: 859; CHECK: # %bb.0: 860; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 861; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9] 862; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 863; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} 864; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 865; CHECK-NEXT: vzeroupper 866; CHECK-NEXT: retq 867 %vec = load <32 x i16>, ptr %vp 868 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9> 869 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 870 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 871 ret <8 x i16> %res 872} 873 874define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(ptr %vp, <8 x i16> %mask) { 875; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3: 876; CHECK: # %bb.0: 877; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9] 878; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 879; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} 880; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 881; CHECK-NEXT: vzeroupper 882; CHECK-NEXT: retq 883 %vec = load <32 x i16>, ptr %vp 884 %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 
19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9> 885 %cmp = icmp eq <8 x i16> %mask, zeroinitializer 886 %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer 887 ret <8 x i16> %res 888} 889 890define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) { 891; CHECK-LABEL: test_16xi16_to_8xi16_E84C94EF: 892; CHECK: # %bb.0: 893; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [14,8,4,12,9,4,14,15] 894; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 895; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 896; CHECK-NEXT: vzeroupper 897; CHECK-NEXT: retq 898 %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15> 899 ret <8 x i16> %res 900} 901 902define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) { 903; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0: 904; CHECK: # %bb.0: 905; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,0,3,2] 906; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 907; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 908; CHECK-NEXT: vzeroupper 909; CHECK-NEXT: retq 910 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2> 911 ret <4 x i32> %res 912} 913define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { 914; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0: 915; CHECK: # %bb.0: 916; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 917; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [4,0,3,2] 918; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 919; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} 920; CHECK-NEXT: vmovdqa %xmm1, %xmm0 921; CHECK-NEXT: vzeroupper 922; CHECK-NEXT: retq 923 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2> 924 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 925 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 926 ret <4 x i32> %res 927} 928 929define <4 x i32> 
@test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) { 930; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0: 931; CHECK: # %bb.0: 932; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,0,3,2] 933; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 934; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} 935; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 936; CHECK-NEXT: vzeroupper 937; CHECK-NEXT: retq 938 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2> 939 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 940 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 941 ret <4 x i32> %res 942} 943define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { 944; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1: 945; CHECK: # %bb.0: 946; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 947; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,0,7,3] 948; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 949; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} 950; CHECK-NEXT: vmovdqa %xmm1, %xmm0 951; CHECK-NEXT: vzeroupper 952; CHECK-NEXT: retq 953 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3> 954 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 955 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 956 ret <4 x i32> %res 957} 958 959define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) { 960; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1: 961; CHECK: # %bb.0: 962; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,0,7,3] 963; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 964; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} 965; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 966; CHECK-NEXT: vzeroupper 967; CHECK-NEXT: retq 968 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3> 969 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 970 
%res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 971 ret <4 x i32> %res 972} 973define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { 974; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask2: 975; CHECK: # %bb.0: 976; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3] 977; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 978; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} 979; CHECK-NEXT: vzeroupper 980; CHECK-NEXT: retq 981 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3> 982 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 983 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 984 ret <4 x i32> %res 985} 986 987define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %mask) { 988; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask2: 989; CHECK: # %bb.0: 990; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3] 991; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 992; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 993; CHECK-NEXT: vzeroupper 994; CHECK-NEXT: retq 995 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3> 996 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 997 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 998 ret <4 x i32> %res 999} 1000define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) { 1001; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3: 1002; CHECK: # %bb.0: 1003; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,3,2,5] 1004; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 1005; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1006; CHECK-NEXT: vzeroupper 1007; CHECK-NEXT: retq 1008 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5> 1009 ret <4 x i32> %res 1010} 1011define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { 1012; CHECK-LABEL: 
test_masked_8xi32_to_4xi32_perm_mask3: 1013; CHECK: # %bb.0: 1014; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1015; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,3,2,5] 1016; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 1017; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} 1018; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1019; CHECK-NEXT: vzeroupper 1020; CHECK-NEXT: retq 1021 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5> 1022 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1023 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1024 ret <4 x i32> %res 1025} 1026 1027define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) { 1028; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3: 1029; CHECK: # %bb.0: 1030; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,3,2,5] 1031; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1032; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} 1033; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1034; CHECK-NEXT: vzeroupper 1035; CHECK-NEXT: retq 1036 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5> 1037 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1038 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1039 ret <4 x i32> %res 1040} 1041define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(ptr %vp) { 1042; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0: 1043; CHECK: # %bb.0: 1044; CHECK-NEXT: vmovaps 16(%rdi), %xmm0 1045; CHECK-NEXT: vshufps $7, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[3,1],mem[0,0] 1046; CHECK-NEXT: retq 1047 %vec = load <8 x i32>, ptr %vp 1048 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0> 1049 ret <4 x i32> %res 1050} 1051define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { 1052; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0: 1053; CHECK: # %bb.0: 1054; CHECK-NEXT: vmovaps 
16(%rdi), %xmm2 1055; CHECK-NEXT: vshufps $7, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[3,1],mem[0,0] 1056; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1057; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} 1058; CHECK-NEXT: retq 1059 %vec = load <8 x i32>, ptr %vp 1060 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0> 1061 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1062 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1063 ret <4 x i32> %res 1064} 1065 1066define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) { 1067; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0: 1068; CHECK: # %bb.0: 1069; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 1070; CHECK-NEXT: vshufps $7, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[3,1],mem[0,0] 1071; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 1072; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} 1073; CHECK-NEXT: retq 1074 %vec = load <8 x i32>, ptr %vp 1075 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0> 1076 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1077 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1078 ret <4 x i32> %res 1079} 1080 1081define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { 1082; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1: 1083; CHECK: # %bb.0: 1084; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1085; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,0,0,3] 1086; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1087; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} 1088; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1089; CHECK-NEXT: vzeroupper 1090; CHECK-NEXT: retq 1091 %vec = load <8 x i32>, ptr %vp 1092 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3> 1093 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1094 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1095 
ret <4 x i32> %res 1096} 1097 1098define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) { 1099; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1: 1100; CHECK: # %bb.0: 1101; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,0,0,3] 1102; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 1103; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} 1104; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1105; CHECK-NEXT: vzeroupper 1106; CHECK-NEXT: retq 1107 %vec = load <8 x i32>, ptr %vp 1108 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3> 1109 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1110 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1111 ret <4 x i32> %res 1112} 1113 1114define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { 1115; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2: 1116; CHECK: # %bb.0: 1117; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 1118; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,7,7,0] 1119; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 1120; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1121; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} 1122; CHECK-NEXT: retq 1123 %vec = load <8 x i32>, ptr %vp 1124 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4> 1125 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1126 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1127 ret <4 x i32> %res 1128} 1129 1130define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) { 1131; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2: 1132; CHECK: # %bb.0: 1133; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 1134; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,7,7,0] 1135; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 1136; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} 1137; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1138; CHECK-NEXT: retq 1139 %vec = load <8 x i32>, ptr 
%vp 1140 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4> 1141 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1142 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1143 ret <4 x i32> %res 1144} 1145 1146define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(ptr %vp) { 1147; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3: 1148; CHECK: # %bb.0: 1149; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm1 1150; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,1,2,7] 1151; CHECK-NEXT: vpermi2d 16(%rdi), %xmm1, %xmm0 1152; CHECK-NEXT: retq 1153 %vec = load <8 x i32>, ptr %vp 1154 %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7> 1155 ret <4 x i32> %res 1156} 1157define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { 1158; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3: 1159; CHECK: # %bb.0: 1160; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2 1161; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,1,2,7] 1162; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm3 1163; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1164; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} 1165; CHECK-NEXT: retq 1166 %vec = load <8 x i32>, ptr %vp 1167 %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7> 1168 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1169 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1170 ret <4 x i32> %res 1171} 1172 1173define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) { 1174; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3: 1175; CHECK: # %bb.0: 1176; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm2 1177; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [5,1,2,7] 1178; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 1179; CHECK-NEXT: vpermi2d 16(%rdi), %xmm2, %xmm1 {%k1} {z} 1180; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1181; CHECK-NEXT: retq 1182 %vec = load <8 x i32>, ptr %vp 1183 %shuf = 
shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7> 1184 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1185 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1186 ret <4 x i32> %res 1187} 1188 1189define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) { 1190; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0: 1191; CHECK: # %bb.0: 1192; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,13,11,14,7,10,1,6] 1193; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 1194; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1195; CHECK-NEXT: retq 1196 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6> 1197 ret <8 x i32> %res 1198} 1199define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { 1200; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0: 1201; CHECK: # %bb.0: 1202; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1203; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,13,11,14,7,10,1,6] 1204; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 1205; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} 1206; CHECK-NEXT: vmovdqa %ymm1, %ymm0 1207; CHECK-NEXT: retq 1208 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6> 1209 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1210 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 1211 ret <8 x i32> %res 1212} 1213 1214define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) { 1215; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask0: 1216; CHECK: # %bb.0: 1217; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,13,11,14,7,10,1,6] 1218; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 1219; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} 1220; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1221; CHECK-NEXT: retq 1222 %shuf = shufflevector <16 x i32> 
%vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6> 1223 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1224 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer 1225 ret <8 x i32> %res 1226} 1227define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { 1228; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1: 1229; CHECK: # %bb.0: 1230; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1231; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,0,15,3,2,3,6,8] 1232; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 1233; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} 1234; CHECK-NEXT: vmovdqa %ymm1, %ymm0 1235; CHECK-NEXT: retq 1236 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8> 1237 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1238 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 1239 ret <8 x i32> %res 1240} 1241 1242define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) { 1243; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1: 1244; CHECK: # %bb.0: 1245; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8] 1246; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 1247; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} 1248; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1249; CHECK-NEXT: retq 1250 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8> 1251 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1252 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer 1253 ret <8 x i32> %res 1254} 1255define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { 1256; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2: 1257; CHECK: # %bb.0: 1258; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1259; 
CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,15,15,2,6,10,14,7] 1260; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 1261; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} 1262; CHECK-NEXT: vmovdqa %ymm1, %ymm0 1263; CHECK-NEXT: retq 1264 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7> 1265 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1266 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 1267 ret <8 x i32> %res 1268} 1269 1270define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) { 1271; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2: 1272; CHECK: # %bb.0: 1273; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7] 1274; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 1275; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} 1276; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1277; CHECK-NEXT: retq 1278 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7> 1279 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1280 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer 1281 ret <8 x i32> %res 1282} 1283define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) { 1284; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3: 1285; CHECK: # %bb.0: 1286; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3] 1287; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 1288; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1289; CHECK-NEXT: retq 1290 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3> 1291 ret <8 x i32> %res 1292} 1293define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { 1294; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3: 1295; CHECK: # %bb.0: 1296; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1297; 
CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [14,5,7,7,10,3,9,3] 1298; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1 1299; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} 1300; CHECK-NEXT: vmovdqa %ymm1, %ymm0 1301; CHECK-NEXT: retq 1302 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3> 1303 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1304 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 1305 ret <8 x i32> %res 1306} 1307 1308define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) { 1309; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3: 1310; CHECK: # %bb.0: 1311; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3] 1312; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 1313; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} 1314; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1315; CHECK-NEXT: retq 1316 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3> 1317 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1318 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer 1319 ret <8 x i32> %res 1320} 1321define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) { 1322; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0: 1323; CHECK: # %bb.0: 1324; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12] 1325; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 1326; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1327; CHECK-NEXT: vzeroupper 1328; CHECK-NEXT: retq 1329 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12> 1330 ret <4 x i32> %res 1331} 1332define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { 1333; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0: 1334; CHECK: # %bb.0: 1335; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1336; CHECK-NEXT: 
vpmovsxbd {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12] 1337; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 1338; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} 1339; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1340; CHECK-NEXT: vzeroupper 1341; CHECK-NEXT: retq 1342 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12> 1343 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1344 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1345 ret <4 x i32> %res 1346} 1347 1348define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) { 1349; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0: 1350; CHECK: # %bb.0: 1351; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12] 1352; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1353; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} 1354; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1355; CHECK-NEXT: vzeroupper 1356; CHECK-NEXT: retq 1357 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12> 1358 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1359 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1360 ret <4 x i32> %res 1361} 1362define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { 1363; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1: 1364; CHECK: # %bb.0: 1365; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1366; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,1,3,4] 1367; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1368; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 1369; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} 1370; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1371; CHECK-NEXT: vzeroupper 1372; CHECK-NEXT: retq 1373 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12> 1374 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1375 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1376 ret <4 x 
i32> %res 1377} 1378 1379define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) { 1380; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1: 1381; CHECK: # %bb.0: 1382; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,1,3,4] 1383; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1384; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1385; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} 1386; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1387; CHECK-NEXT: vzeroupper 1388; CHECK-NEXT: retq 1389 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12> 1390 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1391 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1392 ret <4 x i32> %res 1393} 1394define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { 1395; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2: 1396; CHECK: # %bb.0: 1397; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1398; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,1,13,0] 1399; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 1400; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} 1401; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1402; CHECK-NEXT: vzeroupper 1403; CHECK-NEXT: retq 1404 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0> 1405 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1406 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1407 ret <4 x i32> %res 1408} 1409 1410define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) { 1411; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2: 1412; CHECK: # %bb.0: 1413; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,1,13,0] 1414; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1415; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} 1416; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1417; CHECK-NEXT: vzeroupper 1418; CHECK-NEXT: retq 1419 
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0> 1420 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1421 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1422 ret <4 x i32> %res 1423} 1424define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) { 1425; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3: 1426; CHECK: # %bb.0: 1427; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,0,0,13] 1428; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 1429; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1430; CHECK-NEXT: vzeroupper 1431; CHECK-NEXT: retq 1432 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13> 1433 ret <4 x i32> %res 1434} 1435define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { 1436; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3: 1437; CHECK: # %bb.0: 1438; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 1439; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,0,0,13] 1440; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1 1441; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} 1442; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1443; CHECK-NEXT: vzeroupper 1444; CHECK-NEXT: retq 1445 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13> 1446 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1447 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1448 ret <4 x i32> %res 1449} 1450 1451define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) { 1452; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3: 1453; CHECK: # %bb.0: 1454; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,0,0,13] 1455; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1456; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} 1457; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1458; CHECK-NEXT: vzeroupper 1459; CHECK-NEXT: retq 1460 %shuf = shufflevector <16 x i32> %vec, 
<16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13> 1461 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1462 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1463 ret <4 x i32> %res 1464} 1465define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(ptr %vp) { 1466; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0: 1467; CHECK: # %bb.0: 1468; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4] 1469; CHECK-NEXT: vpermps 32(%rdi), %ymm0, %ymm0 1470; CHECK-NEXT: retq 1471 %vec = load <16 x i32>, ptr %vp 1472 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12> 1473 ret <8 x i32> %res 1474} 1475define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { 1476; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0: 1477; CHECK: # %bb.0: 1478; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4] 1479; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 1480; CHECK-NEXT: vpermd 32(%rdi), %ymm2, %ymm0 {%k1} 1481; CHECK-NEXT: retq 1482 %vec = load <16 x i32>, ptr %vp 1483 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12> 1484 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1485 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 1486 ret <8 x i32> %res 1487} 1488 1489define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(ptr %vp, <8 x i32> %mask) { 1490; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0: 1491; CHECK: # %bb.0: 1492; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4] 1493; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 1494; CHECK-NEXT: vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z} 1495; CHECK-NEXT: retq 1496 %vec = load <16 x i32>, ptr %vp 1497 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12> 1498 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 
1499 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer 1500 ret <8 x i32> %res 1501} 1502 1503define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { 1504; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1: 1505; CHECK: # %bb.0: 1506; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 1507; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [7,3,6,11,0,1,5,15] 1508; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 1509; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 1510; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} 1511; CHECK-NEXT: retq 1512 %vec = load <16 x i32>, ptr %vp 1513 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7> 1514 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1515 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 1516 ret <8 x i32> %res 1517} 1518 1519define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(ptr %vp, <8 x i32> %mask) { 1520; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1: 1521; CHECK: # %bb.0: 1522; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 1523; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15] 1524; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 1525; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} 1526; CHECK-NEXT: vmovdqa %ymm1, %ymm0 1527; CHECK-NEXT: retq 1528 %vec = load <16 x i32>, ptr %vp 1529 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7> 1530 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1531 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer 1532 ret <8 x i32> %res 1533} 1534 1535define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { 1536; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2: 1537; CHECK: # %bb.0: 1538; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 1539; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [4,14,1,5,4,2,8,10] 1540; 
CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 1541; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 1542; CHECK-NEXT: vmovdqa32 %ymm3, %ymm0 {%k1} 1543; CHECK-NEXT: retq 1544 %vec = load <16 x i32>, ptr %vp 1545 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2> 1546 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1547 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 1548 ret <8 x i32> %res 1549} 1550 1551define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(ptr %vp, <8 x i32> %mask) { 1552; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2: 1553; CHECK: # %bb.0: 1554; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 1555; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10] 1556; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 1557; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} 1558; CHECK-NEXT: vmovdqa %ymm1, %ymm0 1559; CHECK-NEXT: retq 1560 %vec = load <16 x i32>, ptr %vp 1561 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2> 1562 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1563 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer 1564 ret <8 x i32> %res 1565} 1566 1567define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(ptr %vp) { 1568; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3: 1569; CHECK: # %bb.0: 1570; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12] 1571; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 1572; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1573; CHECK-NEXT: retq 1574 %vec = load <16 x i32>, ptr %vp 1575 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12> 1576 ret <8 x i32> %res 1577} 1578define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %vec2, <8 x i32> %mask) { 1579; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3: 1580; CHECK: 
# %bb.0: 1581; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1582; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,4,1,13,15,4,6,12] 1583; CHECK-NEXT: vptestnmd %ymm1, %ymm1, %k1 1584; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} 1585; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1586; CHECK-NEXT: retq 1587 %vec = load <16 x i32>, ptr %vp 1588 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12> 1589 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1590 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 1591 ret <8 x i32> %res 1592} 1593 1594define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(ptr %vp, <8 x i32> %mask) { 1595; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3: 1596; CHECK: # %bb.0: 1597; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12] 1598; CHECK-NEXT: vptestnmd %ymm0, %ymm0, %k1 1599; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} 1600; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1601; CHECK-NEXT: retq 1602 %vec = load <16 x i32>, ptr %vp 1603 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12> 1604 %cmp = icmp eq <8 x i32> %mask, zeroinitializer 1605 %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer 1606 ret <8 x i32> %res 1607} 1608 1609define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(ptr %vp) { 1610; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0: 1611; CHECK: # %bb.0: 1612; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [13,0,0,6] 1613; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 1614; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1615; CHECK-NEXT: vzeroupper 1616; CHECK-NEXT: retq 1617 %vec = load <16 x i32>, ptr %vp 1618 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6> 1619 ret <4 x i32> %res 1620} 1621define <4 x i32> 
@test_masked_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { 1622; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0: 1623; CHECK: # %bb.0: 1624; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1625; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,0,0,6] 1626; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1627; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} 1628; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1629; CHECK-NEXT: vzeroupper 1630; CHECK-NEXT: retq 1631 %vec = load <16 x i32>, ptr %vp 1632 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6> 1633 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1634 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1635 ret <4 x i32> %res 1636} 1637 1638define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(ptr %vp, <4 x i32> %mask) { 1639; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0: 1640; CHECK: # %bb.0: 1641; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [13,0,0,6] 1642; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 1643; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} 1644; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1645; CHECK-NEXT: vzeroupper 1646; CHECK-NEXT: retq 1647 %vec = load <16 x i32>, ptr %vp 1648 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6> 1649 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1650 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1651 ret <4 x i32> %res 1652} 1653 1654define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { 1655; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1: 1656; CHECK: # %bb.0: 1657; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 1658; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [15,5,3,2,0,0,0,0] 1659; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm3 1660; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1661; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} 1662; 
CHECK-NEXT: vzeroupper 1663; CHECK-NEXT: retq 1664 %vec = load <16 x i32>, ptr %vp 1665 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10> 1666 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1667 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1668 ret <4 x i32> %res 1669} 1670 1671define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(ptr %vp, <4 x i32> %mask) { 1672; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1: 1673; CHECK: # %bb.0: 1674; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 1675; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [15,5,3,2,0,0,0,0] 1676; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 1677; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z} 1678; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1679; CHECK-NEXT: vzeroupper 1680; CHECK-NEXT: retq 1681 %vec = load <16 x i32>, ptr %vp 1682 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10> 1683 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1684 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1685 ret <4 x i32> %res 1686} 1687 1688define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { 1689; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2: 1690; CHECK: # %bb.0: 1691; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1692; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,15,6,9] 1693; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1694; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} 1695; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1696; CHECK-NEXT: vzeroupper 1697; CHECK-NEXT: retq 1698 %vec = load <16 x i32>, ptr %vp 1699 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9> 1700 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1701 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1702 ret <4 x i32> %res 1703} 1704 1705define <4 x i32> 
@test_masked_z_16xi32_to_4xi32_perm_mem_mask2(ptr %vp, <4 x i32> %mask) { 1706; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2: 1707; CHECK: # %bb.0: 1708; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,15,6,9] 1709; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 1710; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} 1711; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1712; CHECK-NEXT: vzeroupper 1713; CHECK-NEXT: retq 1714 %vec = load <16 x i32>, ptr %vp 1715 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9> 1716 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1717 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1718 ret <4 x i32> %res 1719} 1720 1721define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(ptr %vp) { 1722; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3: 1723; CHECK: # %bb.0: 1724; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 1725; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,4,3,6] 1726; CHECK-NEXT: vpermi2d (%rdi), %xmm1, %xmm0 1727; CHECK-NEXT: retq 1728 %vec = load <16 x i32>, ptr %vp 1729 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2> 1730 ret <4 x i32> %res 1731} 1732define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %vec2, <4 x i32> %mask) { 1733; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3: 1734; CHECK: # %bb.0: 1735; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 1736; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,4,3,6] 1737; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm3 1738; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 1739; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} 1740; CHECK-NEXT: retq 1741 %vec = load <16 x i32>, ptr %vp 1742 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2> 1743 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1744 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 1745 ret <4 x i32> %res 1746} 1747 1748define <4 x i32> 
@test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32> %mask) { 1749; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3: 1750; CHECK: # %bb.0: 1751; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 1752; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,4,3,6] 1753; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 1754; CHECK-NEXT: vpermi2d (%rdi), %xmm2, %xmm1 {%k1} {z} 1755; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1756; CHECK-NEXT: retq 1757 %vec = load <16 x i32>, ptr %vp 1758 %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2> 1759 %cmp = icmp eq <4 x i32> %mask, zeroinitializer 1760 %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer 1761 ret <4 x i32> %res 1762} 1763 1764define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) { 1765; CHECK-FAST-LABEL: test_16xi32_to_4xi32_perm_mask9: 1766; CHECK-FAST: # %bb.0: 1767; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [12,9,4,10] 1768; CHECK-FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0 1769; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1770; CHECK-FAST-NEXT: vzeroupper 1771; CHECK-FAST-NEXT: retq 1772; 1773; CHECK-FAST-PERLANE-LABEL: test_16xi32_to_4xi32_perm_mask9: 1774; CHECK-FAST-PERLANE: # %bb.0: 1775; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,0,2] 1776; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 1777; CHECK-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm1 1778; CHECK-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 1779; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,1,4,3] 1780; CHECK-FAST-PERLANE-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 1781; CHECK-FAST-PERLANE-NEXT: vzeroupper 1782; CHECK-FAST-PERLANE-NEXT: retq 1783 %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 12, i32 9, i32 4, i32 10> 1784 ret <4 x i32> %res 1785} 1786 1787define <2 x i64> @test_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) { 1788; CHECK-LABEL: test_4xi64_to_2xi64_perm_mask0: 1789; CHECK: # %bb.0: 1790; CHECK-NEXT: vpermpd {{.*#+}} 
ymm0 = ymm0[2,0,2,3] 1791; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1792; CHECK-NEXT: vzeroupper 1793; CHECK-NEXT: retq 1794 %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0> 1795 ret <2 x i64> %res 1796} 1797define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) { 1798; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0: 1799; CHECK: # %bb.0: 1800; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1801; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 1802; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,3] 1803; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1804; CHECK-NEXT: vzeroupper 1805; CHECK-NEXT: retq 1806 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0> 1807 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 1808 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 1809 ret <2 x i64> %res 1810} 1811 1812define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %mask) { 1813; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0: 1814; CHECK: # %bb.0: 1815; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 1816; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3] 1817; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1818; CHECK-NEXT: vzeroupper 1819; CHECK-NEXT: retq 1820 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0> 1821 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 1822 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer 1823 ret <2 x i64> %res 1824} 1825define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) { 1826; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask1: 1827; CHECK: # %bb.0: 1828; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 1829; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 1830; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,2,3] 1831; CHECK-NEXT: vmovdqa %xmm1, %xmm0 1832; 
CHECK-NEXT: vzeroupper 1833; CHECK-NEXT: retq 1834 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1> 1835 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 1836 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 1837 ret <2 x i64> %res 1838} 1839 1840define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %mask) { 1841; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask1: 1842; CHECK: # %bb.0: 1843; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 1844; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3] 1845; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1846; CHECK-NEXT: vzeroupper 1847; CHECK-NEXT: retq 1848 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1> 1849 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 1850 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer 1851 ret <2 x i64> %res 1852} 1853define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(ptr %vp) { 1854; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0: 1855; CHECK: # %bb.0: 1856; CHECK-NEXT: vmovaps (%rdi), %xmm0 1857; CHECK-NEXT: vunpckhpd 16(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[1] 1858; CHECK-NEXT: retq 1859 %vec = load <4 x i64>, ptr %vp 1860 %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3> 1861 ret <2 x i64> %res 1862} 1863define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) { 1864; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0: 1865; CHECK: # %bb.0: 1866; CHECK-NEXT: vmovdqa (%rdi), %xmm2 1867; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 1868; CHECK-NEXT: vpunpckhqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[1] 1869; CHECK-NEXT: retq 1870 %vec = load <4 x i64>, ptr %vp 1871 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3> 1872 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 1873 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 
1874 ret <2 x i64> %res 1875} 1876 1877define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) { 1878; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0: 1879; CHECK: # %bb.0: 1880; CHECK-NEXT: vmovdqa (%rdi), %xmm1 1881; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 1882; CHECK-NEXT: vpunpckhqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[1] 1883; CHECK-NEXT: retq 1884 %vec = load <4 x i64>, ptr %vp 1885 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3> 1886 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 1887 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer 1888 ret <2 x i64> %res 1889} 1890 1891define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) { 1892; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1: 1893; CHECK: # %bb.0: 1894; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 1895; CHECK-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3] 1896; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 1897; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} 1898; CHECK-NEXT: retq 1899 %vec = load <4 x i64>, ptr %vp 1900 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1> 1901 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 1902 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 1903 ret <2 x i64> %res 1904} 1905 1906define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) { 1907; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1: 1908; CHECK: # %bb.0: 1909; CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 1910; CHECK-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3] 1911; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 1912; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} 1913; CHECK-NEXT: retq 1914 %vec = load <4 x i64>, ptr %vp 1915 %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1> 1916 %cmp = icmp eq <2 x i64> %mask, 
zeroinitializer 1917 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer 1918 ret <2 x i64> %res 1919} 1920 1921define <4 x i64> @test_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec) { 1922; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask0: 1923; CHECK: # %bb.0: 1924; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 1925; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1] 1926; CHECK-NEXT: retq 1927 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5> 1928 ret <4 x i64> %res 1929} 1930define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { 1931; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0: 1932; CHECK: # %bb.0: 1933; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1934; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 1935; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1] 1936; CHECK-NEXT: vmovdqa %ymm1, %ymm0 1937; CHECK-NEXT: retq 1938 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5> 1939 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 1940 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 1941 ret <4 x i64> %res 1942} 1943 1944define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) { 1945; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0: 1946; CHECK: # %bb.0: 1947; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 1948; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 1949; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1] 1950; CHECK-NEXT: retq 1951 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5> 1952 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 1953 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 1954 ret <4 x i64> %res 1955} 1956define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { 1957; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask1: 1958; 
CHECK-FAST: # %bb.0: 1959; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1960; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,4,6,1] 1961; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 1962; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} 1963; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 1964; CHECK-FAST-NEXT: retq 1965; 1966; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask1: 1967; CHECK-FAST-PERLANE: # %bb.0: 1968; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3 1969; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] 1970; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1 1971; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,1] 1972; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0 1973; CHECK-FAST-PERLANE-NEXT: retq 1974 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1> 1975 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 1976 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 1977 ret <4 x i64> %res 1978} 1979 1980define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) { 1981; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1: 1982; CHECK-FAST: # %bb.0: 1983; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,4,6,1] 1984; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 1985; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} 1986; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1987; CHECK-FAST-NEXT: retq 1988; 1989; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1: 1990; CHECK-FAST-PERLANE: # %bb.0: 1991; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 1992; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] 1993; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 1994; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,1] 1995; CHECK-FAST-PERLANE-NEXT: retq 1996 %shuf = shufflevector <8 x i64> %vec, 
<8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1> 1997 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 1998 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 1999 ret <4 x i64> %res 2000} 2001define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { 2002; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask2: 2003; CHECK-FAST: # %bb.0: 2004; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2005; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,3,6,3] 2006; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 2007; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} 2008; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 2009; CHECK-FAST-NEXT: retq 2010; 2011; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask2: 2012; CHECK-FAST-PERLANE: # %bb.0: 2013; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3 2014; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] 2015; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1 2016; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3] 2017; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0 2018; CHECK-FAST-PERLANE-NEXT: retq 2019 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3> 2020 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2021 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2022 ret <4 x i64> %res 2023} 2024 2025define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) { 2026; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2: 2027; CHECK-FAST: # %bb.0: 2028; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,3,6,3] 2029; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 2030; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} 2031; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2032; CHECK-FAST-NEXT: retq 2033; 2034; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2: 
; CHECK-FAST-PERLANE:       # %bb.0:
; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
; CHECK-FAST-PERLANE-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}

; Unmasked variant: returns elements 6,0,0,7 of the 8 x i64 input as a 4 x i64
; vector.
define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask3:
; CHECK-FAST:       # %bb.0:
; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [6,0,0,7]
; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-FAST-NEXT:    retq
;
; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask3:
; CHECK-FAST-PERLANE:       # %bb.0:
; CHECK-FAST-PERLANE-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; CHECK-FAST-PERLANE-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,3]
; CHECK-FAST-PERLANE-NEXT:    retq
  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
  ret <4 x i64> %res
}

define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
; CHECK-FAST:       # %bb.0:
; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [6,0,0,7]
; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-FAST-NEXT:    retq
;
;
; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
; CHECK-FAST-PERLANE:       # %bb.0:
; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm2, %ymm2, %k1
; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,0,3]
; CHECK-FAST-PERLANE-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-FAST-PERLANE-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
  ret <4 x i64> %res
}

; Zero-masking variant: result lanes whose %mask element is zero take elements
; 6,0,0,7 of %vec; all other lanes are zero.
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
; CHECK-FAST:       # %bb.0:
; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [6,0,0,7]
; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-FAST-NEXT:    retq
;
; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
; CHECK-FAST-PERLANE:       # %bb.0:
; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; CHECK-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,0,3]
; CHECK-FAST-PERLANE-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
  ret <4 x i64> %res
}

; Merge-masking variant: result lanes whose %mask element is zero take elements
; 3,7,7,5 of %vec; all other lanes keep the corresponding lane of %vec2.
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
; CHECK-FAST:       # %bb.0:
; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,7,7,5]
; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-FAST-NEXT:    retq
;
; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
; CHECK-FAST-PERLANE:       # %bb.0:
; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm2, %ymm2, %k1
; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,3,1]
; CHECK-FAST-PERLANE-NEXT:    vmovdqa %ymm1, %ymm0
; CHECK-FAST-PERLANE-NEXT:    retq
  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
  ret <4 x i64> %res
}

define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
; CHECK-FAST:       # %bb.0:
; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [3,7,7,5]
; CHECK-FAST-NEXT:    vptestnmq %ymm1, %ymm1, %k1
; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-FAST-NEXT:    retq
;
; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
; CHECK-FAST-PERLANE:       # %bb.0:
; CHECK-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; CHECK-FAST-PERLANE-NEXT:    vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
; CHECK-FAST-PERLANE-NEXT:    vptestnmq %ymm1, %ymm1, %k1
; CHECK-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,3,1]
;
CHECK-FAST-PERLANE-NEXT: retq 2148 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5> 2149 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2150 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2151 ret <4 x i64> %res 2152} 2153define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { 2154; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5: 2155; CHECK: # %bb.0: 2156; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2157; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,1,0,6] 2158; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 2159; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} 2160; CHECK-NEXT: vmovdqa %ymm1, %ymm0 2161; CHECK-NEXT: retq 2162 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6> 2163 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2164 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2165 ret <4 x i64> %res 2166} 2167 2168define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) { 2169; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5: 2170; CHECK: # %bb.0: 2171; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,1,0,6] 2172; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2173; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} 2174; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2175; CHECK-NEXT: retq 2176 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6> 2177 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2178 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2179 ret <4 x i64> %res 2180} 2181define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) { 2182; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask6: 2183; CHECK-FAST: # %bb.0: 2184; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,6,5,3] 2185; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0 2186; CHECK-FAST-NEXT: # kill: def $ymm0 killed 
$ymm0 killed $zmm0 2187; CHECK-FAST-NEXT: retq 2188; 2189; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask6: 2190; CHECK-FAST-PERLANE: # %bb.0: 2191; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1 2192; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,1,3] 2193; CHECK-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 2194; CHECK-FAST-PERLANE-NEXT: retq 2195 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3> 2196 ret <4 x i64> %res 2197} 2198define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { 2199; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask6: 2200; CHECK-FAST: # %bb.0: 2201; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2202; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,6,5,3] 2203; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1 2204; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} 2205; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 2206; CHECK-FAST-NEXT: retq 2207; 2208; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask6: 2209; CHECK-FAST-PERLANE: # %bb.0: 2210; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3 2211; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,2,1,3] 2212; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] 2213; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1 2214; CHECK-FAST-PERLANE-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} 2215; CHECK-FAST-PERLANE-NEXT: retq 2216 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3> 2217 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2218 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2219 ret <4 x i64> %res 2220} 2221 2222define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) { 2223; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6: 2224; CHECK-FAST: # %bb.0: 2225; CHECK-FAST-NEXT: vpmovsxbq 
{{.*#+}} ymm2 = [7,6,5,3] 2226; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 2227; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} 2228; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2229; CHECK-FAST-NEXT: retq 2230; 2231; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6: 2232; CHECK-FAST-PERLANE: # %bb.0: 2233; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2 2234; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,2,1,3] 2235; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] 2236; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 2237; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} 2238; CHECK-FAST-PERLANE-NEXT: retq 2239 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3> 2240 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2241 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2242 ret <4 x i64> %res 2243} 2244define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { 2245; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7: 2246; CHECK: # %bb.0: 2247; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2248; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,0,3,4] 2249; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1 2250; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} 2251; CHECK-NEXT: vmovdqa %ymm1, %ymm0 2252; CHECK-NEXT: retq 2253 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4> 2254 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2255 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2256 ret <4 x i64> %res 2257} 2258define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) { 2259; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7: 2260; CHECK: # %bb.0: 2261; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [2,0,3,4] 2262; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2263; CHECK-NEXT: vpermq 
%zmm0, %zmm2, %zmm0 {%k1} {z} 2264; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2265; CHECK-NEXT: retq 2266 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4> 2267 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2268 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2269 ret <4 x i64> %res 2270} 2271 2272define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) { 2273; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0: 2274; CHECK: # %bb.0: 2275; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7] 2276; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2277; CHECK-NEXT: vzeroupper 2278; CHECK-NEXT: retq 2279 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0> 2280 ret <2 x i64> %res 2281} 2282define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) { 2283; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0: 2284; CHECK: # %bb.0: 2285; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 2286; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 2287; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,2,3,7,4,6,7] 2288; CHECK-NEXT: vmovdqa %xmm1, %xmm0 2289; CHECK-NEXT: vzeroupper 2290; CHECK-NEXT: retq 2291 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0> 2292 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 2293 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 2294 ret <2 x i64> %res 2295} 2296define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) { 2297; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0: 2298; CHECK: # %bb.0: 2299; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 2300; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,2,3,7,4,6,7] 2301; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2302; CHECK-NEXT: vzeroupper 2303; CHECK-NEXT: retq 2304 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, 
i32 0> 2305 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 2306 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer 2307 ret <2 x i64> %res 2308} 2309 2310define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) { 2311; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1: 2312; CHECK: # %bb.0: 2313; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 2314; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 2315; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 2316; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,2,3] 2317; CHECK-NEXT: vmovdqa %xmm1, %xmm0 2318; CHECK-NEXT: vzeroupper 2319; CHECK-NEXT: retq 2320 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5> 2321 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 2322 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 2323 ret <2 x i64> %res 2324} 2325define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) { 2326; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1: 2327; CHECK: # %bb.0: 2328; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 2329; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 2330; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3] 2331; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2332; CHECK-NEXT: vzeroupper 2333; CHECK-NEXT: retq 2334 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5> 2335 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 2336 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer 2337 ret <2 x i64> %res 2338} 2339 2340define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(ptr %vp) { 2341; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask0: 2342; CHECK: # %bb.0: 2343; CHECK-NEXT: vpermpd $136, (%rdi), %ymm0 # ymm0 = mem[0,2,0,2] 2344; CHECK-NEXT: retq 2345 %vec = load <8 x i64>, ptr %vp 2346 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 2347 ret <4 x i64> 
%res 2348} 2349define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { 2350; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask0: 2351; CHECK: # %bb.0: 2352; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2353; CHECK-NEXT: vpermq $136, (%rdi), %ymm0 {%k1} # ymm0 {%k1} = mem[0,2,0,2] 2354; CHECK-NEXT: retq 2355 %vec = load <8 x i64>, ptr %vp 2356 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 2357 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2358 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2359 ret <4 x i64> %res 2360} 2361define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> %mask) { 2362; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0: 2363; CHECK: # %bb.0: 2364; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 2365; CHECK-NEXT: vpermq $136, (%rdi), %ymm0 {%k1} {z} # ymm0 {%k1} {z} = mem[0,2,0,2] 2366; CHECK-NEXT: retq 2367 %vec = load <8 x i64>, ptr %vp 2368 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 2369 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2370 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2371 ret <4 x i64> %res 2372} 2373 2374define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { 2375; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1: 2376; CHECK-FAST: # %bb.0: 2377; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 2378; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,3,2,4] 2379; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 2380; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 2381; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} 2382; CHECK-FAST-NEXT: retq 2383; 2384; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1: 2385; CHECK-FAST-PERLANE: # %bb.0: 2386; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 2387; CHECK-FAST-PERLANE-NEXT: vpblendd $15, (%rdi), %ymm2, 
%ymm2 # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] 2388; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 2389; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,3,2,0] 2390; CHECK-FAST-PERLANE-NEXT: retq 2391 %vec = load <8 x i64>, ptr %vp 2392 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0> 2393 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2394 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2395 ret <4 x i64> %res 2396} 2397 2398define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(ptr %vp, <4 x i64> %mask) { 2399; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1: 2400; CHECK-FAST: # %bb.0: 2401; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 2402; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,3,2,4] 2403; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 2404; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} 2405; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 2406; CHECK-FAST-NEXT: retq 2407; 2408; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1: 2409; CHECK-FAST-PERLANE: # %bb.0: 2410; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 2411; CHECK-FAST-PERLANE-NEXT: vpblendd $15, (%rdi), %ymm1, %ymm1 # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] 2412; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1 2413; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,3,2,0] 2414; CHECK-FAST-PERLANE-NEXT: retq 2415 %vec = load <8 x i64>, ptr %vp 2416 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0> 2417 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2418 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2419 ret <4 x i64> %res 2420} 2421 2422define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { 2423; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2: 2424; CHECK-FAST: # %bb.0: 2425; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 2426; 
CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,5,5,1] 2427; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 2428; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 2429; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} 2430; CHECK-FAST-NEXT: retq 2431; 2432; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2: 2433; CHECK-FAST-PERLANE: # %bb.0: 2434; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 2435; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] 2436; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 2437; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,0] 2438; CHECK-FAST-PERLANE-NEXT: retq 2439 %vec = load <8 x i64>, ptr %vp 2440 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5> 2441 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2442 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2443 ret <4 x i64> %res 2444} 2445 2446define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(ptr %vp, <4 x i64> %mask) { 2447; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: 2448; CHECK-FAST: # %bb.0: 2449; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 2450; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [3,5,5,1] 2451; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 2452; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} 2453; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 2454; CHECK-FAST-NEXT: retq 2455; 2456; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: 2457; CHECK-FAST-PERLANE: # %bb.0: 2458; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 2459; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] 2460; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1 2461; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,0] 2462; CHECK-FAST-PERLANE-NEXT: retq 2463 %vec = load <8 x i64>, ptr %vp 2464 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 
x i32> <i32 7, i32 1, i32 1, i32 5> 2465 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2466 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2467 ret <4 x i64> %res 2468} 2469 2470define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(ptr %vp) { 2471; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mem_mask3: 2472; CHECK-FAST: # %bb.0: 2473; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm0 = [7,0,0,2] 2474; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0 2475; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2476; CHECK-FAST-NEXT: retq 2477; 2478; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mem_mask3: 2479; CHECK-FAST-PERLANE: # %bb.0: 2480; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 2481; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm0, %ymm0 # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] 2482; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,1,3] 2483; CHECK-FAST-PERLANE-NEXT: retq 2484 %vec = load <8 x i64>, ptr %vp 2485 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2> 2486 ret <4 x i64> %res 2487} 2488define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { 2489; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3: 2490; CHECK-FAST: # %bb.0: 2491; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2492; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,0,0,2] 2493; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 2494; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} 2495; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2496; CHECK-FAST-NEXT: retq 2497; 2498; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3: 2499; CHECK-FAST-PERLANE: # %bb.0: 2500; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 2501; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm2, %ymm2 # ymm2 = 
mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] 2502; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 2503; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,3] 2504; CHECK-FAST-PERLANE-NEXT: retq 2505 %vec = load <8 x i64>, ptr %vp 2506 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2> 2507 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2508 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2509 ret <4 x i64> %res 2510} 2511 2512define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(ptr %vp, <4 x i64> %mask) { 2513; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3: 2514; CHECK-FAST: # %bb.0: 2515; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,0,0,2] 2516; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 2517; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} 2518; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2519; CHECK-FAST-NEXT: retq 2520; 2521; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3: 2522; CHECK-FAST-PERLANE: # %bb.0: 2523; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 2524; CHECK-FAST-PERLANE-NEXT: vpalignr $8, 32(%rdi), %ymm1, %ymm1 # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] 2525; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1 2526; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,3] 2527; CHECK-FAST-PERLANE-NEXT: retq 2528 %vec = load <8 x i64>, ptr %vp 2529 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2> 2530 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2531 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2532 ret <4 x i64> %res 2533} 2534 2535define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { 2536; CHECK-LABEL: 
test_masked_8xi64_to_4xi64_perm_mem_mask4: 2537; CHECK: # %bb.0: 2538; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 2539; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,0,2,5] 2540; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 2541; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2542; CHECK-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} 2543; CHECK-NEXT: retq 2544 %vec = load <8 x i64>, ptr %vp 2545 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1> 2546 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2547 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2548 ret <4 x i64> %res 2549} 2550 2551define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(ptr %vp, <4 x i64> %mask) { 2552; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4: 2553; CHECK: # %bb.0: 2554; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2 2555; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,0,2,5] 2556; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 2557; CHECK-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} 2558; CHECK-NEXT: vmovdqa %ymm1, %ymm0 2559; CHECK-NEXT: retq 2560 %vec = load <8 x i64>, ptr %vp 2561 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1> 2562 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2563 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2564 ret <4 x i64> %res 2565} 2566 2567define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { 2568; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5: 2569; CHECK-FAST: # %bb.0: 2570; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2571; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,7,1] 2572; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 2573; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} 2574; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2575; CHECK-FAST-NEXT: retq 2576; 2577; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5: 2578; CHECK-FAST-PERLANE: # 
%bb.0: 2579; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2 2580; CHECK-FAST-PERLANE-NEXT: vpblendd $192, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] 2581; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 2582; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,2,3,1] 2583; CHECK-FAST-PERLANE-NEXT: retq 2584 %vec = load <8 x i64>, ptr %vp 2585 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1> 2586 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2587 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2588 ret <4 x i64> %res 2589} 2590 2591define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(ptr %vp, <4 x i64> %mask) { 2592; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5: 2593; CHECK-FAST: # %bb.0: 2594; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,7,1] 2595; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 2596; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} 2597; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2598; CHECK-FAST-NEXT: retq 2599; 2600; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5: 2601; CHECK-FAST-PERLANE: # %bb.0: 2602; CHECK-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 2603; CHECK-FAST-PERLANE-NEXT: vpblendd $192, 32(%rdi), %ymm1, %ymm1 # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7] 2604; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1 2605; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,2,3,1] 2606; CHECK-FAST-PERLANE-NEXT: retq 2607 %vec = load <8 x i64>, ptr %vp 2608 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1> 2609 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2610 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2611 ret <4 x i64> %res 2612} 2613 2614define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(ptr %vp) { 2615; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6: 2616; CHECK: # %bb.0: 2617; CHECK-NEXT: 
vpmovsxbq {{.*#+}} ymm0 = [7,2,3,2] 2618; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 2619; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2620; CHECK-NEXT: retq 2621 %vec = load <8 x i64>, ptr %vp 2622 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2> 2623 ret <4 x i64> %res 2624} 2625define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { 2626; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6: 2627; CHECK: # %bb.0: 2628; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2629; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,2,3,2] 2630; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1 2631; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} 2632; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2633; CHECK-NEXT: retq 2634 %vec = load <8 x i64>, ptr %vp 2635 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2> 2636 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2637 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2638 ret <4 x i64> %res 2639} 2640 2641define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(ptr %vp, <4 x i64> %mask) { 2642; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6: 2643; CHECK: # %bb.0: 2644; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,2,3,2] 2645; CHECK-NEXT: vptestnmq %ymm0, %ymm0, %k1 2646; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} 2647; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2648; CHECK-NEXT: retq 2649 %vec = load <8 x i64>, ptr %vp 2650 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2> 2651 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2652 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2653 ret <4 x i64> %res 2654} 2655 2656define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %vec2, <4 x i64> %mask) { 2657; CHECK-FAST-LABEL: 
test_masked_8xi64_to_4xi64_perm_mem_mask7: 2658; CHECK-FAST: # %bb.0: 2659; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 2660; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,3,1,5] 2661; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm3 2662; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1 2663; CHECK-FAST-NEXT: vmovdqa64 %ymm3, %ymm0 {%k1} 2664; CHECK-FAST-NEXT: retq 2665; 2666; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7: 2667; CHECK-FAST-PERLANE: # %bb.0: 2668; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2 2669; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm2, %ymm2 # ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] 2670; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1 2671; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,2,0,1] 2672; CHECK-FAST-PERLANE-NEXT: retq 2673 %vec = load <8 x i64>, ptr %vp 2674 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1> 2675 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2676 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 2677 ret <4 x i64> %res 2678} 2679 2680define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64> %mask) { 2681; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7: 2682; CHECK-FAST: # %bb.0: 2683; CHECK-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 2684; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [3,3,1,5] 2685; CHECK-FAST-NEXT: vptestnmq %ymm0, %ymm0, %k1 2686; CHECK-FAST-NEXT: vpermi2q (%rdi), %ymm2, %ymm1 {%k1} {z} 2687; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0 2688; CHECK-FAST-NEXT: retq 2689; 2690; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7: 2691; CHECK-FAST-PERLANE: # %bb.0: 2692; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 2693; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq (%rdi), %ymm1, %ymm1 # ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] 2694; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm0, %ymm0, %k1 2695; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,2,0,1] 2696; 
CHECK-FAST-PERLANE-NEXT: retq 2697 %vec = load <8 x i64>, ptr %vp 2698 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1> 2699 %cmp = icmp eq <4 x i64> %mask, zeroinitializer 2700 %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer 2701 ret <4 x i64> %res 2702} 2703 2704define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) { 2705; CHECK-FAST-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: 2706; CHECK-FAST: # %bb.0: 2707; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,1] 2708; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0 2709; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2710; CHECK-FAST-NEXT: vzeroupper 2711; CHECK-FAST-NEXT: retq 2712; 2713; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: 2714; CHECK-FAST-PERLANE: # %bb.0: 2715; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm0 2716; CHECK-FAST-PERLANE-NEXT: vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3] 2717; CHECK-FAST-PERLANE-NEXT: retq 2718 %vec = load <8 x i64>, ptr %vp 2719 %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1> 2720 ret <2 x i64> %res 2721} 2722define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) { 2723; CHECK-FAST-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: 2724; CHECK-FAST: # %bb.0: 2725; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2726; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,1] 2727; CHECK-FAST-NEXT: vptestnmq %xmm1, %xmm1, %k1 2728; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} 2729; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2730; CHECK-FAST-NEXT: vzeroupper 2731; CHECK-FAST-NEXT: retq 2732; 2733; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: 2734; CHECK-FAST-PERLANE: # %bb.0: 2735; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 2736; CHECK-FAST-PERLANE-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3] 2737; 
CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm1, %xmm1, %k1 2738; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1} 2739; CHECK-FAST-PERLANE-NEXT: retq 2740 %vec = load <8 x i64>, ptr %vp 2741 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1> 2742 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 2743 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 2744 ret <2 x i64> %res 2745} 2746 2747define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) { 2748; CHECK-FAST-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: 2749; CHECK-FAST: # %bb.0: 2750; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,1] 2751; CHECK-FAST-NEXT: vptestnmq %xmm0, %xmm0, %k1 2752; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} 2753; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2754; CHECK-FAST-NEXT: vzeroupper 2755; CHECK-FAST-NEXT: retq 2756; 2757; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: 2758; CHECK-FAST-PERLANE: # %bb.0: 2759; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm1 2760; CHECK-FAST-PERLANE-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3] 2761; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm0, %xmm0, %k1 2762; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} 2763; CHECK-FAST-PERLANE-NEXT: retq 2764 %vec = load <8 x i64>, ptr %vp 2765 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1> 2766 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 2767 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer 2768 ret <2 x i64> %res 2769} 2770 2771define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) { 2772; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1: 2773; CHECK: # %bb.0: 2774; CHECK-NEXT: vmovdqa 48(%rdi), %xmm2 2775; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 2776; CHECK-NEXT: vpunpcklqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0] 2777; 
CHECK-NEXT: retq 2778 %vec = load <8 x i64>, ptr %vp 2779 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2> 2780 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 2781 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2 2782 ret <2 x i64> %res 2783} 2784 2785define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) { 2786; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1: 2787; CHECK: # %bb.0: 2788; CHECK-NEXT: vmovdqa 48(%rdi), %xmm1 2789; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1 2790; CHECK-NEXT: vpunpcklqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0] 2791; CHECK-NEXT: retq 2792 %vec = load <8 x i64>, ptr %vp 2793 %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2> 2794 %cmp = icmp eq <2 x i64> %mask, zeroinitializer 2795 %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer 2796 ret <2 x i64> %res 2797} 2798 2799define <4 x float> @test_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) { 2800; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask0: 2801; CHECK: # %bb.0: 2802; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 2803; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1] 2804; CHECK-NEXT: vzeroupper 2805; CHECK-NEXT: retq 2806 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5> 2807 ret <4 x float> %res 2808} 2809define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 2810; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0: 2811; CHECK: # %bb.0: 2812; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 2813; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 2814; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 2815; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1] 2816; CHECK-NEXT: vmovaps %xmm1, %xmm0 2817; CHECK-NEXT: vzeroupper 2818; CHECK-NEXT: retq 2819 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> 
<i32 0, i32 3, i32 4, i32 5> 2820 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2821 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 2822 ret <4 x float> %res 2823} 2824 2825define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %mask) { 2826; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0: 2827; CHECK: # %bb.0: 2828; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 2829; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2830; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 2831; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1] 2832; CHECK-NEXT: vzeroupper 2833; CHECK-NEXT: retq 2834 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5> 2835 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2836 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 2837 ret <4 x float> %res 2838} 2839define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 2840; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1: 2841; CHECK: # %bb.0: 2842; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 2843; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,3,5,0] 2844; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 2845; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 2846; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} 2847; CHECK-NEXT: vmovaps %xmm1, %xmm0 2848; CHECK-NEXT: vzeroupper 2849; CHECK-NEXT: retq 2850 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0> 2851 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2852 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 2853 ret <4 x float> %res 2854} 2855 2856define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) { 2857; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1: 2858; CHECK: # %bb.0: 2859; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,3,5,0] 2860; 
CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2861; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 2862; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} 2863; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2864; CHECK-NEXT: vzeroupper 2865; CHECK-NEXT: retq 2866 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0> 2867 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2868 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 2869 ret <4 x float> %res 2870} 2871define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 2872; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2: 2873; CHECK: # %bb.0: 2874; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 2875; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,2,7,0] 2876; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 2877; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 2878; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} 2879; CHECK-NEXT: vmovaps %xmm1, %xmm0 2880; CHECK-NEXT: vzeroupper 2881; CHECK-NEXT: retq 2882 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0> 2883 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2884 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 2885 ret <4 x float> %res 2886} 2887 2888define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) { 2889; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2: 2890; CHECK: # %bb.0: 2891; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,2,7,0] 2892; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2893; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 2894; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} 2895; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2896; CHECK-NEXT: vzeroupper 2897; CHECK-NEXT: retq 2898 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0> 2899 %cmp = fcmp oeq <4 x float> %mask, 
zeroinitializer 2900 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 2901 ret <4 x float> %res 2902} 2903define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) { 2904; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3: 2905; CHECK: # %bb.0: 2906; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,5,2] 2907; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 2908; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2909; CHECK-NEXT: vzeroupper 2910; CHECK-NEXT: retq 2911 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2> 2912 ret <4 x float> %res 2913} 2914define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 2915; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3: 2916; CHECK: # %bb.0: 2917; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 2918; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,3,5,2] 2919; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 2920; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 2921; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} 2922; CHECK-NEXT: vmovaps %xmm1, %xmm0 2923; CHECK-NEXT: vzeroupper 2924; CHECK-NEXT: retq 2925 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2> 2926 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2927 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 2928 ret <4 x float> %res 2929} 2930 2931define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) { 2932; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3: 2933; CHECK: # %bb.0: 2934; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,3,5,2] 2935; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2936; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 2937; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} 2938; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2939; CHECK-NEXT: vzeroupper 2940; CHECK-NEXT: retq 2941 %shuf = shufflevector <8 x float> 
%vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2> 2942 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2943 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 2944 ret <4 x float> %res 2945} 2946define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) { 2947; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0: 2948; CHECK: # %bb.0: 2949; CHECK-NEXT: vmovaps 16(%rdi), %xmm1 2950; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [2,6,0,1] 2951; CHECK-NEXT: vpermi2ps (%rdi), %xmm1, %xmm0 2952; CHECK-NEXT: retq 2953 %vec = load <8 x float>, ptr %vp 2954 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5> 2955 ret <4 x float> %res 2956} 2957define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { 2958; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0: 2959; CHECK: # %bb.0: 2960; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 2961; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,6,0,1] 2962; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3 2963; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 2964; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 2965; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} 2966; CHECK-NEXT: retq 2967 %vec = load <8 x float>, ptr %vp 2968 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5> 2969 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2970 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 2971 ret <4 x float> %res 2972} 2973 2974define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) { 2975; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0: 2976; CHECK: # %bb.0: 2977; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 2978; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,6,0,1] 2979; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 2980; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 2981; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z} 2982; CHECK-NEXT: 
vmovaps %xmm1, %xmm0 2983; CHECK-NEXT: retq 2984 %vec = load <8 x float>, ptr %vp 2985 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5> 2986 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 2987 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 2988 ret <4 x float> %res 2989} 2990 2991define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { 2992; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1: 2993; CHECK: # %bb.0: 2994; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 2995; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,7,7,2] 2996; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm3 2997; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 2998; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 2999; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} 3000; CHECK-NEXT: retq 3001 %vec = load <8 x float>, ptr %vp 3002 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6> 3003 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3004 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3005 ret <4 x float> %res 3006} 3007 3008define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) { 3009; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1: 3010; CHECK: # %bb.0: 3011; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 3012; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,7,7,2] 3013; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3014; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 3015; CHECK-NEXT: vpermi2ps (%rdi), %xmm2, %xmm1 {%k1} {z} 3016; CHECK-NEXT: vmovaps %xmm1, %xmm0 3017; CHECK-NEXT: retq 3018 %vec = load <8 x float>, ptr %vp 3019 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6> 3020 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3021 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3022 ret <4 x float> %res 3023} 3024 3025define 
<4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { 3026; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2: 3027; CHECK: # %bb.0: 3028; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 3029; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,1,3,7] 3030; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3031; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 3032; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} 3033; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3034; CHECK-NEXT: vzeroupper 3035; CHECK-NEXT: retq 3036 %vec = load <8 x float>, ptr %vp 3037 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7> 3038 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3039 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3040 ret <4 x float> %res 3041} 3042 3043define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) { 3044; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2: 3045; CHECK: # %bb.0: 3046; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,1,3,7] 3047; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3048; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 3049; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} 3050; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3051; CHECK-NEXT: vzeroupper 3052; CHECK-NEXT: retq 3053 %vec = load <8 x float>, ptr %vp 3054 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7> 3055 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3056 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3057 ret <4 x float> %res 3058} 3059 3060define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) { 3061; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3: 3062; CHECK: # %bb.0: 3063; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,3,5,3] 3064; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 3065; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 
killed $ymm0 3066; CHECK-NEXT: vzeroupper 3067; CHECK-NEXT: retq 3068 %vec = load <8 x float>, ptr %vp 3069 %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3> 3070 ret <4 x float> %res 3071} 3072define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { 3073; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3: 3074; CHECK: # %bb.0: 3075; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 3076; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,3,5,3] 3077; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3078; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 3079; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} 3080; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3081; CHECK-NEXT: vzeroupper 3082; CHECK-NEXT: retq 3083 %vec = load <8 x float>, ptr %vp 3084 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3> 3085 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3086 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3087 ret <4 x float> %res 3088} 3089 3090define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) { 3091; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3: 3092; CHECK: # %bb.0: 3093; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,3] 3094; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3095; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 3096; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} 3097; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3098; CHECK-NEXT: vzeroupper 3099; CHECK-NEXT: retq 3100 %vec = load <8 x float>, ptr %vp 3101 %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3> 3102 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3103 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3104 ret <4 x float> %res 3105} 3106 3107define <8 x float> 
@test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) { 3108; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0: 3109; CHECK: # %bb.0: 3110; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7] 3111; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 3112; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3113; CHECK-NEXT: retq 3114 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7> 3115 ret <8 x float> %res 3116} 3117define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { 3118; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0: 3119; CHECK: # %bb.0: 3120; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 3121; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,12,10,8,2,11,7] 3122; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 3123; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 3124; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} 3125; CHECK-NEXT: vmovaps %ymm1, %ymm0 3126; CHECK-NEXT: retq 3127 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7> 3128 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3129 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 3130 ret <8 x float> %res 3131} 3132 3133define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) { 3134; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask0: 3135; CHECK: # %bb.0: 3136; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7] 3137; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3138; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 3139; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} 3140; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3141; CHECK-NEXT: retq 3142 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7> 3143 %cmp = fcmp oeq <8 x float> 
%mask, zeroinitializer 3144 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 3145 ret <8 x float> %res 3146} 3147define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { 3148; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1: 3149; CHECK: # %bb.0: 3150; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 3151; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [10,12,3,12,4,15,1,14] 3152; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 3153; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 3154; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} 3155; CHECK-NEXT: vmovaps %ymm1, %ymm0 3156; CHECK-NEXT: retq 3157 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14> 3158 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3159 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 3160 ret <8 x float> %res 3161} 3162 3163define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) { 3164; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1: 3165; CHECK: # %bb.0: 3166; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [10,12,3,12,4,15,1,14] 3167; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3168; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 3169; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} 3170; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3171; CHECK-NEXT: retq 3172 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14> 3173 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3174 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 3175 ret <8 x float> %res 3176} 3177define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { 3178; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2: 3179; CHECK: # %bb.0: 3180; 
CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 3181; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4] 3182; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 3183; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 3184; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} 3185; CHECK-NEXT: vmovaps %ymm1, %ymm0 3186; CHECK-NEXT: retq 3187 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4> 3188 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3189 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 3190 ret <8 x float> %res 3191} 3192 3193define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) { 3194; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2: 3195; CHECK: # %bb.0: 3196; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,4,8,9,6,1,4,4] 3197; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3198; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 3199; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} 3200; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3201; CHECK-NEXT: retq 3202 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4> 3203 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3204 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 3205 ret <8 x float> %res 3206} 3207define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) { 3208; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3: 3209; CHECK: # %bb.0: 3210; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [12,14,9,0,12,4,5,8] 3211; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 3212; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3213; CHECK-NEXT: retq 3214 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8> 3215 ret <8 x float> %res 3216} 3217define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 
x float> %vec, <8 x float> %vec2, <8 x float> %mask) { 3218; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3: 3219; CHECK: # %bb.0: 3220; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 3221; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [12,14,9,0,12,4,5,8] 3222; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 3223; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1 3224; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} 3225; CHECK-NEXT: vmovaps %ymm1, %ymm0 3226; CHECK-NEXT: retq 3227 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8> 3228 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3229 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 3230 ret <8 x float> %res 3231} 3232 3233define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) { 3234; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3: 3235; CHECK: # %bb.0: 3236; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [12,14,9,0,12,4,5,8] 3237; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3238; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 3239; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} 3240; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3241; CHECK-NEXT: retq 3242 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8> 3243 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3244 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 3245 ret <8 x float> %res 3246} 3247define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) { 3248; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0: 3249; CHECK: # %bb.0: 3250; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,8,9,10] 3251; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 3252; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 3253; CHECK-NEXT: vzeroupper 3254; CHECK-NEXT: retq 3255 %res = shufflevector <16 x float> %vec, <16 x float> 
undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10> 3256 ret <4 x float> %res 3257} 3258define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 3259; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0: 3260; CHECK: # %bb.0: 3261; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 3262; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [4,8,9,10] 3263; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 3264; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 3265; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} 3266; CHECK-NEXT: vmovaps %xmm1, %xmm0 3267; CHECK-NEXT: vzeroupper 3268; CHECK-NEXT: retq 3269 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10> 3270 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3271 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3272 ret <4 x float> %res 3273} 3274 3275define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) { 3276; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0: 3277; CHECK: # %bb.0: 3278; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,8,9,10] 3279; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3280; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 3281; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} 3282; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 3283; CHECK-NEXT: vzeroupper 3284; CHECK-NEXT: retq 3285 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10> 3286 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3287 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3288 ret <4 x float> %res 3289} 3290define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 3291; CHECK-FAST-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1: 3292; CHECK-FAST: # %bb.0: 3293; CHECK-FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 3294; 
CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} xmm3 = [8,6,10,6] 3295; CHECK-FAST-NEXT: vxorps %xmm4, %xmm4, %xmm4 3296; CHECK-FAST-NEXT: vcmpeqps %xmm4, %xmm2, %k1 3297; CHECK-FAST-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} 3298; CHECK-FAST-NEXT: vmovaps %xmm1, %xmm0 3299; CHECK-FAST-NEXT: vzeroupper 3300; CHECK-FAST-NEXT: retq 3301; 3302; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1: 3303; CHECK-FAST-PERLANE: # %bb.0: 3304; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3 3305; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 3306; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,6,2,6] 3307; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm0, %xmm3, %xmm4 3308; CHECK-FAST-PERLANE-NEXT: vxorps %xmm0, %xmm0, %xmm0 3309; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm0, %xmm2, %k1 3310; CHECK-FAST-PERLANE-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1} 3311; CHECK-FAST-PERLANE-NEXT: vzeroupper 3312; CHECK-FAST-PERLANE-NEXT: retq 3313 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6> 3314 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3315 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3316 ret <4 x float> %res 3317} 3318 3319define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) { 3320; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1: 3321; CHECK-FAST: # %bb.0: 3322; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [8,6,10,6] 3323; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 3324; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm1, %k1 3325; CHECK-FAST-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} 3326; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 3327; CHECK-FAST-NEXT: vzeroupper 3328; CHECK-FAST-NEXT: retq 3329; 3330; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1: 3331; CHECK-FAST-PERLANE: # %bb.0: 3332; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2 3333; CHECK-FAST-PERLANE-NEXT: vextractf128 
$1, %ymm0, %xmm3 3334; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,2,6] 3335; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4 3336; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm4, %xmm1, %k1 3337; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z} 3338; CHECK-FAST-PERLANE-NEXT: vzeroupper 3339; CHECK-FAST-PERLANE-NEXT: retq 3340 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6> 3341 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3342 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3343 ret <4 x float> %res 3344} 3345define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 3346; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2: 3347; CHECK: # %bb.0: 3348; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 3349; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5] 3350; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3351; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 3352; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} 3353; CHECK-NEXT: vmovaps %xmm1, %xmm0 3354; CHECK-NEXT: vzeroupper 3355; CHECK-NEXT: retq 3356 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5> 3357 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3358 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3359 ret <4 x float> %res 3360} 3361 3362define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) { 3363; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2: 3364; CHECK: # %bb.0: 3365; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 3366; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5] 3367; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3368; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 3369; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z} 3370; CHECK-NEXT: vzeroupper 3371; CHECK-NEXT: 
retq 3372 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5> 3373 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3374 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3375 ret <4 x float> %res 3376} 3377define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) { 3378; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3: 3379; CHECK: # %bb.0: 3380; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [10,2,11,6] 3381; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 3382; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 3383; CHECK-NEXT: vzeroupper 3384; CHECK-NEXT: retq 3385 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6> 3386 ret <4 x float> %res 3387} 3388define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { 3389; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3: 3390; CHECK: # %bb.0: 3391; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 3392; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [10,2,11,6] 3393; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4 3394; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1 3395; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} 3396; CHECK-NEXT: vmovaps %xmm1, %xmm0 3397; CHECK-NEXT: vzeroupper 3398; CHECK-NEXT: retq 3399 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6> 3400 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3401 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3402 ret <4 x float> %res 3403} 3404 3405define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) { 3406; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3: 3407; CHECK: # %bb.0: 3408; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [10,2,11,6] 3409; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3410; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 3411; CHECK-NEXT: vpermps %zmm0, 
%zmm2, %zmm0 {%k1} {z} 3412; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 3413; CHECK-NEXT: vzeroupper 3414; CHECK-NEXT: retq 3415 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6> 3416 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3417 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3418 ret <4 x float> %res 3419} 3420define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp) { 3421; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0: 3422; CHECK: # %bb.0: 3423; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4] 3424; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 3425; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3426; CHECK-NEXT: retq 3427 %vec = load <16 x float>, ptr %vp 3428 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4> 3429 ret <8 x float> %res 3430} 3431define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { 3432; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0: 3433; CHECK: # %bb.0: 3434; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3435; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [7,6,7,11,5,10,0,4] 3436; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3437; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 3438; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} 3439; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3440; CHECK-NEXT: retq 3441 %vec = load <16 x float>, ptr %vp 3442 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4> 3443 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3444 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 3445 ret <8 x float> %res 3446} 3447 3448define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(ptr %vp, <8 x float> %mask) { 3449; CHECK-LABEL: 
test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0: 3450; CHECK: # %bb.0: 3451; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4] 3452; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3453; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 3454; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} 3455; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3456; CHECK-NEXT: retq 3457 %vec = load <16 x float>, ptr %vp 3458 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4> 3459 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3460 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 3461 ret <8 x float> %res 3462} 3463 3464define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { 3465; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1: 3466; CHECK: # %bb.0: 3467; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3468; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm2 = [11,0,9,0,7,14,0,8] 3469; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3470; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1 3471; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} 3472; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3473; CHECK-NEXT: retq 3474 %vec = load <16 x float>, ptr %vp 3475 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8> 3476 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3477 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 3478 ret <8 x float> %res 3479} 3480 3481define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(ptr %vp, <8 x float> %mask) { 3482; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1: 3483; CHECK: # %bb.0: 3484; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8] 3485; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3486; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1 3487; CHECK-NEXT: vpermps (%rdi), 
%zmm1, %zmm0 {%k1} {z} 3488; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3489; CHECK-NEXT: retq 3490 %vec = load <16 x float>, ptr %vp 3491 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8> 3492 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3493 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 3494 ret <8 x float> %res 3495} 3496 3497define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { 3498; CHECK-FAST-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: 3499; CHECK-FAST: # %bb.0: 3500; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 3501; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} ymm3 = [9,5,2,3,2,8,8,1] 3502; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 3503; CHECK-FAST-NEXT: vxorps %xmm2, %xmm2, %xmm2 3504; CHECK-FAST-NEXT: vcmpeqps %ymm2, %ymm1, %k1 3505; CHECK-FAST-NEXT: vmovaps %ymm3, %ymm0 {%k1} 3506; CHECK-FAST-NEXT: retq 3507; 3508; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2: 3509; CHECK-FAST-PERLANE: # %bb.0: 3510; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 3511; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,13,10,11,10,0,0,9] 3512; CHECK-FAST-PERLANE-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3 3513; CHECK-FAST-PERLANE-NEXT: vxorps %xmm2, %xmm2, %xmm2 3514; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm2, %ymm1, %k1 3515; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm3, %ymm0 {%k1} 3516; CHECK-FAST-PERLANE-NEXT: retq 3517 %vec = load <16 x float>, ptr %vp 3518 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9> 3519 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3520 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 3521 ret <8 x float> %res 3522} 3523 3524define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(ptr %vp, <8 x float> %mask) { 3525; 
CHECK-FAST-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: 3526; CHECK-FAST: # %bb.0: 3527; CHECK-FAST-NEXT: vmovaps 32(%rdi), %ymm2 3528; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [9,5,2,3,2,8,8,1] 3529; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3 3530; CHECK-FAST-NEXT: vcmpeqps %ymm3, %ymm0, %k1 3531; CHECK-FAST-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} 3532; CHECK-FAST-NEXT: vmovaps %ymm1, %ymm0 3533; CHECK-FAST-NEXT: retq 3534; 3535; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2: 3536; CHECK-FAST-PERLANE: # %bb.0: 3537; CHECK-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 3538; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,13,10,11,10,0,0,9] 3539; CHECK-FAST-PERLANE-NEXT: vxorps %xmm3, %xmm3, %xmm3 3540; CHECK-FAST-PERLANE-NEXT: vcmpeqps %ymm3, %ymm0, %k1 3541; CHECK-FAST-PERLANE-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z} 3542; CHECK-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm0 3543; CHECK-FAST-PERLANE-NEXT: retq 3544 %vec = load <16 x float>, ptr %vp 3545 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9> 3546 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3547 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 3548 ret <8 x float> %res 3549} 3550 3551define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp) { 3552; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3: 3553; CHECK: # %bb.0: 3554; CHECK-NEXT: vmovaps 32(%rdi), %ymm1 3555; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9] 3556; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm0 3557; CHECK-NEXT: retq 3558 %vec = load <16 x float>, ptr %vp 3559 %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1> 3560 ret <8 x float> %res 3561} 3562define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %vec2, <8 x float> %mask) { 3563; 
CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3: 3564; CHECK: # %bb.0: 3565; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 3566; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [7,5,3,3,11,4,12,9] 3567; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 3568; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3569; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 3570; CHECK-NEXT: vmovaps %ymm3, %ymm0 {%k1} 3571; CHECK-NEXT: retq 3572 %vec = load <16 x float>, ptr %vp 3573 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1> 3574 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3575 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 3576 ret <8 x float> %res 3577} 3578 3579define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8 x float> %mask) { 3580; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3: 3581; CHECK: # %bb.0: 3582; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 3583; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9] 3584; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3585; CHECK-NEXT: vcmpeqps %ymm3, %ymm0, %k1 3586; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} 3587; CHECK-NEXT: vmovaps %ymm1, %ymm0 3588; CHECK-NEXT: retq 3589 %vec = load <16 x float>, ptr %vp 3590 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1> 3591 %cmp = fcmp oeq <8 x float> %mask, zeroinitializer 3592 %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer 3593 ret <8 x float> %res 3594} 3595 3596define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) { 3597; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0: 3598; CHECK: # %bb.0: 3599; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm1 # ymm1 = mem[3,1,2,3] 3600; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,7,3] 3601; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0 3602; CHECK-NEXT: vzeroupper 3603; CHECK-NEXT: retq 3604 %vec = load 
<16 x float>, ptr %vp 3605 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11> 3606 ret <4 x float> %res 3607} 3608define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { 3609; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0: 3610; CHECK: # %bb.0: 3611; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3] 3612; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,6,7,3] 3613; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3 3614; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3615; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 3616; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} 3617; CHECK-NEXT: vzeroupper 3618; CHECK-NEXT: retq 3619 %vec = load <16 x float>, ptr %vp 3620 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11> 3621 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3622 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3623 ret <4 x float> %res 3624} 3625 3626define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) { 3627; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0: 3628; CHECK: # %bb.0: 3629; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3] 3630; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,6,7,3] 3631; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3632; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 3633; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z} 3634; CHECK-NEXT: vmovaps %xmm1, %xmm0 3635; CHECK-NEXT: vzeroupper 3636; CHECK-NEXT: retq 3637 %vec = load <16 x float>, ptr %vp 3638 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11> 3639 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3640 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3641 ret <4 x float> %res 3642} 3643 3644define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(ptr 
%vp, <4 x float> %vec2, <4 x float> %mask) { 3645; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1: 3646; CHECK: # %bb.0: 3647; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 3648; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,10,6,15,0,0,0,0] 3649; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm3 3650; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3651; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 3652; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} 3653; CHECK-NEXT: vzeroupper 3654; CHECK-NEXT: retq 3655 %vec = load <16 x float>, ptr %vp 3656 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7> 3657 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3658 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3659 ret <4 x float> %res 3660} 3661 3662define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(ptr %vp, <4 x float> %mask) { 3663; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1: 3664; CHECK: # %bb.0: 3665; CHECK-NEXT: vmovaps 32(%rdi), %ymm2 3666; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,10,6,15,0,0,0,0] 3667; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3668; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 3669; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z} 3670; CHECK-NEXT: vmovaps %xmm1, %xmm0 3671; CHECK-NEXT: vzeroupper 3672; CHECK-NEXT: retq 3673 %vec = load <16 x float>, ptr %vp 3674 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7> 3675 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3676 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3677 ret <4 x float> %res 3678} 3679 3680define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { 3681; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2: 3682; CHECK: # %bb.0: 3683; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [4,14,4,14] 3684; CHECK-NEXT: # xmm2 = mem[0,0] 3685; CHECK-NEXT: vmovaps 32(%rdi), %ymm3 3686; 
CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm3 3687; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3688; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 3689; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} 3690; CHECK-NEXT: vzeroupper 3691; CHECK-NEXT: retq 3692 %vec = load <16 x float>, ptr %vp 3693 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6> 3694 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3695 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3696 ret <4 x float> %res 3697} 3698 3699define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(ptr %vp, <4 x float> %mask) { 3700; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2: 3701; CHECK: # %bb.0: 3702; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [4,14,4,14] 3703; CHECK-NEXT: # xmm2 = mem[0,0] 3704; CHECK-NEXT: vmovaps 32(%rdi), %ymm1 3705; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3706; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1 3707; CHECK-NEXT: vpermt2ps (%rdi), %ymm2, %ymm1 {%k1} {z} 3708; CHECK-NEXT: vmovaps %xmm1, %xmm0 3709; CHECK-NEXT: vzeroupper 3710; CHECK-NEXT: retq 3711 %vec = load <16 x float>, ptr %vp 3712 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6> 3713 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3714 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3715 ret <4 x float> %res 3716} 3717 3718define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp) { 3719; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3: 3720; CHECK: # %bb.0: 3721; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,3,15,9] 3722; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 3723; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 3724; CHECK-NEXT: vzeroupper 3725; CHECK-NEXT: retq 3726 %vec = load <16 x float>, ptr %vp 3727 %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9> 3728 ret <4 x float> %res 3729} 3730define <4 x float> 
@test_masked_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %vec2, <4 x float> %mask) { 3731; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3: 3732; CHECK: # %bb.0: 3733; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 3734; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,3,15,9] 3735; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3 3736; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 3737; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} 3738; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 3739; CHECK-NEXT: vzeroupper 3740; CHECK-NEXT: retq 3741 %vec = load <16 x float>, ptr %vp 3742 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9> 3743 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3744 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2 3745 ret <4 x float> %res 3746} 3747 3748define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(ptr %vp, <4 x float> %mask) { 3749; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3: 3750; CHECK: # %bb.0: 3751; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,3,15,9] 3752; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 3753; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 3754; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} 3755; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 3756; CHECK-NEXT: vzeroupper 3757; CHECK-NEXT: retq 3758 %vec = load <16 x float>, ptr %vp 3759 %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9> 3760 %cmp = fcmp oeq <4 x float> %mask, zeroinitializer 3761 %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer 3762 ret <4 x float> %res 3763} 3764 3765define <2 x double> @test_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec) { 3766; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mask0: 3767; CHECK: # %bb.0: 3768; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3] 3769; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3770; CHECK-NEXT: 
vzeroupper 3771; CHECK-NEXT: retq 3772 %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0> 3773 ret <2 x double> %res 3774} 3775define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) { 3776; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask0: 3777; CHECK: # %bb.0: 3778; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 3779; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 3780; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1 3781; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,3] 3782; CHECK-NEXT: vmovapd %xmm1, %xmm0 3783; CHECK-NEXT: vzeroupper 3784; CHECK-NEXT: retq 3785 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0> 3786 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 3787 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2 3788 ret <2 x double> %res 3789} 3790 3791define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %mask) { 3792; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask0: 3793; CHECK: # %bb.0: 3794; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 3795; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1 3796; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3] 3797; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3798; CHECK-NEXT: vzeroupper 3799; CHECK-NEXT: retq 3800 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0> 3801 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 3802 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer 3803 ret <2 x double> %res 3804} 3805define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) { 3806; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask1: 3807; CHECK: # %bb.0: 3808; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 3809; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 3810; 
CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1 3811; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[1,3,2,3] 3812; CHECK-NEXT: vmovapd %xmm1, %xmm0 3813; CHECK-NEXT: vzeroupper 3814; CHECK-NEXT: retq 3815 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3> 3816 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 3817 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2 3818 ret <2 x double> %res 3819} 3820 3821define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %mask) { 3822; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask1: 3823; CHECK: # %bb.0: 3824; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 3825; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1 3826; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,2,3] 3827; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3828; CHECK-NEXT: vzeroupper 3829; CHECK-NEXT: retq 3830 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3> 3831 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 3832 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer 3833 ret <2 x double> %res 3834} 3835define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) { 3836; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0: 3837; CHECK: # %bb.0: 3838; CHECK-NEXT: vmovaps (%rdi), %xmm0 3839; CHECK-NEXT: vblendps $3, 16(%rdi), %xmm0, %xmm0 # xmm0 = mem[0,1],xmm0[2,3] 3840; CHECK-NEXT: retq 3841 %vec = load <4 x double>, ptr %vp 3842 %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1> 3843 ret <2 x double> %res 3844} 3845define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) { 3846; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0: 3847; CHECK: # %bb.0: 3848; CHECK-NEXT: vmovapd (%rdi), %xmm2 3849; CHECK-NEXT: vblendpd $1, 16(%rdi), %xmm2, %xmm2 # xmm2 = mem[0],xmm2[1] 3850; 
CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 3851; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 3852; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1} 3853; CHECK-NEXT: retq 3854 %vec = load <4 x double>, ptr %vp 3855 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1> 3856 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 3857 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2 3858 ret <2 x double> %res 3859} 3860 3861define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) { 3862; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0: 3863; CHECK: # %bb.0: 3864; CHECK-NEXT: vmovapd (%rdi), %xmm1 3865; CHECK-NEXT: vblendpd $1, 16(%rdi), %xmm1, %xmm1 # xmm1 = mem[0],xmm1[1] 3866; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 3867; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1 3868; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z} 3869; CHECK-NEXT: retq 3870 %vec = load <4 x double>, ptr %vp 3871 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1> 3872 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 3873 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer 3874 ret <2 x double> %res 3875} 3876 3877define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) { 3878; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1: 3879; CHECK: # %bb.0: 3880; CHECK-NEXT: vmovapd 16(%rdi), %xmm2 3881; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 3882; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 3883; CHECK-NEXT: vunpcklpd (%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0] 3884; CHECK-NEXT: retq 3885 %vec = load <4 x double>, ptr %vp 3886 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0> 3887 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 3888 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2 3889 ret <2 x double> %res 3890} 3891 3892define 
<2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) { 3893; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1: 3894; CHECK: # %bb.0: 3895; CHECK-NEXT: vmovapd 16(%rdi), %xmm1 3896; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 3897; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1 3898; CHECK-NEXT: vunpcklpd (%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0] 3899; CHECK-NEXT: retq 3900 %vec = load <4 x double>, ptr %vp 3901 %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0> 3902 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 3903 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer 3904 ret <2 x double> %res 3905} 3906 3907define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) { 3908; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0: 3909; CHECK: # %bb.0: 3910; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,3,7,3] 3911; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 3912; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3913; CHECK-NEXT: retq 3914 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3> 3915 ret <4 x double> %res 3916} 3917define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { 3918; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0: 3919; CHECK: # %bb.0: 3920; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 3921; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,3,7,3] 3922; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 3923; CHECK-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 3924; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} 3925; CHECK-NEXT: vmovapd %ymm1, %ymm0 3926; CHECK-NEXT: retq 3927 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3> 3928 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 3929 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 3930 
ret <4 x double> %res 3931} 3932 3933define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) { 3934; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0: 3935; CHECK: # %bb.0: 3936; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,3,7,3] 3937; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 3938; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 3939; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} 3940; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3941; CHECK-NEXT: retq 3942 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3> 3943 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 3944 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 3945 ret <4 x double> %res 3946} 3947define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { 3948; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1: 3949; CHECK: # %bb.0: 3950; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 3951; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,0,7,6] 3952; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 3953; CHECK-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 3954; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} 3955; CHECK-NEXT: vmovapd %ymm1, %ymm0 3956; CHECK-NEXT: retq 3957 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6> 3958 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 3959 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 3960 ret <4 x double> %res 3961} 3962 3963define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %mask) { 3964; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask1: 3965; CHECK: # %bb.0: 3966; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [2,0,7,6] 3967; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 3968; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 3969; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} 
{z} 3970; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3971; CHECK-NEXT: retq 3972 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6> 3973 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 3974 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 3975 ret <4 x double> %res 3976} 3977define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { 3978; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask2: 3979; CHECK: # %bb.0: 3980; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 3981; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 3982; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,0] 3983; CHECK-NEXT: vmovapd %ymm1, %ymm0 3984; CHECK-NEXT: retq 3985 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0> 3986 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 3987 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 3988 ret <4 x double> %res 3989} 3990 3991define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %mask) { 3992; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask2: 3993; CHECK: # %bb.0: 3994; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 3995; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 3996; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,0] 3997; CHECK-NEXT: retq 3998 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0> 3999 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4000 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 4001 ret <4 x double> %res 4002} 4003define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) { 4004; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3: 4005; CHECK: # %bb.0: 4006; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,1,4] 4007; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 4008; 
CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 4009; CHECK-NEXT: retq 4010 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4> 4011 ret <4 x double> %res 4012} 4013define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { 4014; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3: 4015; CHECK: # %bb.0: 4016; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 4017; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,2,1,4] 4018; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 4019; CHECK-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 4020; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} 4021; CHECK-NEXT: vmovapd %ymm1, %ymm0 4022; CHECK-NEXT: retq 4023 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4> 4024 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4025 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 4026 ret <4 x double> %res 4027} 4028 4029define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) { 4030; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3: 4031; CHECK: # %bb.0: 4032; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,1,4] 4033; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4034; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 4035; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} 4036; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 4037; CHECK-NEXT: retq 4038 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4> 4039 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4040 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 4041 ret <4 x double> %res 4042} 4043define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { 4044; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4: 4045; 
CHECK-FAST: # %bb.0: 4046; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,5] 4047; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0 4048; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4049; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 4050; CHECK-FAST-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1] 4051; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0 4052; CHECK-FAST-NEXT: retq 4053; 4054; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4: 4055; CHECK-FAST-PERLANE: # %bb.0: 4056; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3 4057; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] 4058; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4059; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 4060; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1] 4061; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0 4062; CHECK-FAST-PERLANE-NEXT: retq 4063 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5> 4064 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4065 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 4066 ret <4 x double> %res 4067} 4068 4069define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) { 4070; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4: 4071; CHECK-FAST: # %bb.0: 4072; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,5] 4073; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 4074; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2 4075; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 4076; CHECK-FAST-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1] 4077; CHECK-FAST-NEXT: retq 4078; 4079; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4: 4080; CHECK-FAST-PERLANE: # %bb.0: 4081; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2 4082; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] 4083; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2 
4084; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 4085; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1] 4086; CHECK-FAST-PERLANE-NEXT: retq 4087 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5> 4088 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4089 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 4090 ret <4 x double> %res 4091} 4092define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { 4093; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5: 4094; CHECK: # %bb.0: 4095; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 4096; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,6,2,2] 4097; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 4098; CHECK-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 4099; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} 4100; CHECK-NEXT: vmovapd %ymm1, %ymm0 4101; CHECK-NEXT: retq 4102 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2> 4103 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4104 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 4105 ret <4 x double> %res 4106} 4107 4108define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) { 4109; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5: 4110; CHECK: # %bb.0: 4111; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [2,6,2,2] 4112; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4113; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 4114; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} 4115; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 4116; CHECK-NEXT: retq 4117 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2> 4118 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4119 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 4120 ret <4 
x double> %res 4121} 4122define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) { 4123; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mask6: 4124; CHECK-FAST: # %bb.0: 4125; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [5,0,7,0] 4126; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0 4127; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 4128; CHECK-FAST-NEXT: retq 4129; 4130; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mask6: 4131; CHECK-FAST-PERLANE: # %bb.0: 4132; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1 4133; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 4134; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] 4135; CHECK-FAST-PERLANE-NEXT: retq 4136 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0> 4137 ret <4 x double> %res 4138} 4139define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { 4140; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6: 4141; CHECK-FAST: # %bb.0: 4142; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 4143; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [5,0,7,0] 4144; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4 4145; CHECK-FAST-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 4146; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} 4147; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0 4148; CHECK-FAST-NEXT: retq 4149; 4150; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6: 4151; CHECK-FAST-PERLANE: # %bb.0: 4152; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm3 4153; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 4154; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4 4155; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 4156; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm3[1],ymm0[1],ymm3[3],ymm0[3] 4157; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0 4158; CHECK-FAST-PERLANE-NEXT: 
retq 4159 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0> 4160 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4161 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 4162 ret <4 x double> %res 4163} 4164 4165define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) { 4166; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6: 4167; CHECK-FAST: # %bb.0: 4168; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,0,7,0] 4169; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4170; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 4171; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} 4172; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 4173; CHECK-FAST-NEXT: retq 4174; 4175; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6: 4176; CHECK-FAST-PERLANE: # %bb.0: 4177; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm2 4178; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 4179; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4180; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 4181; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm2[1],ymm0[1],ymm2[3],ymm0[3] 4182; CHECK-FAST-PERLANE-NEXT: retq 4183 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0> 4184 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4185 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 4186 ret <4 x double> %res 4187} 4188define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { 4189; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7: 4190; CHECK-FAST: # %bb.0: 4191; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 4192; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,5,0,6] 4193; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4 4194; CHECK-FAST-NEXT: 
vcmpeqpd %ymm4, %ymm2, %k1 4195; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} 4196; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0 4197; CHECK-FAST-NEXT: retq 4198; 4199; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7: 4200; CHECK-FAST-PERLANE: # %bb.0: 4201; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm3 4202; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3] 4203; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4 4204; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm2, %k1 4205; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],ymm3[1],ymm0[2],ymm3[2] 4206; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0 4207; CHECK-FAST-PERLANE-NEXT: retq 4208 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6> 4209 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4210 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 4211 ret <4 x double> %res 4212} 4213 4214define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) { 4215; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7: 4216; CHECK-FAST: # %bb.0: 4217; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,5,0,6] 4218; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4219; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 4220; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} 4221; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 4222; CHECK-FAST-NEXT: retq 4223; 4224; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7: 4225; CHECK-FAST-PERLANE: # %bb.0: 4226; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm2 4227; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3] 4228; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4229; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 4230; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm2[1],ymm0[2],ymm2[2] 4231; CHECK-FAST-PERLANE-NEXT: retq 
4232 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6> 4233 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4234 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 4235 ret <4 x double> %res 4236} 4237define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) { 4238; CHECK-FAST-LABEL: test_8xdouble_to_2xdouble_perm_mask0: 4239; CHECK-FAST: # %bb.0: 4240; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,6] 4241; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0 4242; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 4243; CHECK-FAST-NEXT: vzeroupper 4244; CHECK-FAST-NEXT: retq 4245; 4246; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_2xdouble_perm_mask0: 4247; CHECK-FAST-PERLANE: # %bb.0: 4248; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm1 4249; CHECK-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4250; CHECK-FAST-PERLANE-NEXT: vzeroupper 4251; CHECK-FAST-PERLANE-NEXT: retq 4252 %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6> 4253 ret <2 x double> %res 4254} 4255define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) { 4256; CHECK-FAST-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0: 4257; CHECK-FAST: # %bb.0: 4258; CHECK-FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 4259; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm3 = [0,6] 4260; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4 4261; CHECK-FAST-NEXT: vcmpeqpd %xmm4, %xmm2, %k1 4262; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} 4263; CHECK-FAST-NEXT: vmovapd %xmm1, %xmm0 4264; CHECK-FAST-NEXT: vzeroupper 4265; CHECK-FAST-NEXT: retq 4266; 4267; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0: 4268; CHECK-FAST-PERLANE: # %bb.0: 4269; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm3 4270; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4 4271; 
CHECK-FAST-PERLANE-NEXT: vcmpeqpd %xmm4, %xmm2, %k1 4272; CHECK-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0] 4273; CHECK-FAST-PERLANE-NEXT: vmovapd %xmm1, %xmm0 4274; CHECK-FAST-PERLANE-NEXT: vzeroupper 4275; CHECK-FAST-PERLANE-NEXT: retq 4276 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6> 4277 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 4278 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2 4279 ret <2 x double> %res 4280} 4281 4282define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) { 4283; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0: 4284; CHECK-FAST: # %bb.0: 4285; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,6] 4286; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4287; CHECK-FAST-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 4288; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} 4289; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 4290; CHECK-FAST-NEXT: vzeroupper 4291; CHECK-FAST-NEXT: retq 4292; 4293; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0: 4294; CHECK-FAST-PERLANE: # %bb.0: 4295; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm2 4296; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4297; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 4298; CHECK-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0] 4299; CHECK-FAST-PERLANE-NEXT: vzeroupper 4300; CHECK-FAST-PERLANE-NEXT: retq 4301 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6> 4302 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 4303 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer 4304 ret <2 x double> %res 4305} 4306define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) { 4307; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1: 4308; 
CHECK: # %bb.0: 4309; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 4310; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,7] 4311; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4 4312; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1 4313; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} 4314; CHECK-NEXT: vmovapd %xmm1, %xmm0 4315; CHECK-NEXT: vzeroupper 4316; CHECK-NEXT: retq 4317 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7> 4318 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 4319 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2 4320 ret <2 x double> %res 4321} 4322 4323define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %mask) { 4324; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask1: 4325; CHECK: # %bb.0: 4326; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,7] 4327; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4328; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 4329; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} 4330; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 4331; CHECK-NEXT: vzeroupper 4332; CHECK-NEXT: retq 4333 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7> 4334 %cmp = fcmp oeq <2 x double> %mask, zeroinitializer 4335 %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer 4336 ret <2 x double> %res 4337} 4338define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp) { 4339; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0: 4340; CHECK: # %bb.0: 4341; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,6,7,2] 4342; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 4343; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 4344; CHECK-NEXT: retq 4345 %vec = load <8 x double>, ptr %vp 4346 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2> 4347 ret <4 x double> %res 4348} 4349define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x 
double> %vec2, <4 x double> %mask) { 4350; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0: 4351; CHECK: # %bb.0: 4352; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 4353; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,6,7,2] 4354; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4355; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 4356; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} 4357; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 4358; CHECK-NEXT: retq 4359 %vec = load <8 x double>, ptr %vp 4360 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2> 4361 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4362 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 4363 ret <4 x double> %res 4364} 4365 4366define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(ptr %vp, <4 x double> %mask) { 4367; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0: 4368; CHECK: # %bb.0: 4369; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,6,7,2] 4370; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 4371; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 4372; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} 4373; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 4374; CHECK-NEXT: retq 4375 %vec = load <8 x double>, ptr %vp 4376 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2> 4377 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4378 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 4379 ret <4 x double> %res 4380} 4381 4382define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { 4383; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1: 4384; CHECK-FAST: # %bb.0: 4385; CHECK-FAST-NEXT: vbroadcastsd 32(%rdi), %ymm2 4386; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,0,6,2] 4387; CHECK-FAST-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3 4388; CHECK-FAST-NEXT: 
vxorpd %xmm2, %xmm2, %xmm2 4389; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 4390; CHECK-FAST-NEXT: vmovapd %ymm3, %ymm0 {%k1} 4391; CHECK-FAST-NEXT: retq 4392; 4393; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1: 4394; CHECK-FAST-PERLANE: # %bb.0: 4395; CHECK-FAST-PERLANE-NEXT: vpermpd $236, (%rdi), %ymm2 # ymm2 = mem[0,3,2,3] 4396; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4397; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 4398; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 32(%rdi){1to4}, %ymm2, %ymm0 {%k1} 4399; CHECK-FAST-PERLANE-NEXT: retq 4400 %vec = load <8 x double>, ptr %vp 4401 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4> 4402 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4403 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 4404 ret <4 x double> %res 4405} 4406 4407define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(ptr %vp, <4 x double> %mask) { 4408; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1: 4409; CHECK-FAST: # %bb.0: 4410; CHECK-FAST-NEXT: vbroadcastsd 32(%rdi), %ymm2 4411; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,0,6,2] 4412; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4413; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 4414; CHECK-FAST-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z} 4415; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0 4416; CHECK-FAST-NEXT: retq 4417; 4418; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1: 4419; CHECK-FAST-PERLANE: # %bb.0: 4420; CHECK-FAST-PERLANE-NEXT: vpermpd $236, (%rdi), %ymm1 # ymm1 = mem[0,3,2,3] 4421; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2 4422; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 4423; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 32(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z} 4424; CHECK-FAST-PERLANE-NEXT: retq 4425 %vec = load <8 x double>, ptr %vp 4426 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, 
<4 x i32> <i32 3, i32 4, i32 2, i32 4> 4427 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4428 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 4429 ret <4 x double> %res 4430} 4431 4432define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { 4433; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2: 4434; CHECK-FAST: # %bb.0: 4435; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 4436; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,2,3,4] 4437; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4438; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 4439; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} 4440; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 4441; CHECK-FAST-NEXT: retq 4442; 4443; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2: 4444; CHECK-FAST-PERLANE: # %bb.0: 4445; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %ymm2 4446; CHECK-FAST-PERLANE-NEXT: vperm2f128 $33, 32(%rdi), %ymm2, %ymm3 # ymm3 = ymm2[2,3],mem[0,1] 4447; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4 4448; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm1, %k1 4449; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm2[1],ymm3[0],ymm2[3],ymm3[2] 4450; CHECK-FAST-PERLANE-NEXT: retq 4451 %vec = load <8 x double>, ptr %vp 4452 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4> 4453 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4454 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 4455 ret <4 x double> %res 4456} 4457 4458define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp, <4 x double> %mask) { 4459; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2: 4460; CHECK-FAST: # %bb.0: 4461; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [1,2,3,4] 4462; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2 4463; CHECK-FAST-NEXT: vcmpeqpd 
%ymm2, %ymm0, %k1 4464; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} 4465; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 4466; CHECK-FAST-NEXT: retq 4467; 4468; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2: 4469; CHECK-FAST-PERLANE: # %bb.0: 4470; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %ymm1 4471; CHECK-FAST-PERLANE-NEXT: vperm2f128 $33, 32(%rdi), %ymm1, %ymm2 # ymm2 = ymm1[2,3],mem[0,1] 4472; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4473; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 4474; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm1[1],ymm2[0],ymm1[3],ymm2[2] 4475; CHECK-FAST-PERLANE-NEXT: retq 4476 %vec = load <8 x double>, ptr %vp 4477 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4> 4478 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4479 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 4480 ret <4 x double> %res 4481} 4482 4483define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) { 4484; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3: 4485; CHECK: # %bb.0: 4486; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,2,1,0] 4487; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 4488; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 4489; CHECK-NEXT: retq 4490 %vec = load <8 x double>, ptr %vp 4491 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0> 4492 ret <4 x double> %res 4493} 4494define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { 4495; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3: 4496; CHECK: # %bb.0: 4497; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 4498; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [4,2,1,0] 4499; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4500; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 4501; CHECK-NEXT: vpermpd (%rdi), 
%zmm2, %zmm0 {%k1} 4502; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 4503; CHECK-NEXT: retq 4504 %vec = load <8 x double>, ptr %vp 4505 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0> 4506 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4507 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 4508 ret <4 x double> %res 4509} 4510 4511define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %mask) { 4512; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3: 4513; CHECK: # %bb.0: 4514; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,2,1,0] 4515; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 4516; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 4517; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} 4518; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 4519; CHECK-NEXT: retq 4520 %vec = load <8 x double>, ptr %vp 4521 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0> 4522 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4523 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 4524 ret <4 x double> %res 4525} 4526 4527define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { 4528; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4: 4529; CHECK: # %bb.0: 4530; CHECK-NEXT: vmovapd 32(%rdi), %ymm2 4531; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [2,4,1,5] 4532; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3 4533; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 4534; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 4535; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1} 4536; CHECK-NEXT: retq 4537 %vec = load <8 x double>, ptr %vp 4538 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1> 4539 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4540 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x 
double> %vec2 4541 ret <4 x double> %res 4542} 4543 4544define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(ptr %vp, <4 x double> %mask) { 4545; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4: 4546; CHECK: # %bb.0: 4547; CHECK-NEXT: vmovapd 32(%rdi), %ymm2 4548; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [2,4,1,5] 4549; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4550; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 4551; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z} 4552; CHECK-NEXT: vmovapd %ymm1, %ymm0 4553; CHECK-NEXT: retq 4554 %vec = load <8 x double>, ptr %vp 4555 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1> 4556 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4557 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 4558 ret <4 x double> %res 4559} 4560 4561define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { 4562; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5: 4563; CHECK: # %bb.0: 4564; CHECK-NEXT: vmovapd (%rdi), %ymm2 4565; CHECK-NEXT: vperm2f128 $33, 32(%rdi), %ymm2, %ymm2 # ymm2 = ymm2[2,3],mem[0,1] 4566; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4567; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 4568; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm2, %ymm0 {%k1} 4569; CHECK-NEXT: retq 4570 %vec = load <8 x double>, ptr %vp 4571 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5> 4572 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4573 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 4574 ret <4 x double> %res 4575} 4576 4577define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(ptr %vp, <4 x double> %mask) { 4578; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5: 4579; CHECK: # %bb.0: 4580; CHECK-NEXT: vmovapd (%rdi), %ymm1 4581; CHECK-NEXT: vperm2f128 $33, 32(%rdi), %ymm1, 
%ymm1 # ymm1 = ymm1[2,3],mem[0,1] 4582; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 4583; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 4584; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z} 4585; CHECK-NEXT: retq 4586 %vec = load <8 x double>, ptr %vp 4587 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5> 4588 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4589 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 4590 ret <4 x double> %res 4591} 4592 4593define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp) { 4594; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6: 4595; CHECK: # %bb.0: 4596; CHECK-NEXT: vmovapd 32(%rdi), %ymm1 4597; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,2,4,1] 4598; CHECK-NEXT: vpermi2pd (%rdi), %ymm1, %ymm0 4599; CHECK-NEXT: retq 4600 %vec = load <8 x double>, ptr %vp 4601 %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5> 4602 ret <4 x double> %res 4603} 4604define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { 4605; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6: 4606; CHECK: # %bb.0: 4607; CHECK-NEXT: vmovapd 32(%rdi), %ymm2 4608; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,2,4,1] 4609; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3 4610; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 4611; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 4612; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1} 4613; CHECK-NEXT: retq 4614 %vec = load <8 x double>, ptr %vp 4615 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5> 4616 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4617 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 4618 ret <4 x double> %res 4619} 4620 4621define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(ptr %vp, <4 x double> %mask) { 4622; CHECK-LABEL: 
test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6: 4623; CHECK: # %bb.0: 4624; CHECK-NEXT: vmovapd 32(%rdi), %ymm2 4625; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,1] 4626; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4627; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 4628; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z} 4629; CHECK-NEXT: vmovapd %ymm1, %ymm0 4630; CHECK-NEXT: retq 4631 %vec = load <8 x double>, ptr %vp 4632 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5> 4633 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4634 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer 4635 ret <4 x double> %res 4636} 4637 4638define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(ptr %vp, <4 x double> %vec2, <4 x double> %mask) { 4639; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask7: 4640; CHECK: # %bb.0: 4641; CHECK-NEXT: vmovapd (%rdi), %ymm2 4642; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 4643; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 4644; CHECK-NEXT: vunpcklpd 40(%rdi){1to4}, %ymm2, %ymm0 {%k1} 4645; CHECK-NEXT: retq 4646 %vec = load <8 x double>, ptr %vp 4647 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5> 4648 %cmp = fcmp oeq <4 x double> %mask, zeroinitializer 4649 %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 4650 ret <4 x double> %res 4651} 4652 4653define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(ptr %vp, <4 x double> %mask) { 4654; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7: 4655; CHECK: # %bb.0: 4656; CHECK-NEXT: vmovapd (%rdi), %ymm1 4657; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 4658; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 4659; CHECK-NEXT: vunpcklpd 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z} 4660; CHECK-NEXT: retq 4661 %vec = load <8 x double>, ptr %vp 4662 %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, 
i32 5>
  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
  ret <4 x double> %res
}

; Loads an <8 x double> from %vp and returns elements 1 and 6 as a <2 x double>.
; The assertions expect a 128-bit load from offset 0 followed by a vshufpd whose
; memory operand is at byte offset 48, combining the high element of the loaded
; register with the low element of the memory operand.
define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %xmm0
; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[0]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
  ret <2 x double> %res
}

; Merge-masked form of the <1, 6> extraction: lanes where %mask compares
; ordered-equal to 0.0 take the shuffled value, the remaining lanes keep %vec2.
; The assertions expect the compare result in %k1 and a vshufpd that writes
; %xmm0 under that mask.
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %xmm2
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[0]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
  ret <2 x double> %res
}

; Zero-masked form of the <1, 6> extraction: lanes where %mask compares
; ordered-equal to 0.0 take the shuffled value, the remaining lanes are zero.
; The assertions expect a vshufpd with both the {%k1} and {z} modifiers.
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovapd (%rdi), %xmm1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
; CHECK-NEXT:    vshufpd $1, 48(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[0]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
  ret <2 x double> %res
}

; Merge-masked extraction of elements 1 and 4 of an <8 x double> loaded from
; %vp; unselected lanes keep %vec2. The assertions expect element 1 to be
; loaded with vmovddup from byte offset 8 and paired with the element at byte
; offset 32 by a masked vunpcklpd.
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0]
; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm1, %k1
; CHECK-NEXT:    vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
  ret <2 x double> %res
}

; Zero-masked extraction of elements 1 and 4 of an <8 x double> loaded from
; %vp; unselected lanes are zero. Same expected instruction sequence as the
; merge-masked variant, with the {z} modifier on the vunpcklpd.
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0]
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm0, %k1
; CHECK-NEXT:    vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
; CHECK-NEXT:    retq
  %vec = load <8 x double>, ptr %vp
  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
  ret <2 x double> %res
}

; PR35977
; Builds an <8 x i16> from an <8 x i8> load one element at a time
; (extractelement, zext, insertelement for indices 0 through 7), shifts every
; lane left by 8 and stores the result to %arg1. The assertions expect the
; whole chain to become one vpmovzxbw from memory, one vpsllw $8 and one store.
; The scalarized form of the IR is the test input, so it must not be folded
; into a vector zext by hand.
define void @test_zext_v8i8_to_v8i16(ptr %arg, ptr %arg1) {
; CHECK-LABEL: test_zext_v8i8_to_v8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbw (%rdi), %xmm0 # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; CHECK-NEXT:    vpsllw $8, %xmm0, %xmm0
; CHECK-NEXT:    vmovdqa %xmm0, (%rsi)
; CHECK-NEXT:    retq
  %tmp2 = load <8 x i8>, ptr %arg
  %tmp3 = extractelement <8 x i8> %tmp2, i32 0
  %tmp4 = zext i8 %tmp3 to i16
  %tmp5 = insertelement <8 x i16> undef, i16 %tmp4, i32 0
  %tmp6 = extractelement <8 x i8> %tmp2, i32 1
  %tmp7 = zext i8 %tmp6 to i16
  %tmp8 = insertelement <8 x i16> %tmp5, i16 %tmp7, i32 1
  %tmp9 = extractelement <8 x i8> %tmp2, i32 2
  %tmp10 = zext i8 %tmp9 to i16
  %tmp11 = insertelement <8 x i16> %tmp8, i16 %tmp10, i32 2
  %tmp12 = extractelement <8 x i8> %tmp2, i32 3
  %tmp13 = zext i8 %tmp12 to i16
  %tmp14 = insertelement <8 x i16> %tmp11, i16 %tmp13, i32 3
  %tmp15 = extractelement <8 x i8> %tmp2, i32 4
  %tmp16 = zext i8 %tmp15 to i16
  %tmp17 = insertelement <8 x i16> %tmp14, i16 %tmp16, i32 4
  %tmp18 = extractelement <8 x i8> %tmp2, i32 5
  %tmp19 = zext i8 %tmp18 to i16
  %tmp20 = insertelement <8 x i16> %tmp17, i16 %tmp19, i32 5
  %tmp21 = extractelement <8 x i8> %tmp2, i32 6
  %tmp22 = zext i8 %tmp21 to i16
  %tmp23 = insertelement <8 x i16> %tmp20, i16 %tmp22, i32 6
  %tmp24 = extractelement <8 x i8> %tmp2, i32 7
  %tmp25 = zext i8 %tmp24 to i16
  %tmp26 = insertelement <8 x i16> %tmp23, i16 %tmp25, i32 7
  ; Shift each zero-extended byte into the high half of its 16-bit lane.
  %tmp27 = shl <8 x i16> %tmp26, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  store <8 x i16> %tmp27, ptr %arg1
  ret void
}