; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI

; PR31551
; Pairs of shufflevector:trunc functions with functional equivalence.
; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
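;
; The equivalence relies on x86-64 being little-endian: truncating each i16
; lane to i8 keeps the lane's low byte, which is the even-indexed byte of the
; same data reinterpreted as <64 x i8>. A minimal sketch of the pattern (not
; itself a test case):
;   %bc = bitcast <64 x i8> %vec to <32 x i16>
;   %t  = trunc <32 x i16> %bc to <32 x i8>
; produces the same bytes as the stride-2 shuffle
;   %s  = shufflevector <64 x i8> %vec, <64 x i8> undef,
;                       <32 x i32> <i32 0, i32 2, ..., i32 62>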

define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL-FAST-ALL:       # %bb.0:
; AVX512VL-FAST-ALL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-FAST-ALL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VL-FAST-ALL-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-FAST-ALL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-FAST-ALL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512VL-FAST-ALL-NEXT:    vzeroupper
; AVX512VL-FAST-ALL-NEXT:    retq
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL-FAST-PERLANE:       # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX512VL-FAST-PERLANE-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX512VL-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512VL-FAST-PERLANE-NEXT:    vzeroupper
; AVX512VL-FAST-PERLANE-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  store <32 x i8> %strided.vec, ptr %S
  ret void
}

define void @trunc_v32i16_to_v32i8(ptr %L, ptr %S) nounwind {
; AVX512F-LABEL: trunc_v32i16_to_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, 16(%rsi)
; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm1, 16(%rsi)
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %bc = bitcast <64 x i8> %vec to <32 x i16>
  %strided.vec = trunc <32 x i16> %bc to <32 x i8>
  store <32 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i16_to_v16i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, ptr %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i16> %strided.vec, ptr %S
  ret void
}

define void @trunc_v16i32_to_v16i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, ptr %L
  %bc = bitcast <32 x i16> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i16>
  store <16 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v16i32_to_v8i32(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v16i32_to_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <16 x i32>, ptr %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i32> %strided.vec, ptr %S
  ret void
}

define void @trunc_v8i64_to_v8i32(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <16 x i32>, ptr %L
  %bc = bitcast <16 x i32> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i32>
  store <8 x i32> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v64i8_to_v16i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

define void @trunc_v16i32_to_v16i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %bc = bitcast <64 x i8> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i8>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v32i16_to_v8i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, ptr %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

define void @trunc_v8i64_to_v8i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, ptr %L
  %bc = bitcast <32 x i16> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i16>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

define void @shuffle_v64i8_to_v8i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @trunc_v8i64_to_v8i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %bc = bitcast <64 x i8> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i8>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
; AVX512-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $8, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  ret <16 x i8> %res
}

define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21]
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0
; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21]
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0
; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMI-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
  ret <16 x i8> %res
}

; PR111611
define <32 x i8> @trunc_shuffle_v32i16_v32i8_ofs1(<32 x i16> %a0) {
; AVX512F-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    retq
;
; AVX512VL-FAST-ALL-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512VL-FAST-ALL:       # %bb.0:
; AVX512VL-FAST-ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512VL-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512VL-FAST-ALL-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,2,5,7]
; AVX512VL-FAST-ALL-NEXT:    vpermi2q %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-ALL-NEXT:    retq
;
; AVX512VL-FAST-PERLANE-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512VL-FAST-PERLANE:       # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512VL-FAST-PERLANE-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX512VL-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-FAST-PERLANE-NEXT:    retq
;
; AVX512BW-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512VBMI-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512VBMI-NEXT:    retq
  %bc = bitcast <32 x i16> %a0 to <64 x i8>
  %res = shufflevector <64 x i8> %bc, <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  ret <32 x i8> %res
}

define <4 x double> @PR34175(ptr %p) {
; AVX512F-LABEL: PR34175:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqu (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqu 32(%rdi), %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: PR34175:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqu (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqu 32(%rdi), %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: PR34175:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512BW-NEXT:    vpermw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512BW-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512BWVL-NEXT:    vpermw (%rdi), %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512BWVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: PR34175:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512VBMI-NEXT:    vpermw (%rdi), %zmm0, %zmm0
; AVX512VBMI-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VBMI-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512VBMI-NEXT:    retq
  %v = load <32 x i16>, ptr %p, align 2
  %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %tofp = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %tofp
}

define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %truncated = trunc <8 x i64> %vec to <8 x i8>
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result
}