; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=sse2    | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mattr=sse4.2  | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx     | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2    | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512F

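; Truncate a fptosi result down to <4 x i8> through a concat/extract shuffle
; pair, with the final byte forced to -1 (0xff) by an insertelement.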
define void @foo(<4 x float> %in, ptr %out) {
; SSE2-LABEL: foo:
; SSE2:       # %bb.0:
; SSE2-NEXT:    cvttps2dq %xmm0, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movd %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: foo:
; SSE42:       # %bb.0:
; SSE42-NEXT:    cvttps2dq %xmm0, %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movl $255, %eax
; SSE42-NEXT:    pinsrb $3, %eax, %xmm0
; SSE42-NEXT:    movd %xmm0, (%rdi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: foo:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    movl $255, %eax
; AVX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, (%rdi)
; AVX-NEXT:    retq
  %t0 = fptosi <4 x float> %in to <4 x i32>
  %t1 = trunc <4 x i32> %t0 to <4 x i16>
  %t2 = shufflevector <4 x i16> %t1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %t3 = trunc <8 x i16> %t2 to <8 x i8>
  %t4 = shufflevector <8 x i8> %t3, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %t5 = insertelement <4 x i8> %t4, i8 -1, i32 3
  store <4 x i8> %t5, ptr %out
  ret void
}

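; Concatenating %x with itself twice and then interleaving the lanes is
; equivalent to splatting each source element into four consecutive lanes.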
define <16 x i64> @catcat(<4 x i64> %x) {
; SSE-LABEL: catcat:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    movdqa %xmm1, 112(%rdi)
; SSE-NEXT:    movdqa %xmm1, 96(%rdi)
; SSE-NEXT:    movdqa %xmm3, 80(%rdi)
; SSE-NEXT:    movdqa %xmm3, 64(%rdi)
; SSE-NEXT:    movdqa %xmm0, 48(%rdi)
; SSE-NEXT:    movdqa %xmm0, 32(%rdi)
; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: catcat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm4
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    vmovddup {{.*#+}} ymm2 = ymm0[0,0,2,2]
; AVX1-NEXT:    vshufpd {{.*#+}} ymm3 = ymm0[1,1,3,3]
; AVX1-NEXT:    vmovaps %ymm4, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: catcat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[1,1,1,1]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[3,3,3,3]
; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: catcat:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1]
; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [2,2,2,2,3,3,3,3]
; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm1
; AVX512F-NEXT:    vmovaps %zmm2, %zmm0
; AVX512F-NEXT:    retq
  %cat1 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %cat2 = shufflevector <8 x i64> %cat1, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r = shufflevector <16 x i64> %cat2, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  ret <16 x i64> %r
}

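; Same splat-of-each-element pattern as @catcat, but with the source vector
; loaded from memory, which lets AVX1/AVX2 select four vbroadcastsd loads.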
define <16 x i64> @load_catcat(ptr %p) {
; SSE-LABEL: load_catcat:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa 16(%rsi), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT:    movdqa %xmm1, 112(%rdi)
; SSE-NEXT:    movdqa %xmm1, 96(%rdi)
; SSE-NEXT:    movdqa %xmm3, 80(%rdi)
; SSE-NEXT:    movdqa %xmm3, 64(%rdi)
; SSE-NEXT:    movdqa %xmm0, 48(%rdi)
; SSE-NEXT:    movdqa %xmm0, 32(%rdi)
; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
; SSE-NEXT:    movdqa %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: load_catcat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm0
; AVX1-NEXT:    vbroadcastsd 8(%rdi), %ymm1
; AVX1-NEXT:    vbroadcastsd 16(%rdi), %ymm2
; AVX1-NEXT:    vbroadcastsd 24(%rdi), %ymm3
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_catcat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
; AVX2-NEXT:    vbroadcastsd 8(%rdi), %ymm1
; AVX2-NEXT:    vbroadcastsd 16(%rdi), %ymm2
; AVX2-NEXT:    vbroadcastsd 24(%rdi), %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: load_catcat:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm0 = [0,4,0,4,1,5,1,5]
; AVX512F-NEXT:    vpermq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [2,6,2,6,3,7,3,7]
; AVX512F-NEXT:    vpermq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    retq
  %x = load <4 x i64>, ptr %p
  %cat1 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %cat2 = shufflevector <8 x i64> %cat1, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %r = shufflevector <16 x i64> %cat2, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
  ret <16 x i64> %r
}

; Use odd-sized vector types to make sure we do not miscompile a case where
; the size of the source operands is not an even multiple of the result size.

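; Elements 4-7 of the <12 x i32> concat straddle the source boundary: lanes
; 4-5 come from the tail of %x and lanes 6-7 from the head of %y.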
define <4 x i32> @cat_ext_straddle(ptr %px, ptr %py) {
; SSE-LABEL: cat_ext_straddle:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps 16(%rdi), %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: cat_ext_straddle:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps 16(%rdi), %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    retq
  %x = load <6 x i32>, ptr %px
  %y = load <6 x i32>, ptr %py
  %cat = shufflevector <6 x i32> %x, <6 x i32> %y, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %ext = shufflevector <12 x i32> %cat, <12 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %ext
}
171