; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5
; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE
; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX,AVX1
; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX,AVX2
; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX512,AVX512F
; RUN: llc -mattr=+avx512bw -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX512,AVX512BW

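; Scalar case: load a half, canonicalize it, and store it back. Without native
; FP16 support the SSE/AVX paths round-trip through __extendhfsf2/__truncsfhf2
; around a multiply by 1.0 (the canonicalize expansion); AVX512 instead
; converts through single precision with vcvtph2ps/vcvtps2ph.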
define void @v_test_canonicalize__half(half addrspace(1)* %out) nounwind {
; SSE-LABEL: v_test_canonicalize__half:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pushq %rbx
; SSE-NEXT:    subq $16, %rsp
; SSE-NEXT:    movq %rdi, %rbx
; SSE-NEXT:    pinsrw $0, (%rdi), %xmm0
; SSE-NEXT:    callq __extendhfsf2@PLT
; SSE-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    callq __extendhfsf2@PLT
; SSE-NEXT:    mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE-NEXT:    callq __truncsfhf2@PLT
; SSE-NEXT:    pextrw $0, %xmm0, %eax
; SSE-NEXT:    movw %ax, (%rbx)
; SSE-NEXT:    addq $16, %rsp
; SSE-NEXT:    popq %rbx
; SSE-NEXT:    retq
;
; AVX-LABEL: v_test_canonicalize__half:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $16, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT:    addq $16, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; AVX512-LABEL: v_test_canonicalize__half:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    movzwl (%rdi), %eax
; AVX512-NEXT:    movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
; AVX512-NEXT:    vmovd %ecx, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vmovd %eax, %xmm1
; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rdi)
; AVX512-NEXT:    retq
entry:
  %val = load half, half addrspace(1)* %out
  %canonicalized = call half @llvm.canonicalize.f16(half %val)
  store half %canonicalized, half addrspace(1)* %out
  ret void
}

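; Chained scalar half arithmetic (fsub, fadd, fsub) with llvm.canonicalize.f16
; applied to the intermediate result before the final subtraction; each step
; rounds through half, and the canonicalize is again expanded to a multiply
; by a constant-pool half (1.0).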
define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind {
; SSE-LABEL: complex_canonicalize_fmul_half:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pushq %rax
; SSE-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE-NEXT:    callq __extendhfsf2@PLT
; SSE-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
; SSE-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    callq __extendhfsf2@PLT
; SSE-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE-NEXT:    movss (%rsp), %xmm1 # 4-byte Reload
; SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    callq __truncsfhf2@PLT
; SSE-NEXT:    callq __extendhfsf2@PLT
; SSE-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
; SSE-NEXT:    addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE-NEXT:    callq __truncsfhf2@PLT
; SSE-NEXT:    callq __extendhfsf2@PLT
; SSE-NEXT:    subss (%rsp), %xmm0 # 4-byte Folded Reload
; SSE-NEXT:    callq __truncsfhf2@PLT
; SSE-NEXT:    callq __extendhfsf2@PLT
; SSE-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
; SSE-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    callq __extendhfsf2@PLT
; SSE-NEXT:    mulss (%rsp), %xmm0 # 4-byte Folded Reload
; SSE-NEXT:    callq __truncsfhf2@PLT
; SSE-NEXT:    callq __extendhfsf2@PLT
; SSE-NEXT:    subss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE-NEXT:    callq __truncsfhf2@PLT
; SSE-NEXT:    popq %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: complex_canonicalize_fmul_half:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    pushq %rax
; AVX-NEXT:    vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
; AVX-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; AVX-NEXT:    # xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX-NEXT:    vmovss (%rsp), %xmm1 # 4-byte Reload
; AVX-NEXT:    # xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
; AVX-NEXT:    vaddss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vsubss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
; AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmulss (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    popq %rax
; AVX-NEXT:    retq
;
; AVX512-LABEL: complex_canonicalize_fmul_half:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
; AVX512-NEXT:    vmovd %eax, %xmm2
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX512-NEXT:    retq
entry:
  %mul1 = fsub half %a, %b
  %add = fadd half %mul1, %b
  %mul2 = fsub half %add, %mul1
  %canonicalized = call half @llvm.canonicalize.f16(half %mul2)
  %result = fsub half %canonicalized, %b
  ret half %result
}

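; Vector case: canonicalize both lanes of a <2 x half> in place. SSE/AVX
; scalarize each lane through the libcalls; AVX512 converts lane by lane with
; vcvtph2ps/vcvtps2ph and reassembles the result with vpunpcklwd.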
define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind {
; SSE-LABEL: v_test_canonicalize_v2half:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pushq %rbx
; SSE-NEXT:    subq $48, %rsp
; SSE-NEXT:    movq %rdi, %rbx
; SSE-NEXT:    pinsrw $0, 2(%rdi), %xmm0
; SSE-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    pinsrw $0, (%rdi), %xmm0
; SSE-NEXT:    callq __extendhfsf2@PLT
; SSE-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE-NEXT:    pinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    callq __extendhfsf2@PLT
; SSE-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    callq __truncsfhf2@PLT
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    callq __extendhfsf2@PLT
; SSE-NEXT:    mulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE-NEXT:    callq __truncsfhf2@PLT
; SSE-NEXT:    pextrw $0, %xmm0, %eax
; SSE-NEXT:    movw %ax, 2(%rbx)
; SSE-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    pextrw $0, %xmm0, %eax
; SSE-NEXT:    movw %ax, (%rbx)
; SSE-NEXT:    addq $48, %rsp
; SSE-NEXT:    popq %rbx
; SSE-NEXT:    retq
;
; AVX-LABEL: v_test_canonicalize_v2half:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $48, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vpinsrw $0, 2(%rdi), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; AVX-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; AVX-NEXT:    vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    callq __extendhfsf2@PLT
; AVX-NEXT:    vmulss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
; AVX-NEXT:    callq __truncsfhf2@PLT
; AVX-NEXT:    vpextrw $0, %xmm0, 2(%rbx)
; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT:    vpextrw $0, %xmm0, (%rbx)
; AVX-NEXT:    addq $48, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
;
; AVX512-LABEL: v_test_canonicalize_v2half:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512-NEXT:    movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
; AVX512-NEXT:    vmovd %eax, %xmm1
; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm2
; AVX512-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; AVX512-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
; AVX512-NEXT:    vmovd %xmm2, %eax
; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512-NEXT:    vmovd %xmm0, (%rdi)
; AVX512-NEXT:    retq
entry:
  %val = load <2 x half>, <2 x half> addrspace(1)* %out
  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
  store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
  ret void
}

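; Declarations for the canonicalize intrinsics used above (harmless if the
; parser already declares these intrinsics implicitly).
declare half @llvm.canonicalize.f16(half)
declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>)
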
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX1: {{.*}}
; AVX2: {{.*}}
; AVX512BW: {{.*}}
; AVX512F: {{.*}}