; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,BF16
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,FP16
; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert,f16c | FileCheck %s --check-prefixes=CHECK,AVX,BF16,AVXNC

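; Scalar bfloat add with memory operands: each input is extended to f32
; (movzwl + shll $16 + movd), added as float, then truncated back to bfloat
; with vcvtneps2bf16 where available, or the __truncsfbf2 libcall on plain SSE2.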
define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-LABEL: add:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movzwl (%edx), %edx
; X86-NEXT:    shll $16, %edx
; X86-NEXT:    vmovd %edx, %xmm0
; X86-NEXT:    movzwl (%ecx), %ecx
; X86-NEXT:    shll $16, %ecx
; X86-NEXT:    vmovd %ecx, %xmm1
; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vpextrw $0, %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE2-LABEL: add:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rdx, %rbx
; SSE2-NEXT:    movzwl (%rsi), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movzwl (%rdi), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; F16-LABEL: add:
; F16:       # %bb.0:
; F16-NEXT:    movzwl (%rsi), %eax
; F16-NEXT:    shll $16, %eax
; F16-NEXT:    vmovd %eax, %xmm0
; F16-NEXT:    movzwl (%rdi), %eax
; F16-NEXT:    shll $16, %eax
; F16-NEXT:    vmovd %eax, %xmm1
; F16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; F16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; F16-NEXT:    vpextrw $0, %xmm0, (%rdx)
; F16-NEXT:    retq
;
; AVXNC-LABEL: add:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    movzwl (%rsi), %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    movzwl (%rdi), %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm1
; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vpextrw $0, %xmm0, (%rdx)
; AVXNC-NEXT:    retq
  %a = load bfloat, ptr %pa
  %b = load bfloat, ptr %pb
  %add = fadd bfloat %a, %b
  store bfloat %add, ptr %pc
  ret void
}

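; Same addition, but with the operands passed in XMM registers instead of memory.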
define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; X86-LABEL: add2:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm1
; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    vmovw %eax, %xmm0
; X86-NEXT:    retl
;
; SSE2-LABEL: add2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    pextrw $0, %xmm1, %ecx
; SSE2-NEXT:    shll $16, %ecx
; SSE2-NEXT:    movd %ecx, %xmm1
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; FP16-LABEL: add2:
; FP16:       # %bb.0:
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    vmovw %xmm1, %ecx
; FP16-NEXT:    shll $16, %ecx
; FP16-NEXT:    vmovd %ecx, %xmm0
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm1
; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    vmovw %eax, %xmm0
; FP16-NEXT:    retq
;
; AVXNC-LABEL: add2:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vpextrw $0, %xmm0, %eax
; AVXNC-NEXT:    vpextrw $0, %xmm1, %ecx
; AVXNC-NEXT:    shll $16, %ecx
; AVXNC-NEXT:    vmovd %ecx, %xmm0
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm1
; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vmovd %xmm0, %eax
; AVXNC-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVXNC-NEXT:    retq
  %add = fadd bfloat %a, %b
  ret bfloat %add
}

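; double -> bfloat has no direct instruction on any of these targets, so the
; fptrunc always goes through the __truncdfbf2 libcall; the bfloat result is
; then re-extended (shll $16) and widened back to double with cvtss2sd.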
define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-LABEL: add_double:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovw %xmm0, %edi
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    shll $16, %edi
; X86-NEXT:    vmovd %edi, %xmm1
; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esi)
; X86-NEXT:    addl $16, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; SSE2-LABEL: add_double:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbp
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rdx, %rbx
; SSE2-NEXT:    movq %rsi, %r14
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movd %ebp, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    cvtss2sd %xmm0, %xmm0
; SSE2-NEXT:    movsd %xmm0, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    popq %rbp
; SSE2-NEXT:    retq
;
; FP16-LABEL: add_double:
; FP16:       # %bb.0:
; FP16-NEXT:    pushq %rbp
; FP16-NEXT:    pushq %r14
; FP16-NEXT:    pushq %rbx
; FP16-NEXT:    movq %rdx, %rbx
; FP16-NEXT:    movq %rsi, %r14
; FP16-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovw %xmm0, %ebp
; FP16-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm0
; FP16-NEXT:    shll $16, %ebp
; FP16-NEXT:    vmovd %ebp, %xmm1
; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm0
; FP16-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; FP16-NEXT:    vmovsd %xmm0, (%rbx)
; FP16-NEXT:    popq %rbx
; FP16-NEXT:    popq %r14
; FP16-NEXT:    popq %rbp
; FP16-NEXT:    retq
;
; AVXNC-LABEL: add_double:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    pushq %rbp
; AVXNC-NEXT:    pushq %r14
; AVXNC-NEXT:    pushq %rbx
; AVXNC-NEXT:    movq %rdx, %rbx
; AVXNC-NEXT:    movq %rsi, %r14
; AVXNC-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vpextrw $0, %xmm0, %ebp
; AVXNC-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vpextrw $0, %xmm0, %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    shll $16, %ebp
; AVXNC-NEXT:    vmovd %ebp, %xmm1
; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vmovd %xmm0, %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVXNC-NEXT:    vmovsd %xmm0, (%rbx)
; AVXNC-NEXT:    popq %rbx
; AVXNC-NEXT:    popq %r14
; AVXNC-NEXT:    popq %rbp
; AVXNC-NEXT:    retq
  %la = load double, ptr %pa
  %a = fptrunc double %la to bfloat
  %lb = load double, ptr %pb
  %b = fptrunc double %lb to bfloat
  %add = fadd bfloat %a, %b
  %dadd = fpext bfloat %add to double
  store double %dadd, ptr %pc
  ret void
}

define double @add_double2(double %da, double %db) nounwind {
; X86-LABEL: add_double2:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $24, %esp
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovw %xmm0, %esi
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    shll $16, %esi
; X86-NEXT:    vmovd %esi, %xmm1
; X86-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    fldl {{[0-9]+}}(%esp)
; X86-NEXT:    addl $24, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; SSE2-LABEL: add_double2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    subq $16, %rsp
; SSE2-NEXT:    movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd %ebx, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    cvtss2sd %xmm0, %xmm0
; SSE2-NEXT:    addq $16, %rsp
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; FP16-LABEL: add_double2:
; FP16:       # %bb.0:
; FP16-NEXT:    pushq %rbx
; FP16-NEXT:    subq $16, %rsp
; FP16-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovw %xmm0, %ebx
; FP16-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; FP16-NEXT:    # xmm0 = mem[0],zero
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm0
; FP16-NEXT:    shll $16, %ebx
; FP16-NEXT:    vmovd %ebx, %xmm1
; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; FP16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm0
; FP16-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; FP16-NEXT:    addq $16, %rsp
; FP16-NEXT:    popq %rbx
; FP16-NEXT:    retq
;
; AVXNC-LABEL: add_double2:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    pushq %rbx
; AVXNC-NEXT:    subq $16, %rsp
; AVXNC-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vpextrw $0, %xmm0, %ebx
; AVXNC-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
; AVXNC-NEXT:    # xmm0 = mem[0],zero
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vpextrw $0, %xmm0, %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    shll $16, %ebx
; AVXNC-NEXT:    vmovd %ebx, %xmm1
; AVXNC-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vmovd %xmm0, %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
; AVXNC-NEXT:    addq $16, %rsp
; AVXNC-NEXT:    popq %rbx
; AVXNC-NEXT:    retq
  %a = fptrunc double %da to bfloat
  %b = fptrunc double %db to bfloat
  %add = fadd bfloat %a, %b
  %dadd = fpext bfloat %add to double
  ret double %dadd
}

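; Adding a constant: the bfloat 1.0 is pre-extended to f32 at compile time and
; folded into the add as a constant-pool operand.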
define void @add_constant(ptr %pa, ptr %pc) nounwind {
; X86-LABEL: add_constant:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzwl (%ecx), %ecx
; X86-NEXT:    shll $16, %ecx
; X86-NEXT:    vmovd %ecx, %xmm0
; X86-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vpextrw $0, %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE2-LABEL: add_constant:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %rsi, %rbx
; SSE2-NEXT:    movzwl (%rdi), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rbx)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    retq
;
; F16-LABEL: add_constant:
; F16:       # %bb.0:
; F16-NEXT:    movzwl (%rdi), %eax
; F16-NEXT:    shll $16, %eax
; F16-NEXT:    vmovd %eax, %xmm0
; F16-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; F16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; F16-NEXT:    vpextrw $0, %xmm0, (%rsi)
; F16-NEXT:    retq
;
; AVXNC-LABEL: add_constant:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    movzwl (%rdi), %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVXNC-NEXT:    retq
  %a = load bfloat, ptr %pa
  %add = fadd bfloat %a, 1.0
  store bfloat %add, ptr %pc
  ret void
}

define bfloat @add_constant2(bfloat %a) nounwind {
; X86-LABEL: add_constant2:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    vmovw %eax, %xmm0
; X86-NEXT:    retl
;
; SSE2-LABEL: add_constant2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; FP16-LABEL: add_constant2:
; FP16:       # %bb.0:
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm0
; FP16-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; FP16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    vmovw %eax, %xmm0
; FP16-NEXT:    retq
;
; AVXNC-LABEL: add_constant2:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vpextrw $0, %xmm0, %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vmovd %xmm0, %eax
; AVXNC-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
; AVXNC-NEXT:    retq
  %add = fadd bfloat %a, 1.0
  ret bfloat %add
}

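; Storing a constant bfloat folds to a 16-bit immediate store (1.0 == 0x3F80).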
define void @store_constant(ptr %pc) nounwind {
; X86-LABEL: store_constant:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movw $16256, (%eax) # imm = 0x3F80
; X86-NEXT:    retl
;
; CHECK-LABEL: store_constant:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movw $16256, (%rdi) # imm = 0x3F80
; CHECK-NEXT:    retq
  store bfloat 1.0, ptr %pc
  ret void
}

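; An fpext immediately followed by an fptrunc back to bfloat is a no-op and
; folds away; only the 16-bit load/store (or nothing at all) remains.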
define void @fold_ext_trunc(ptr %pa, ptr %pc) nounwind {
; X86-LABEL: fold_ext_trunc:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzwl (%ecx), %ecx
; X86-NEXT:    movw %cx, (%eax)
; X86-NEXT:    retl
;
; CHECK-LABEL: fold_ext_trunc:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movzwl (%rdi), %eax
; CHECK-NEXT:    movw %ax, (%rsi)
; CHECK-NEXT:    retq
  %a = load bfloat, ptr %pa
  %ext = fpext bfloat %a to float
  %trunc = fptrunc float %ext to bfloat
  store bfloat %trunc, ptr %pc
  ret void
}

define bfloat @fold_ext_trunc2(bfloat %a) nounwind {
; X86-LABEL: fold_ext_trunc2:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    retl
;
; CHECK-LABEL: fold_ext_trunc2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %ext = fpext bfloat %a to float
  %trunc = fptrunc float %ext to bfloat
  ret bfloat %trunc
}

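; Vector bfloat add: the <8 x bfloat> inputs are zero-extended to 32-bit lanes
; and shifted left by 16 to form <8 x float>, added, and narrowed again with
; vcvtneps2bf16; plain SSE2 instead scalarizes through __truncsfbf2 calls.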
define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; X86-LABEL: addv:
; X86:       # %bb.0:
; X86-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X86-NEXT:    vpslld $16, %ymm1, %ymm1
; X86-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT:    vpslld $16, %ymm0, %ymm0
; X86-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vcvtneps2bf16 %ymm0, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; SSE2-LABEL: addv:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbp
; SSE2-NEXT:    pushq %r15
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %r13
; SSE2-NEXT:    pushq %r12
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    shrq $48, %rax
; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    movq %xmm1, %rdx
; SSE2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $48, %rax
; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    shrq $32, %rax
; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $32, %rax
; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    movq %xmm0, %r15
; SSE2-NEXT:    movq %r15, %rbx
; SSE2-NEXT:    shrq $48, %rbx
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1,1]
; SSE2-NEXT:    movq %xmm1, %r14
; SSE2-NEXT:    movq %r14, %rbp
; SSE2-NEXT:    shrq $48, %rbp
; SSE2-NEXT:    movq %r15, %r12
; SSE2-NEXT:    shrq $32, %r12
; SSE2-NEXT:    movq %r14, %r13
; SSE2-NEXT:    shrq $32, %r13
; SSE2-NEXT:    movl %r14d, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movl %r15d, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    andl $-65536, %r14d # imm = 0xFFFF0000
; SSE2-NEXT:    movd %r14d, %xmm1
; SSE2-NEXT:    andl $-65536, %r15d # imm = 0xFFFF0000
; SSE2-NEXT:    movd %r15d, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %r15d
; SSE2-NEXT:    shll $16, %r15d
; SSE2-NEXT:    addl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
; SSE2-NEXT:    shll $16, %r13d
; SSE2-NEXT:    movd %r13d, %xmm1
; SSE2-NEXT:    shll $16, %r12d
; SSE2-NEXT:    movd %r12d, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movd %ebp, %xmm1
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd %ebx, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    orl %r14d, %ebx
; SSE2-NEXT:    shlq $32, %rbx
; SSE2-NEXT:    orq %r15, %rbx
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
; SSE2-NEXT:    movl %r15d, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
; SSE2-NEXT:    movl %r14d, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %ebp
; SSE2-NEXT:    movq %r15, %rax
; SSE2-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movq %r14, %rax
; SSE2-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %r14d
; SSE2-NEXT:    shll $16, %r14d
; SSE2-NEXT:    orl %ebp, %r14d
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %ebp
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    orl %ebp, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movq %rbx, %xmm1
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r12
; SSE2-NEXT:    popq %r13
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    popq %r15
; SSE2-NEXT:    popq %rbp
; SSE2-NEXT:    retq
;
; F16-LABEL: addv:
; F16:       # %bb.0:
; F16-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; F16-NEXT:    vpslld $16, %ymm1, %ymm1
; F16-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; F16-NEXT:    vpslld $16, %ymm0, %ymm0
; F16-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; F16-NEXT:    vcvtneps2bf16 %ymm0, %xmm0
; F16-NEXT:    vzeroupper
; F16-NEXT:    retq
;
; AVXNC-LABEL: addv:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVXNC-NEXT:    vpslld $16, %ymm1, %ymm1
; AVXNC-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT:    vpslld $16, %ymm0, %ymm0
; AVXNC-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT:    vzeroupper
; AVXNC-NEXT:    retq
  %add = fadd <8 x bfloat> %a, %b
  ret <8 x bfloat> %add
}

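; Building a <2 x bfloat> from two scalars (pr62997).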
define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) {
; X86-LABEL: pr62997:
; X86:       # %bb.0:
; X86-NEXT:    vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vmovsh {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT:    retl
;
; SSE2-LABEL: pr62997:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; BF16-LABEL: pr62997:
; BF16:       # %bb.0:
; BF16-NEXT:    vpextrw $0, %xmm1, %eax
; BF16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; BF16-NEXT:    retq
;
; FP16-LABEL: pr62997:
; FP16:       # %bb.0:
; FP16-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; FP16-NEXT:    retq
  %1 = insertelement <2 x bfloat> undef, bfloat %a, i64 0
  %2 = insertelement <2 x bfloat> %1, bfloat %b, i64 1
  ret <2 x bfloat> %2
}

define <32 x bfloat> @pr63017() {
; X86-LABEL: pr63017:
; X86:       # %bb.0:
; X86-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT:    retl
;
; SSE2-LABEL: pr63017:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm0, %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    xorps %xmm2, %xmm2
; SSE2-NEXT:    xorps %xmm3, %xmm3
; SSE2-NEXT:    retq
;
; F16-LABEL: pr63017:
; F16:       # %bb.0:
; F16-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; F16-NEXT:    retq
;
; AVXNC-LABEL: pr63017:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVXNC-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVXNC-NEXT:    retq
  ret <32 x bfloat> zeroinitializer
}

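; Masked load with a splat passthru of bfloat -1.0 (0xBF80) (pr63017); SSE2 has
; no masked load, so the whole sequence is scalarized through __truncsfbf2.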
define <32 x bfloat> @pr63017_2() nounwind {
; X86-LABEL: pr63017_2:
; X86:       # %bb.0:
; X86-NEXT:    vpbroadcastw {{.*#+}} zmm0 = [-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0]
; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT:    retl
;
; SSE2-LABEL: pr63017_2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorl %eax, %eax
; SSE2-NEXT:    testb %al, %al
; SSE2-NEXT:    jne .LBB12_1
; SSE2-NEXT:  # %bb.2: # %cond.load
; SSE2-NEXT:    movzwl (%rax), %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    jmp .LBB12_3
; SSE2-NEXT:  .LBB12_1:
; SSE2-NEXT:    movd {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE2-NEXT:  .LBB12_3:
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    subq $88, %rsp
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebx, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE2-NEXT:    addq $88, %rsp
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    retq
;
; FP16-LABEL: pr63017_2:
; FP16:       # %bb.0:
; FP16-NEXT:    vpbroadcastw {{.*#+}} zmm0 = [-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0]
; FP16-NEXT:    vmovdqu16 (%rax), %zmm0 {%k1}
; FP16-NEXT:    retq
;
; AVXNC-LABEL: pr63017_2:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vbroadcastss {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
; AVXNC-NEXT:    xorl %eax, %eax
; AVXNC-NEXT:    testb %al, %al
; AVXNC-NEXT:    jne .LBB12_2
; AVXNC-NEXT:  # %bb.1: # %cond.load
; AVXNC-NEXT:    vmovups (%rax), %ymm0
; AVXNC-NEXT:  .LBB12_2:
; AVXNC-NEXT:    vmovaps %ymm0, %ymm1
; AVXNC-NEXT:    retq
  %1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> <bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80, bfloat 0xRBF80>)
  ret <32 x bfloat> %1
}

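; Inserting a scalar into element 1 of a <32 x bfloat> (pr62997).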
define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) {
; X86-LABEL: pr62997_3:
; X86:       # %bb.0:
; X86-NEXT:    vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
; X86-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; X86-NEXT:    retl
;
; SSE2-LABEL: pr62997_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; SSE2-NEXT:    andq %rax, %rcx
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    pextrw $0, %xmm4, %edx
; SSE2-NEXT:    shll $16, %edx
; SSE2-NEXT:    orl %eax, %edx
; SSE2-NEXT:    orq %rcx, %rdx
; SSE2-NEXT:    movq %rdx, %xmm4
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT:    retq
;
; FP16-LABEL: pr62997_3:
; FP16:       # %bb.0:
; FP16-NEXT:    vmovw %xmm1, %eax
; FP16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm1
; FP16-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; FP16-NEXT:    retq
;
; AVXNC-LABEL: pr62997_3:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vpextrw $0, %xmm2, %eax
; AVXNC-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm2
; AVXNC-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVXNC-NEXT:    retq
  %3 = insertelement <32 x bfloat> %0, bfloat %1, i64 1
  ret <32 x bfloat> %3
}

declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>)

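; fpext from bfloat is a pure bit operation: zero-extend each 16-bit element
; into a 32-bit lane and shift left by 16 (pr64460).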
define <4 x float> @pr64460_1(<4 x bfloat> %a) {
; X86-LABEL: pr64460_1:
; X86:       # %bb.0:
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT:    retl
;
; SSE2-LABEL: pr64460_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: pr64460_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    retq
  %b = fpext <4 x bfloat> %a to <4 x float>
  ret <4 x float> %b
}

define <8 x float> @pr64460_2(<8 x bfloat> %a) {
; X86-LABEL: pr64460_2:
; X86:       # %bb.0:
; X86-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT:    vpslld $16, %ymm0, %ymm0
; X86-NEXT:    retl
;
; SSE2-LABEL: pr64460_2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: pr64460_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    vpslld $16, %ymm0, %ymm0
; AVX-NEXT:    retq
  %b = fpext <8 x bfloat> %a to <8 x float>
  ret <8 x float> %b
}

define <16 x float> @pr64460_3(<16 x bfloat> %a) {
; X86-LABEL: pr64460_3:
; X86:       # %bb.0:
; X86-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; X86-NEXT:    vpslld $16, %zmm0, %zmm0
; X86-NEXT:    retl
;
; SSE2-LABEL: pr64460_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT:    movdqa %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm4, %xmm1
; SSE2-NEXT:    retq
;
; F16-LABEL: pr64460_3:
; F16:       # %bb.0:
; F16-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; F16-NEXT:    vpslld $16, %zmm0, %zmm0
; F16-NEXT:    retq
;
; AVXNC-LABEL: pr64460_3:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT:    vpslld $16, %ymm1, %ymm2
; AVXNC-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVXNC-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT:    vpslld $16, %ymm0, %ymm1
; AVXNC-NEXT:    vmovdqa %ymm2, %ymm0
; AVXNC-NEXT:    retq
  %b = fpext <16 x bfloat> %a to <16 x float>
  ret <16 x float> %b
}

define <8 x double> @pr64460_4(<8 x bfloat> %a) {
; X86-LABEL: pr64460_4:
; X86:       # %bb.0:
; X86-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT:    vpslld $16, %ymm0, %ymm0
; X86-NEXT:    vcvtps2pd %ymm0, %zmm0
; X86-NEXT:    retl
;
; SSE2-LABEL: pr64460_4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    cvtps2pd %xmm1, %xmm4
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT:    cvtps2pd %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT:    cvtps2pd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT:    cvtps2pd %xmm0, %xmm3
; SSE2-NEXT:    movaps %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; F16-LABEL: pr64460_4:
; F16:       # %bb.0:
; F16-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; F16-NEXT:    vpslld $16, %ymm0, %ymm0
; F16-NEXT:    vcvtps2pd %ymm0, %zmm0
; F16-NEXT:    retq
;
; AVXNC-LABEL: pr64460_4:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVXNC-NEXT:    vpslld $16, %ymm0, %ymm1
; AVXNC-NEXT:    vcvtps2pd %xmm1, %ymm0
; AVXNC-NEXT:    vextracti128 $1, %ymm1, %xmm1
; AVXNC-NEXT:    vcvtps2pd %xmm1, %ymm1
; AVXNC-NEXT:    retq
  %b = fpext <8 x bfloat> %a to <8 x double>
  ret <8 x double> %b
}

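; fptrunc from float vectors maps directly onto vcvtneps2bf16 where the
; conversion instructions are available; SSE2 again scalarizes via libcalls.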
1187define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind {
1188; X86-LABEL: fptrunc_v4f32:
1189; X86:       # %bb.0:
1190; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1191; X86-NEXT:    vcvtneps2bf16 %ymm0, %xmm0
1192; X86-NEXT:    vzeroupper
1193; X86-NEXT:    retl
1194;
1195; SSE2-LABEL: fptrunc_v4f32:
1196; SSE2:       # %bb.0:
1197; SSE2-NEXT:    subq $72, %rsp
1198; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
1199; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
1200; SSE2-NEXT:    callq __truncsfbf2@PLT
1201; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1202; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
1203; SSE2-NEXT:    callq __truncsfbf2@PLT
1204; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1205; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
1206; SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
1207; SSE2-NEXT:    callq __truncsfbf2@PLT
1208; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1209; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
1210; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1211; SSE2-NEXT:    callq __truncsfbf2@PLT
1212; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1213; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1214; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1215; SSE2-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1216; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1217; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1218; SSE2-NEXT:    addq $72, %rsp
1219; SSE2-NEXT:    retq
1220;
1221; F16-LABEL: fptrunc_v4f32:
1222; F16:       # %bb.0:
1223; F16-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1224; F16-NEXT:    vcvtneps2bf16 %ymm0, %xmm0
1225; F16-NEXT:    vzeroupper
1226; F16-NEXT:    retq
1227;
1228; AVXNC-LABEL: fptrunc_v4f32:
1229; AVXNC:       # %bb.0:
1230; AVXNC-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1231; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0
1232; AVXNC-NEXT:    vzeroupper
1233; AVXNC-NEXT:    retq
1234  %b = fptrunc <4 x float> %a to <4 x bfloat>
1235  ret <4 x bfloat> %b
1236}
1237
1238define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind {
1239; X86-LABEL: fptrunc_v8f32:
1240; X86:       # %bb.0:
1241; X86-NEXT:    vcvtneps2bf16 %ymm0, %xmm0
1242; X86-NEXT:    vzeroupper
1243; X86-NEXT:    retl
1244;
1245; SSE2-LABEL: fptrunc_v8f32:
1246; SSE2:       # %bb.0:
1247; SSE2-NEXT:    pushq %rbp
1248; SSE2-NEXT:    pushq %r14
1249; SSE2-NEXT:    pushq %rbx
1250; SSE2-NEXT:    subq $32, %rsp
1251; SSE2-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
1252; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1253; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
1254; SSE2-NEXT:    callq __truncsfbf2@PLT
1255; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
1256; SSE2-NEXT:    shll $16, %ebx
1257; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1258; SSE2-NEXT:    callq __truncsfbf2@PLT
1259; SSE2-NEXT:    pextrw $0, %xmm0, %eax
1260; SSE2-NEXT:    movzwl %ax, %r14d
1261; SSE2-NEXT:    orl %ebx, %r14d
1262; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1263; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1264; SSE2-NEXT:    callq __truncsfbf2@PLT
1265; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
1266; SSE2-NEXT:    shll $16, %ebp
1267; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1268; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
1269; SSE2-NEXT:    callq __truncsfbf2@PLT
1270; SSE2-NEXT:    pextrw $0, %xmm0, %eax
1271; SSE2-NEXT:    movzwl %ax, %ebx
1272; SSE2-NEXT:    orl %ebp, %ebx
1273; SSE2-NEXT:    shlq $32, %rbx
1274; SSE2-NEXT:    orq %r14, %rbx
1275; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
1276; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
1277; SSE2-NEXT:    callq __truncsfbf2@PLT
1278; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
1279; SSE2-NEXT:    shll $16, %ebp
1280; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
1281; SSE2-NEXT:    callq __truncsfbf2@PLT
1282; SSE2-NEXT:    pextrw $0, %xmm0, %eax
1283; SSE2-NEXT:    movzwl %ax, %r14d
1284; SSE2-NEXT:    orl %ebp, %r14d
1285; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
1286; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1287; SSE2-NEXT:    callq __truncsfbf2@PLT
1288; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
1289; SSE2-NEXT:    shll $16, %ebp
1290; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
1291; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
1292; SSE2-NEXT:    callq __truncsfbf2@PLT
1293; SSE2-NEXT:    pextrw $0, %xmm0, %eax
1294; SSE2-NEXT:    movzwl %ax, %eax
1295; SSE2-NEXT:    orl %ebp, %eax
1296; SSE2-NEXT:    shlq $32, %rax
1297; SSE2-NEXT:    orq %r14, %rax
1298; SSE2-NEXT:    movq %rax, %xmm1
1299; SSE2-NEXT:    movq %rbx, %xmm0
1300; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1301; SSE2-NEXT:    addq $32, %rsp
1302; SSE2-NEXT:    popq %rbx
1303; SSE2-NEXT:    popq %r14
1304; SSE2-NEXT:    popq %rbp
1305; SSE2-NEXT:    retq
1306;
1307; F16-LABEL: fptrunc_v8f32:
1308; F16:       # %bb.0:
1309; F16-NEXT:    vcvtneps2bf16 %ymm0, %xmm0
1310; F16-NEXT:    vzeroupper
1311; F16-NEXT:    retq
1312;
1313; AVXNC-LABEL: fptrunc_v8f32:
1314; AVXNC:       # %bb.0:
1315; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0
1316; AVXNC-NEXT:    vzeroupper
1317; AVXNC-NEXT:    retq
1318  %b = fptrunc <8 x float> %a to <8 x bfloat>
1319  ret <8 x bfloat> %b
1320}
1321
define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
; X86-LABEL: fptrunc_v16f32:
; X86:       # %bb.0:
; X86-NEXT:    vcvtneps2bf16 %zmm0, %ymm0
; X86-NEXT:    retl
;
; SSE2-LABEL: fptrunc_v16f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbp
; SSE2-NEXT:    pushq %r15
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %r12
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    subq $64, %rsp
; SSE2-NEXT:    movaps %xmm3, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %ebx
; SSE2-NEXT:    orl %ebp, %ebx
; SSE2-NEXT:    shlq $32, %rbx
; SSE2-NEXT:    orq %r14, %rbx
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r15d
; SSE2-NEXT:    orl %ebp, %r15d
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebp, %r14d
; SSE2-NEXT:    shlq $32, %r14
; SSE2-NEXT:    orq %r15, %r14
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r12d
; SSE2-NEXT:    orl %ebp, %r12d
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r15d
; SSE2-NEXT:    orl %ebp, %r15d
; SSE2-NEXT:    shlq $32, %r15
; SSE2-NEXT:    orq %r12, %r15
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r12d
; SSE2-NEXT:    orl %ebp, %r12d
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebp, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r12, %rax
; SSE2-NEXT:    movq %rax, %xmm1
; SSE2-NEXT:    movq %r15, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    movq %r14, %xmm2
; SSE2-NEXT:    movq %rbx, %xmm1
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT:    addq $64, %rsp
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r12
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    popq %r15
; SSE2-NEXT:    popq %rbp
; SSE2-NEXT:    retq
;
; F16-LABEL: fptrunc_v16f32:
; F16:       # %bb.0:
; F16-NEXT:    vcvtneps2bf16 %zmm0, %ymm0
; F16-NEXT:    retq
;
; AVXNC-LABEL: fptrunc_v16f32:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm0, %xmm0
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %ymm1, %xmm1
; AVXNC-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVXNC-NEXT:    retq
  %b = fptrunc <16 x float> %a to <16 x bfloat>
  ret <16 x bfloat> %b
}

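; fptrunc <8 x double> -> <8 x bfloat>: there is no direct double->bfloat
; conversion instruction, so every configuration scalarizes through the
; __truncdfbf2 libcall and reassembles the result vector with unpacks or
; vpinsrw.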
define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; X86-LABEL: fptrunc_v8f64:
; X86:       # %bb.0:
; X86-NEXT:    subl $204, %esp
; X86-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 64-byte Spill
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovlps %xmm0, (%esp)
; X86-NEXT:    vzeroupper
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovhps %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT:    vmovlps %xmm0, (%esp)
; X86-NEXT:    vzeroupper
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT:    vmovhps %xmm0, (%esp)
; X86-NEXT:    vzeroupper
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovlps %xmm0, (%esp)
; X86-NEXT:    vzeroupper
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovhps %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
; X86-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovlps %xmm0, (%esp)
; X86-NEXT:    vzeroupper
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovhps %xmm0, (%esp)
; X86-NEXT:    calll __truncdfbf2
; X86-NEXT:    vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT:    vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT:    vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; X86-NEXT:    vmovdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 16-byte Reload
; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; X86-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
; X86-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-NEXT:    addl $204, %esp
; X86-NEXT:    retl
;
; SSE2-LABEL: fptrunc_v8f64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbp
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    subq $64, %rsp
; SSE2-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebx
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebx, %r14d
; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %ebx
; SSE2-NEXT:    orl %ebp, %ebx
; SSE2-NEXT:    shlq $32, %rbx
; SSE2-NEXT:    orq %r14, %rbx
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %r14d
; SSE2-NEXT:    orl %ebp, %r14d
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %ebp
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __truncdfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    orl %ebp, %eax
; SSE2-NEXT:    shlq $32, %rax
; SSE2-NEXT:    orq %r14, %rax
; SSE2-NEXT:    movq %rax, %xmm1
; SSE2-NEXT:    movq %rbx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    addq $64, %rsp
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    popq %rbp
; SSE2-NEXT:    retq
;
; FP16-LABEL: fptrunc_v8f64:
; FP16:       # %bb.0:
; FP16-NEXT:    subq $184, %rsp
; FP16-NEXT:    vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; FP16-NEXT:    vextractf128 $1, %ymm0, %xmm0
; FP16-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; FP16-NEXT:    vzeroupper
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; FP16-NEXT:    # xmm0 = mem[1,0]
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; FP16-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; FP16-NEXT:    vzeroupper
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; FP16-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
; FP16-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
; FP16-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; FP16-NEXT:    vzeroupper
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; FP16-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; FP16-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; FP16-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; FP16-NEXT:    vzeroupper
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; FP16-NEXT:    callq __truncdfbf2@PLT
; FP16-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; FP16-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; FP16-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; FP16-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; FP16-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; FP16-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FP16-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; FP16-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; FP16-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; FP16-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
; FP16-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; FP16-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; FP16-NEXT:    addq $184, %rsp
; FP16-NEXT:    retq
;
; AVXNC-LABEL: fptrunc_v8f64:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    pushq %rbp
; AVXNC-NEXT:    pushq %r15
; AVXNC-NEXT:    pushq %r14
; AVXNC-NEXT:    pushq %r13
; AVXNC-NEXT:    pushq %r12
; AVXNC-NEXT:    pushq %rbx
; AVXNC-NEXT:    subq $168, %rsp
; AVXNC-NEXT:    vmovups %ymm1, (%rsp) # 32-byte Spill
; AVXNC-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVXNC-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVXNC-NEXT:    vzeroupper
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT:    # xmm0 = mem[1,0]
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVXNC-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVXNC-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVXNC-NEXT:    vzeroupper
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVXNC-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVXNC-NEXT:    vzeroupper
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT:    vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT:    # xmm0 = mem[1,0]
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
; AVXNC-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVXNC-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT:    vzeroupper
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVXNC-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVXNC-NEXT:    # xmm0 = mem[1,0]
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vpextrw $0, %xmm0, %ebx
; AVXNC-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVXNC-NEXT:    vpextrw $0, %xmm0, %ebp
; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT:    vpextrw $0, %xmm0, %r14d
; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT:    vpextrw $0, %xmm0, %r15d
; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT:    vpextrw $0, %xmm0, %r12d
; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT:    vpextrw $0, %xmm0, %r13d
; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT:    callq __truncdfbf2@PLT
; AVXNC-NEXT:    vpextrw $0, %xmm0, %eax
; AVXNC-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT:    vpinsrw $1, %r13d, %xmm0, %xmm0
; AVXNC-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVXNC-NEXT:    vpinsrw $3, %r12d, %xmm0, %xmm0
; AVXNC-NEXT:    vpinsrw $4, %r15d, %xmm0, %xmm0
; AVXNC-NEXT:    vpinsrw $5, %r14d, %xmm0, %xmm0
; AVXNC-NEXT:    vpinsrw $6, %ebp, %xmm0, %xmm0
; AVXNC-NEXT:    vpinsrw $7, %ebx, %xmm0, %xmm0
; AVXNC-NEXT:    addq $168, %rsp
; AVXNC-NEXT:    popq %rbx
; AVXNC-NEXT:    popq %r12
; AVXNC-NEXT:    popq %r13
; AVXNC-NEXT:    popq %r14
; AVXNC-NEXT:    popq %r15
; AVXNC-NEXT:    popq %rbp
; AVXNC-NEXT:    retq
  %b = fptrunc <8 x double> %a to <8 x bfloat>
  ret <8 x bfloat> %b
}

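; Shuffle that repeats a loaded <8 x bfloat> four times; AVX512 targets fold
; the load into vbroadcastf32x4, AVXNECONVERT uses vbroadcastf128.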
define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) {
; X86-LABEL: test_v8bf16_v32bf16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-NEXT:    retl
;
; SSE2-LABEL: test_v8bf16_v32bf16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    retq
;
; F16-LABEL: test_v8bf16_v32bf16:
; F16:       # %bb.0:
; F16-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; F16-NEXT:    retq
;
; AVXNC-LABEL: test_v8bf16_v32bf16:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVXNC-NEXT:    vmovaps %ymm0, %ymm1
; AVXNC-NEXT:    retq
  %2 = load <8 x bfloat>, ptr %0, align 16
  %3 = shufflevector <8 x bfloat> %2, <8 x bfloat> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x bfloat> %3
}

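; Concatenating two <8 x bfloat> halves is a single vinsertf128 on AVX
; targets; on SSE2 the inputs already sit in the result registers.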
define <16 x bfloat> @concat_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; X86-LABEL: concat_v8bf16:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; SSE2-LABEL: concat_v8bf16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    retq
;
; AVX-LABEL: concat_v8bf16:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x bfloat> %a
}

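; Extracting the upper <8 x bfloat> of a <32 x bfloat> is vextractf128 on AVX
; targets; SSE2 rebuilds the subvector from pextrw pieces.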
define <8 x bfloat> @extract_v32bf16_v8bf16(<32 x bfloat> %x) {
; X86-LABEL: extract_v32bf16_v8bf16:
; X86:       # %bb.0:
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; SSE2-LABEL: extract_v32bf16_v8bf16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pextrw $0, %xmm1, %eax
; SSE2-NEXT:    pextrw $1, %xmm1, %ecx
; SSE2-NEXT:    shll $16, %ecx
; SSE2-NEXT:    orl %eax, %ecx
; SSE2-NEXT:    pextrw $2, %xmm1, %eax
; SSE2-NEXT:    pextrw $3, %xmm1, %edx
; SSE2-NEXT:    shll $16, %edx
; SSE2-NEXT:    orl %eax, %edx
; SSE2-NEXT:    shlq $32, %rdx
; SSE2-NEXT:    orq %rcx, %rdx
; SSE2-NEXT:    pextrw $4, %xmm1, %eax
; SSE2-NEXT:    pextrw $5, %xmm1, %ecx
; SSE2-NEXT:    shll $16, %ecx
; SSE2-NEXT:    orl %eax, %ecx
; SSE2-NEXT:    pextrw $6, %xmm1, %eax
; SSE2-NEXT:    pextrw $7, %xmm1, %esi
; SSE2-NEXT:    shll $16, %esi
; SSE2-NEXT:    orl %eax, %esi
; SSE2-NEXT:    shlq $32, %rsi
; SSE2-NEXT:    orq %rcx, %rsi
; SSE2-NEXT:    movq %rsi, %xmm1
; SSE2-NEXT:    movq %rdx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; AVX-LABEL: extract_v32bf16_v8bf16:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %a = shufflevector <32 x bfloat> %x, <32 x bfloat> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x bfloat> %a
}

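; Concatenation with zeroinitializer only has to clear the upper half:
; VEX-encoded vmovaps zeroes the upper ymm bits, and SSE2 just zeroes the
; second result register.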
define <16 x bfloat> @concat_zero_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; X86-LABEL: concat_zero_v8bf16:
; X86:       # %bb.0:
; X86-NEXT:    vmovaps %xmm0, %xmm0
; X86-NEXT:    retl
;
; SSE2-LABEL: concat_zero_v8bf16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    retq
;
; AVX-LABEL: concat_zero_v8bf16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x bfloat> %x, <8 x bfloat> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x bfloat> %a
}

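; Duplicating the low 64 bits before concatenation becomes vmovddup (movlhps
; on SSE2) followed by the usual vinsertf128.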
define <16 x bfloat> @concat_dup_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) {
; X86-LABEL: concat_dup_v8bf16:
; X86:       # %bb.0:
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; SSE2-LABEL: concat_dup_v8bf16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; AVX-LABEL: concat_dup_v8bf16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x bfloat> %x, <8 x bfloat> %y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x bfloat> %a
}

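; Round-trip float -> bfloat -> float: the truncate uses vcvtneps2bf16 (or the
; __truncsfbf2 libcall on SSE2) and the extend is just a 16-bit left shift
; into the high half of a 32-bit lane.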
define float @trunc_ext(float %a) nounwind {
; X86-LABEL: trunc_ext:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; X86-NEXT:    vmovw %xmm0, %eax
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vmovd %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
;
; SSE2-LABEL: trunc_ext:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rax
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    popq %rax
; SSE2-NEXT:    retq
;
; FP16-LABEL: trunc_ext:
; FP16:       # %bb.0:
; FP16-NEXT:    vcvtneps2bf16 %xmm0, %xmm0
; FP16-NEXT:    vmovw %xmm0, %eax
; FP16-NEXT:    shll $16, %eax
; FP16-NEXT:    vmovd %eax, %xmm0
; FP16-NEXT:    retq
;
; AVXNC-LABEL: trunc_ext:
; AVXNC:       # %bb.0:
; AVXNC-NEXT:    {vex} vcvtneps2bf16 %xmm0, %xmm0
; AVXNC-NEXT:    vmovd %xmm0, %eax
; AVXNC-NEXT:    shll $16, %eax
; AVXNC-NEXT:    vmovd %eax, %xmm0
; AVXNC-NEXT:    retq
  %b = fptrunc float %a to bfloat
  %c = fpext bfloat %b to float
  ret float %c
}

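; PR92471: fpext of an odd-sized <7 x bfloat> load; the 14 bytes are
; assembled from narrower loads and the 28-byte <7 x float> store is split
; into 16-, 8- and 4-byte pieces.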
define void @PR92471(ptr %0, ptr %1) nounwind {
; X86-LABEL: PR92471:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, 4(%ecx), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, 8(%ecx), %xmm0, %xmm0
; X86-NEXT:    vpinsrw $6, 12(%ecx), %xmm0, %xmm0
; X86-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-NEXT:    vpslld $16, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpextrd $2, %xmm1, 24(%eax)
; X86-NEXT:    vpextrd $1, %xmm1, 20(%eax)
; X86-NEXT:    vmovd %xmm1, 16(%eax)
; X86-NEXT:    vmovdqu %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; SSE2-LABEL: PR92471:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    pinsrw $2, 12(%rdi), %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    movdqu %xmm2, (%rsi)
; SSE2-NEXT:    movq %xmm3, 16(%rsi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
; SSE2-NEXT:    movd %xmm0, 24(%rsi)
; SSE2-NEXT:    retq
;
; AVX-LABEL: PR92471:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vpinsrd $2, 8(%rdi), %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $6, 12(%rdi), %xmm0, %xmm0
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    vpslld $16, %ymm0, %ymm0
; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT:    vpextrd $2, %xmm1, 24(%rsi)
; AVX-NEXT:    vmovq %xmm1, 16(%rsi)
; AVX-NEXT:    vmovdqu %xmm0, (%rsi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %3 = load <7 x bfloat>, ptr %0, align 2
  %4 = fpext <7 x bfloat> %3 to <7 x float>
  store <7 x float> %4, ptr %1, align 4
  ret void
}

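; PR108936: x86_fp80 -> bfloat has no inline lowering; the value is staged
; through an x87 store and handed to the __truncxfbf2 libcall.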
define bfloat @PR108936(x86_fp80 %0) nounwind {
; X86-LABEL: PR108936:
; X86:       # %bb.0:
; X86-NEXT:    subl $12, %esp
; X86-NEXT:    fldt {{[0-9]+}}(%esp)
; X86-NEXT:    fstpt (%esp)
; X86-NEXT:    calll __truncxfbf2
; X86-NEXT:    addl $12, %esp
; X86-NEXT:    retl
;
; CHECK-LABEL: PR108936:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $24, %rsp
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fstpt (%rsp)
; CHECK-NEXT:    callq __truncxfbf2@PLT
; CHECK-NEXT:    addq $24, %rsp
; CHECK-NEXT:    retq
  %2 = fptrunc x86_fp80 %0 to bfloat
  ret bfloat %2
}

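; PR115710: fp128 -> bfloat likewise stays a libcall, __trunctfbf2, with the
; argument already in xmm0 on x86-64 and passed on the stack on i686.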
define bfloat @PR115710(fp128 %0) nounwind {
; X86-LABEL: PR115710:
; X86:       # %bb.0:
; X86-NEXT:    subl $28, %esp
; X86-NEXT:    vmovaps {{[0-9]+}}(%esp), %xmm0
; X86-NEXT:    vmovups %xmm0, (%esp)
; X86-NEXT:    calll __trunctfbf2
; X86-NEXT:    addl $28, %esp
; X86-NEXT:    retl
;
; CHECK-LABEL: PR115710:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rax
; CHECK-NEXT:    callq __trunctfbf2@PLT
; CHECK-NEXT:    popq %rax
; CHECK-NEXT:    retq
  %2 = fptrunc fp128 %0 to bfloat
  ret bfloat %2
}
