; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=CHECK,SSE,SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512

; Test codegen for under-aligned nontemporal vector stores.
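; Nontemporal vector store instructions (movntps/vmovntps) require their natural
; alignment, so an under-aligned nontemporal store cannot use them directly.
; Instead the store is split into the widest nontemporal stores the given
; alignment permits: scalar movntiq stores at align 1, and 128-bit or 256-bit
; chunks at align 16 or align 32.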

; XMM versions.
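; At align 1, each 128-bit store is scalarized into two 8-byte movntiq stores.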

define void @test_zero_v2f64_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v2f64_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    retq
  store <2 x double> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v4f32_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v4f32_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    retq
  store <4 x float> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v2i64_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v2i64_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    retq
  store <2 x i64> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v4i32_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v4i32_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    retq
  store <4 x i32> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v8i16_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v8i16_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    retq
  store <8 x i16> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v16i8_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v16i8_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    retq
  store <16 x i8> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

; YMM versions.
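; At align 1 each 256-bit store becomes four movntiq stores; at align 16 it is
; split into two 128-bit movntps stores.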

define void @test_zero_v4f64_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v4f64_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    movntiq %rax, 24(%rdi)
; CHECK-NEXT:    movntiq %rax, 16(%rdi)
; CHECK-NEXT:    retq
  store <4 x double> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v8f32_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v8f32_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    movntiq %rax, 24(%rdi)
; CHECK-NEXT:    movntiq %rax, 16(%rdi)
; CHECK-NEXT:    retq
  store <8 x float> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v4i64_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v4i64_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    movntiq %rax, 24(%rdi)
; CHECK-NEXT:    movntiq %rax, 16(%rdi)
; CHECK-NEXT:    retq
  store <4 x i64> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v8i32_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v8i32_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    movntiq %rax, 24(%rdi)
; CHECK-NEXT:    movntiq %rax, 16(%rdi)
; CHECK-NEXT:    retq
  store <8 x i32> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v16i16_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v16i16_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    movntiq %rax, 24(%rdi)
; CHECK-NEXT:    movntiq %rax, 16(%rdi)
; CHECK-NEXT:    retq
  store <16 x i16> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v32i8_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v32i8_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    movntiq %rax, 24(%rdi)
; CHECK-NEXT:    movntiq %rax, 16(%rdi)
; CHECK-NEXT:    retq
  store <32 x i8> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v4f64_align16(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v4f64_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v4f64_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v4f64_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <4 x double> zeroinitializer, ptr %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v8f32_align16(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v8f32_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8f32_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8f32_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <8 x float> zeroinitializer, ptr %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v4i64_align16(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v4i64_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v4i64_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v4i64_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <4 x i64> zeroinitializer, ptr %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v8i32_align16(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v8i32_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i32_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8i32_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <8 x i32> zeroinitializer, ptr %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v16i16_align16(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v16i16_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i16_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16i16_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <16 x i16> zeroinitializer, ptr %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v32i8_align16(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v32i8_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v32i8_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v32i8_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  store <32 x i8> zeroinitializer, ptr %dst, align 16, !nontemporal !1
  ret void
}

; ZMM versions.
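; At align 1 each 512-bit store becomes eight movntiq stores, at align 16 four
; 128-bit movntps stores, and at align 32 two 256-bit vmovntps stores where AVX
; is available (SSE targets still split into four 128-bit movntps stores).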

define void @test_zero_v8f64_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v8f64_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    movntiq %rax, 24(%rdi)
; CHECK-NEXT:    movntiq %rax, 16(%rdi)
; CHECK-NEXT:    movntiq %rax, 40(%rdi)
; CHECK-NEXT:    movntiq %rax, 32(%rdi)
; CHECK-NEXT:    movntiq %rax, 56(%rdi)
; CHECK-NEXT:    movntiq %rax, 48(%rdi)
; CHECK-NEXT:    retq
  store <8 x double> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v16f32_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v16f32_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    movntiq %rax, 24(%rdi)
; CHECK-NEXT:    movntiq %rax, 16(%rdi)
; CHECK-NEXT:    movntiq %rax, 40(%rdi)
; CHECK-NEXT:    movntiq %rax, 32(%rdi)
; CHECK-NEXT:    movntiq %rax, 56(%rdi)
; CHECK-NEXT:    movntiq %rax, 48(%rdi)
; CHECK-NEXT:    retq
  store <16 x float> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v8i64_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v8i64_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    movntiq %rax, 24(%rdi)
; CHECK-NEXT:    movntiq %rax, 16(%rdi)
; CHECK-NEXT:    movntiq %rax, 40(%rdi)
; CHECK-NEXT:    movntiq %rax, 32(%rdi)
; CHECK-NEXT:    movntiq %rax, 56(%rdi)
; CHECK-NEXT:    movntiq %rax, 48(%rdi)
; CHECK-NEXT:    retq
  store <8 x i64> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v16i32_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v16i32_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    movntiq %rax, 24(%rdi)
; CHECK-NEXT:    movntiq %rax, 16(%rdi)
; CHECK-NEXT:    movntiq %rax, 40(%rdi)
; CHECK-NEXT:    movntiq %rax, 32(%rdi)
; CHECK-NEXT:    movntiq %rax, 56(%rdi)
; CHECK-NEXT:    movntiq %rax, 48(%rdi)
; CHECK-NEXT:    retq
  store <16 x i32> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v32i16_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v32i16_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    movntiq %rax, 24(%rdi)
; CHECK-NEXT:    movntiq %rax, 16(%rdi)
; CHECK-NEXT:    movntiq %rax, 40(%rdi)
; CHECK-NEXT:    movntiq %rax, 32(%rdi)
; CHECK-NEXT:    movntiq %rax, 56(%rdi)
; CHECK-NEXT:    movntiq %rax, 48(%rdi)
; CHECK-NEXT:    retq
  store <32 x i16> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v64i8_align1(ptr %dst) nounwind {
; CHECK-LABEL: test_zero_v64i8_align1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movntiq %rax, 8(%rdi)
; CHECK-NEXT:    movntiq %rax, (%rdi)
; CHECK-NEXT:    movntiq %rax, 24(%rdi)
; CHECK-NEXT:    movntiq %rax, 16(%rdi)
; CHECK-NEXT:    movntiq %rax, 40(%rdi)
; CHECK-NEXT:    movntiq %rax, 32(%rdi)
; CHECK-NEXT:    movntiq %rax, 56(%rdi)
; CHECK-NEXT:    movntiq %rax, 48(%rdi)
; CHECK-NEXT:    retq
  store <64 x i8> zeroinitializer, ptr %dst, align 1, !nontemporal !1
  ret void
}

define void @test_zero_v8f64_align16(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v8f64_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8f64_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8f64_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <8 x double> zeroinitializer, ptr %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v16f32_align16(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v16f32_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16f32_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16f32_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <16 x float> zeroinitializer, ptr %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v8i64_align16(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v8i64_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i64_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8i64_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <8 x i64> zeroinitializer, ptr %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v16i32_align16(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v16i32_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i32_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16i32_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <16 x i32> zeroinitializer, ptr %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v32i16_align16(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v32i16_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v32i16_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v32i16_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <32 x i16> zeroinitializer, ptr %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v64i8_align16(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v64i8_align16:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v64i8_align16:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX-NEXT:    vmovntps %xmm0, (%rdi)
; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v64i8_align16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT:    retq
  store <64 x i8> zeroinitializer, ptr %dst, align 16, !nontemporal !1
  ret void
}

define void @test_zero_v8f64_align32(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v8f64_align32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8f64_align32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8f64_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <8 x double> zeroinitializer, ptr %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v16f32_align32(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v16f32_align32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16f32_align32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16f32_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <16 x float> zeroinitializer, ptr %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v8i64_align32(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v8i64_align32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v8i64_align32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v8i64_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <8 x i64> zeroinitializer, ptr %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v16i32_align32(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v16i32_align32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v16i32_align32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v16i32_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <16 x i32> zeroinitializer, ptr %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v32i16_align32(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v32i16_align32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v32i16_align32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v32i16_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <32 x i16> zeroinitializer, ptr %dst, align 32, !nontemporal !1
  ret void
}

define void @test_zero_v64i8_align32(ptr %dst) nounwind {
; SSE-LABEL: test_zero_v64i8_align32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movntps %xmm0, 48(%rdi)
; SSE-NEXT:    movntps %xmm0, 32(%rdi)
; SSE-NEXT:    movntps %xmm0, 16(%rdi)
; SSE-NEXT:    movntps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: test_zero_v64i8_align32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX-NEXT:    vmovntps %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_zero_v64i8_align32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  store <64 x i8> zeroinitializer, ptr %dst, align 32, !nontemporal !1
  ret void
}

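; The !nontemporal metadata must reference a node holding a single i32 operand
; with value 1.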
!1 = !{i32 1}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; SSE2: {{.*}}
; SSE41: {{.*}}
; SSE4A: {{.*}}