; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=X86,X86-SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64-SSE,X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=X64-SSE,X64-SSE4A
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X64-SSE,X64-SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefixes=X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64-AVX,X64-AVX2

;
; PR42123
;

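; These tests check how consecutive vector loads and stores carrying
; !nontemporal metadata are merged (or deliberately kept separate) depending on
; their alignment and on the available features (SSE2, SSE4A, SSE4.1, AVX,
; AVX2); see the comments on the individual tests below.
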
define void @merge_2_v4f32_align32(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movaps (%ecx), %xmm0
; X86-NEXT:    movaps 16(%ecx), %xmm1
; X86-NEXT:    movntps %xmm0, (%eax)
; X86-NEXT:    movntps %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align32:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movaps (%rdi), %xmm0
; X64-SSE2-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT:    movntps %xmm0, (%rsi)
; X64-SSE2-NEXT:    movntps %xmm1, 16(%rsi)
; X64-SSE2-NEXT:    retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align32:
; X64-SSE4A:       # %bb.0:
; X64-SSE4A-NEXT:    movaps (%rdi), %xmm0
; X64-SSE4A-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT:    movntps %xmm0, (%rsi)
; X64-SSE4A-NEXT:    movntps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT:    retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align32:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; X64-SSE41-NEXT:    movntdq %xmm0, (%rsi)
; X64-SSE41-NEXT:    movntdq %xmm1, 16(%rsi)
; X64-SSE41-NEXT:    retq
;
; X64-AVX1-LABEL: merge_2_v4f32_align32:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
; X64-AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
; X64-AVX1-NEXT:    vmovntdq %xmm0, (%rsi)
; X64-AVX1-NEXT:    vmovntdq %xmm1, 16(%rsi)
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: merge_2_v4f32_align32:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vmovntdqa (%rdi), %ymm0
; X64-AVX2-NEXT:    vmovntdq %ymm0, (%rsi)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 32, !nontemporal !0
  %3 = load <4 x float>, ptr %1, align 16, !nontemporal !0
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 32, !nontemporal !0
  store <4 x float> %3, ptr %4, align 16, !nontemporal !0
  ret void
}

; Don't merge nt and non-nt loads even if aligned.
define void @merge_2_v4f32_align32_mix_ntload(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32_mix_ntload:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movaps (%ecx), %xmm0
; X86-NEXT:    movaps 16(%ecx), %xmm1
; X86-NEXT:    movaps %xmm0, (%eax)
; X86-NEXT:    movaps %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movaps (%rdi), %xmm0
; X64-SSE2-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT:    movaps %xmm0, (%rsi)
; X64-SSE2-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE2-NEXT:    retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE4A:       # %bb.0:
; X64-SSE4A-NEXT:    movaps (%rdi), %xmm0
; X64-SSE4A-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT:    movaps %xmm0, (%rsi)
; X64-SSE4A-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT:    retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE41-NEXT:    movdqa %xmm0, (%rsi)
; X64-SSE41-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE41-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntload:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; X64-AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX-NEXT:    vmovaps %xmm1, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 32, !nontemporal !0
  %3 = load <4 x float>, ptr %1, align 16
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 32
  store <4 x float> %3, ptr %4, align 16
  ret void
}

; Don't merge nt and non-nt stores even if aligned.
define void @merge_2_v4f32_align32_mix_ntstore(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movaps (%ecx), %xmm0
; X86-NEXT:    movaps 16(%ecx), %xmm1
; X86-NEXT:    movntps %xmm0, (%eax)
; X86-NEXT:    movaps %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm0
; X64-SSE-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE-NEXT:    movntps %xmm0, (%rsi)
; X64-SSE-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align32_mix_ntstore:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT:    vmovntps %xmm0, (%rsi)
; X64-AVX-NEXT:    vmovaps %xmm1, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 32
  %3 = load <4 x float>, ptr %1, align 16
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 32, !nontemporal !0
  store <4 x float> %3, ptr %4, align 16
  ret void
}

; AVX2 can't perform NT-load-ymm on 16-byte aligned memory.
; Must be kept separate as VMOVNTDQA xmm.
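; (MOVNTDQA/VMOVNTDQA require the memory operand to be aligned to the vector
; width: 16 bytes for the xmm form, 32 bytes for the ymm form.)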
define void @merge_2_v4f32_align16_ntload(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align16_ntload:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movaps (%ecx), %xmm0
; X86-NEXT:    movaps 16(%ecx), %xmm1
; X86-NEXT:    movaps %xmm0, (%eax)
; X86-NEXT:    movaps %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movaps (%rdi), %xmm0
; X64-SSE2-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE2-NEXT:    movaps %xmm0, (%rsi)
; X64-SSE2-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE2-NEXT:    retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE4A:       # %bb.0:
; X64-SSE4A-NEXT:    movaps (%rdi), %xmm0
; X64-SSE4A-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE4A-NEXT:    movaps %xmm0, (%rsi)
; X64-SSE4A-NEXT:    movaps %xmm1, 16(%rsi)
; X64-SSE4A-NEXT:    retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align16_ntload:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    movntdqa (%rdi), %xmm0
; X64-SSE41-NEXT:    movntdqa 16(%rdi), %xmm1
; X64-SSE41-NEXT:    movdqa %xmm0, (%rsi)
; X64-SSE41-NEXT:    movdqa %xmm1, 16(%rsi)
; X64-SSE41-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align16_ntload:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; X64-AVX-NEXT:    vmovntdqa 16(%rdi), %xmm1
; X64-AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX-NEXT:    vmovdqa %xmm1, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 16, !nontemporal !0
  %3 = load <4 x float>, ptr %1, align 16, !nontemporal !0
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 16
  store <4 x float> %3, ptr %4, align 16
  ret void
}

; AVX can't perform NT-store-ymm on 16-byte aligned memory.
; Must be kept separate as VMOVNTPS xmm.
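; (MOVNTPS/VMOVNTPS likewise require the memory operand to be aligned to the
; vector width, so only the 16-byte xmm form is usable here.)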
define void @merge_2_v4f32_align16_ntstore(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align16_ntstore:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movaps (%ecx), %xmm0
; X86-NEXT:    movaps 16(%ecx), %xmm1
; X86-NEXT:    movntps %xmm0, (%eax)
; X86-NEXT:    movntps %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE-LABEL: merge_2_v4f32_align16_ntstore:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm0
; X64-SSE-NEXT:    movaps 16(%rdi), %xmm1
; X64-SSE-NEXT:    movntps %xmm0, (%rsi)
; X64-SSE-NEXT:    movntps %xmm1, 16(%rsi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align16_ntstore:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; X64-AVX-NEXT:    vmovntps %xmm0, (%rsi)
; X64-AVX-NEXT:    vmovntps %xmm1, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 16
  %3 = load <4 x float>, ptr %1, align 16
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 16, !nontemporal !0
  store <4 x float> %3, ptr %4, align 16, !nontemporal !0
  ret void
}

; Nothing can perform NT-load-vector on 1-byte aligned memory.
; Just perform regular loads.
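; (The !nontemporal hint on a load may legally be dropped, so the loads can be
; lowered as ordinary unaligned loads and merged where profitable.)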
define void @merge_2_v4f32_align1_ntload(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align1_ntload:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movups (%ecx), %xmm0
; X86-NEXT:    movups 16(%ecx), %xmm1
; X86-NEXT:    movups %xmm0, (%eax)
; X86-NEXT:    movups %xmm1, 16(%eax)
; X86-NEXT:    retl
;
; X64-SSE-LABEL: merge_2_v4f32_align1_ntload:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups (%rdi), %xmm0
; X64-SSE-NEXT:    movups 16(%rdi), %xmm1
; X64-SSE-NEXT:    movups %xmm0, (%rsi)
; X64-SSE-NEXT:    movups %xmm1, 16(%rsi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1_ntload:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
; X64-AVX-NEXT:    vmovups %ymm0, (%rsi)
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 1, !nontemporal !0
  %3 = load <4 x float>, ptr %1, align 1, !nontemporal !0
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 1
  store <4 x float> %3, ptr %4, align 1
  ret void
}

; Nothing can perform NT-store-vector on 1-byte aligned memory.
; Must be scalarized to use MOVNTI/MOVNTSD.
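; (MOVNTI does a nontemporal store of a general-purpose register; MOVNTSD,
; available with SSE4A, does a nontemporal store of the low double of an xmm
; register.)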
define void @merge_2_v4f32_align1_ntstore(ptr %a0, ptr %a1) nounwind {
; X86-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movdqu (%ecx), %xmm1
; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm0
; X86-SSE2-NEXT:    movd %xmm1, %ecx
; X86-SSE2-NEXT:    movntil %ecx, (%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X86-SSE2-NEXT:    movd %xmm2, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 12(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-SSE2-NEXT:    movd %xmm2, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 8(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; X86-SSE2-NEXT:    movd %xmm1, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; X86-SSE2-NEXT:    movd %xmm1, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 28(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT:    movd %xmm1, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 24(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 20(%eax)
; X86-SSE2-NEXT:    retl
;
; X86-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
; X86-SSE4A:       # %bb.0:
; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4A-NEXT:    movsd (%ecx), %xmm0 # xmm0 = mem[0],zero
; X86-SSE4A-NEXT:    movsd 8(%ecx), %xmm1 # xmm1 = mem[0],zero
; X86-SSE4A-NEXT:    movsd 16(%ecx), %xmm2 # xmm2 = mem[0],zero
; X86-SSE4A-NEXT:    movsd 24(%ecx), %xmm3 # xmm3 = mem[0],zero
; X86-SSE4A-NEXT:    movntsd %xmm0, (%eax)
; X86-SSE4A-NEXT:    movntsd %xmm1, 8(%eax)
; X86-SSE4A-NEXT:    movntsd %xmm3, 24(%eax)
; X86-SSE4A-NEXT:    movntsd %xmm2, 16(%eax)
; X86-SSE4A-NEXT:    retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, (%rsi)
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, 8(%rsi)
; X64-SSE2-NEXT:    movq %xmm1, %rax
; X64-SSE2-NEXT:    movntiq %rax, 16(%rsi)
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, 24(%rsi)
; X64-SSE2-NEXT:    retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE4A:       # %bb.0:
; X64-SSE4A-NEXT:    movsd (%rdi), %xmm0 # xmm0 = mem[0],zero
; X64-SSE4A-NEXT:    movsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero
; X64-SSE4A-NEXT:    movsd 16(%rdi), %xmm2 # xmm2 = mem[0],zero
; X64-SSE4A-NEXT:    movsd 24(%rdi), %xmm3 # xmm3 = mem[0],zero
; X64-SSE4A-NEXT:    movntsd %xmm0, (%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm1, 8(%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm3, 24(%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm2, 16(%rsi)
; X64-SSE4A-NEXT:    retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
; X64-SSE41-NEXT:    pextrq $1, %xmm0, %rax
; X64-SSE41-NEXT:    movntiq %rax, 8(%rsi)
; X64-SSE41-NEXT:    movq %xmm0, %rax
; X64-SSE41-NEXT:    movntiq %rax, (%rsi)
; X64-SSE41-NEXT:    pextrq $1, %xmm1, %rax
; X64-SSE41-NEXT:    movntiq %rax, 24(%rsi)
; X64-SSE41-NEXT:    movq %xmm1, %rax
; X64-SSE41-NEXT:    movntiq %rax, 16(%rsi)
; X64-SSE41-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1_ntstore:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
; X64-AVX-NEXT:    vpextrq $1, %xmm0, %rax
; X64-AVX-NEXT:    movntiq %rax, 8(%rsi)
; X64-AVX-NEXT:    vmovq %xmm0, %rax
; X64-AVX-NEXT:    movntiq %rax, (%rsi)
; X64-AVX-NEXT:    vpextrq $1, %xmm1, %rax
; X64-AVX-NEXT:    movntiq %rax, 24(%rsi)
; X64-AVX-NEXT:    vmovq %xmm1, %rax
; X64-AVX-NEXT:    movntiq %rax, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 1
  %3 = load <4 x float>, ptr %1, align 1
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 1, !nontemporal !0
  store <4 x float> %3, ptr %4, align 1, !nontemporal !0
  ret void
}

; Nothing can perform NT-load-vector on 1-byte aligned memory.
; Just perform regular loads and scalarize NT-stores.
define void @merge_2_v4f32_align1(ptr %a0, ptr %a1) nounwind {
; X86-SSE2-LABEL: merge_2_v4f32_align1:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movdqu (%ecx), %xmm1
; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm0
; X86-SSE2-NEXT:    movd %xmm1, %ecx
; X86-SSE2-NEXT:    movntil %ecx, (%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X86-SSE2-NEXT:    movd %xmm2, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 12(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-SSE2-NEXT:    movd %xmm2, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 8(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; X86-SSE2-NEXT:    movd %xmm1, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; X86-SSE2-NEXT:    movd %xmm1, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 28(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT:    movd %xmm1, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 24(%eax)
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-SSE2-NEXT:    movd %xmm0, %ecx
; X86-SSE2-NEXT:    movntil %ecx, 20(%eax)
; X86-SSE2-NEXT:    retl
;
; X86-SSE4A-LABEL: merge_2_v4f32_align1:
; X86-SSE4A:       # %bb.0:
; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE4A-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE4A-NEXT:    movsd (%ecx), %xmm0 # xmm0 = mem[0],zero
; X86-SSE4A-NEXT:    movsd 8(%ecx), %xmm1 # xmm1 = mem[0],zero
; X86-SSE4A-NEXT:    movsd 16(%ecx), %xmm2 # xmm2 = mem[0],zero
; X86-SSE4A-NEXT:    movsd 24(%ecx), %xmm3 # xmm3 = mem[0],zero
; X86-SSE4A-NEXT:    movntsd %xmm0, (%eax)
; X86-SSE4A-NEXT:    movntsd %xmm1, 8(%eax)
; X86-SSE4A-NEXT:    movntsd %xmm3, 24(%eax)
; X86-SSE4A-NEXT:    movntsd %xmm2, 16(%eax)
; X86-SSE4A-NEXT:    retl
;
; X64-SSE2-LABEL: merge_2_v4f32_align1:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, (%rsi)
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, 8(%rsi)
; X64-SSE2-NEXT:    movq %xmm1, %rax
; X64-SSE2-NEXT:    movntiq %rax, 16(%rsi)
; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; X64-SSE2-NEXT:    movq %xmm0, %rax
; X64-SSE2-NEXT:    movntiq %rax, 24(%rsi)
; X64-SSE2-NEXT:    retq
;
; X64-SSE4A-LABEL: merge_2_v4f32_align1:
; X64-SSE4A:       # %bb.0:
; X64-SSE4A-NEXT:    movsd (%rdi), %xmm0 # xmm0 = mem[0],zero
; X64-SSE4A-NEXT:    movsd 8(%rdi), %xmm1 # xmm1 = mem[0],zero
; X64-SSE4A-NEXT:    movsd 16(%rdi), %xmm2 # xmm2 = mem[0],zero
; X64-SSE4A-NEXT:    movsd 24(%rdi), %xmm3 # xmm3 = mem[0],zero
; X64-SSE4A-NEXT:    movntsd %xmm0, (%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm1, 8(%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm3, 24(%rsi)
; X64-SSE4A-NEXT:    movntsd %xmm2, 16(%rsi)
; X64-SSE4A-NEXT:    retq
;
; X64-SSE41-LABEL: merge_2_v4f32_align1:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
; X64-SSE41-NEXT:    pextrq $1, %xmm0, %rax
; X64-SSE41-NEXT:    movntiq %rax, 8(%rsi)
; X64-SSE41-NEXT:    movq %xmm0, %rax
; X64-SSE41-NEXT:    movntiq %rax, (%rsi)
; X64-SSE41-NEXT:    pextrq $1, %xmm1, %rax
; X64-SSE41-NEXT:    movntiq %rax, 24(%rsi)
; X64-SSE41-NEXT:    movq %xmm1, %rax
; X64-SSE41-NEXT:    movntiq %rax, 16(%rsi)
; X64-SSE41-NEXT:    retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
; X64-AVX-NEXT:    vpextrq $1, %xmm0, %rax
; X64-AVX-NEXT:    movntiq %rax, 8(%rsi)
; X64-AVX-NEXT:    vmovq %xmm0, %rax
; X64-AVX-NEXT:    movntiq %rax, (%rsi)
; X64-AVX-NEXT:    vpextrq $1, %xmm1, %rax
; X64-AVX-NEXT:    movntiq %rax, 24(%rsi)
; X64-AVX-NEXT:    vmovq %xmm1, %rax
; X64-AVX-NEXT:    movntiq %rax, 16(%rsi)
; X64-AVX-NEXT:    retq
  %1 = getelementptr inbounds <4 x float>, ptr %a0, i64 1, i64 0
  %2 = load <4 x float>, ptr %a0, align 1, !nontemporal !0
  %3 = load <4 x float>, ptr %1, align 1, !nontemporal !0
  %4 = getelementptr inbounds <4 x float>, ptr %a1, i64 1, i64 0
  store <4 x float> %2, ptr %a1, align 1, !nontemporal !0
  store <4 x float> %3, ptr %4, align 1, !nontemporal !0
  ret void
}
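
; The !nontemporal marker is a single i32 metadata node with value 1, as
; required by the LLVM LangRef.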
!0 = !{i32 1}