; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVXONLY
; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
; RUN: llc -O0 -fast-isel -fast-isel-abort=1 -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512bw,+avx512vl < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX

; Verify that fast-isel knows how to select aligned/unaligned vector loads.
; Also verify that the selected load instruction is in the correct domain.
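; In the checks below, aligned loads are expected to select the aligned forms
; (movdqa/movaps/movapd and their VEX/EVEX equivalents) and unaligned or
; under-aligned loads the unaligned forms (movdqu/movups/movupd), with integer
; and FP loads staying in their respective domains where a matching
; instruction is available.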

define <16 x i8> @test_v16i8(<16 x i8>* %V) {
; SSE-LABEL: test_v16i8:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i8:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <16 x i8>, <16 x i8>* %V, align 16
  ret <16 x i8> %0
}

define <8 x i16> @test_v8i16(<8 x i16>* %V) {
; SSE-LABEL: test_v8i16:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i16:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <8 x i16>, <8 x i16>* %V, align 16
  ret <8 x i16> %0
}

define <4 x i32> @test_v4i32(<4 x i32>* %V) {
; SSE-LABEL: test_v4i32:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i32:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <4 x i32>, <4 x i32>* %V, align 16
  ret <4 x i32> %0
}

define <2 x i64> @test_v2i64(<2 x i64>* %V) {
; SSE-LABEL: test_v2i64:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i64:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <2 x i64>, <2 x i64>* %V, align 16
  ret <2 x i64> %0
}

define <16 x i8> @test_v16i8_unaligned(<16 x i8>* %V) {
; SSE-LABEL: test_v16i8_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i8_unaligned:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <16 x i8>, <16 x i8>* %V, align 4
  ret <16 x i8> %0
}

define <8 x i16> @test_v8i16_unaligned(<8 x i16>* %V) {
; SSE-LABEL: test_v8i16_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i16_unaligned:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <8 x i16>, <8 x i16>* %V, align 4
  ret <8 x i16> %0
}

define <4 x i32> @test_v4i32_unaligned(<4 x i32>* %V) {
; SSE-LABEL: test_v4i32_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i32_unaligned:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <4 x i32>, <4 x i32>* %V, align 4
  ret <4 x i32> %0
}

define <2 x i64> @test_v2i64_unaligned(<2 x i64>* %V) {
; SSE-LABEL: test_v2i64_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqu (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i64_unaligned:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqu (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <2 x i64>, <2 x i64>* %V, align 4
  ret <2 x i64> %0
}

define <4 x float> @test_v4f32(<4 x float>* %V) {
; SSE-LABEL: test_v4f32:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f32:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <4 x float>, <4 x float>* %V, align 16
  ret <4 x float> %0
}

define <2 x double> @test_v2f64(<2 x double>* %V) {
; SSE-LABEL: test_v2f64:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movapd (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovapd (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <2 x double>, <2 x double>* %V, align 16
  ret <2 x double> %0
}

define <4 x float> @test_v4f32_unaligned(<4 x float>* %V) {
; SSE-LABEL: test_v4f32_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f32_unaligned:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <4 x float>, <4 x float>* %V, align 4
  ret <4 x float> %0
}

define <2 x double> @test_v2f64_unaligned(<2 x double>* %V) {
; SSE-LABEL: test_v2f64_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movupd (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_unaligned:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovupd (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <2 x double>, <2 x double>* %V, align 4
  ret <2 x double> %0
}

define <16 x i8> @test_v16i8_abi_alignment(<16 x i8>* %V) {
; SSE-LABEL: test_v16i8_abi_alignment:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i8_abi_alignment:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <16 x i8>, <16 x i8>* %V
  ret <16 x i8> %0
}

define <8 x i16> @test_v8i16_abi_alignment(<8 x i16>* %V) {
; SSE-LABEL: test_v8i16_abi_alignment:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i16_abi_alignment:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <8 x i16>, <8 x i16>* %V
  ret <8 x i16> %0
}

define <4 x i32> @test_v4i32_abi_alignment(<4 x i32>* %V) {
; SSE-LABEL: test_v4i32_abi_alignment:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i32_abi_alignment:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <4 x i32>, <4 x i32>* %V
  ret <4 x i32> %0
}

define <2 x i64> @test_v2i64_abi_alignment(<2 x i64>* %V) {
; SSE-LABEL: test_v2i64_abi_alignment:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2i64_abi_alignment:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <2 x i64>, <2 x i64>* %V
  ret <2 x i64> %0
}

define <4 x float> @test_v4f32_abi_alignment(<4 x float>* %V) {
; SSE-LABEL: test_v4f32_abi_alignment:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f32_abi_alignment:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <4 x float>, <4 x float>* %V
  ret <4 x float> %0
}

define <2 x double> @test_v2f64_abi_alignment(<2 x double>* %V) {
; SSE-LABEL: test_v2f64_abi_alignment:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movapd (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_abi_alignment:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovapd (%rdi), %xmm0
; AVX-NEXT:    retq
entry:
  %0 = load <2 x double>, <2 x double>* %V
  ret <2 x double> %0
}

define <32 x i8> @test_v32i8(<32 x i8>* %V) {
; SSE-LABEL: test_v32i8:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v32i8:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqa (%rdi), %ymm0
; AVX-NEXT:    retq
entry:
  %0 = load <32 x i8>, <32 x i8>* %V, align 32
  ret <32 x i8> %0
}

define <16 x i16> @test_v16i16(<16 x i16>* %V) {
; SSE-LABEL: test_v16i16:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i16:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqa (%rdi), %ymm0
; AVX-NEXT:    retq
entry:
  %0 = load <16 x i16>, <16 x i16>* %V, align 32
  ret <16 x i16> %0
}

define <8 x i32> @test_v8i32(<8 x i32>* %V) {
; SSE-LABEL: test_v8i32:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i32:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqu (%rdi), %ymm0
; AVX-NEXT:    retq
entry:
  %0 = load <8 x i32>, <8 x i32>* %V, align 16
  ret <8 x i32> %0
}

define <4 x i64> @test_v4i64(<4 x i64>* %V) {
; SSE-LABEL: test_v4i64:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i64:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqa (%rdi), %ymm0
; AVX-NEXT:    retq
entry:
  %0 = load <4 x i64>, <4 x i64>* %V, align 32
  ret <4 x i64> %0
}

define <32 x i8> @test_v32i8_unaligned(<32 x i8>* %V) {
; SSE-LABEL: test_v32i8_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v32i8_unaligned:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqu (%rdi), %ymm0
; AVX-NEXT:    retq
entry:
  %0 = load <32 x i8>, <32 x i8>* %V, align 4
  ret <32 x i8> %0
}

define <16 x i16> @test_v16i16_unaligned(<16 x i16>* %V) {
; SSE-LABEL: test_v16i16_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16i16_unaligned:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqu (%rdi), %ymm0
; AVX-NEXT:    retq
entry:
  %0 = load <16 x i16>, <16 x i16>* %V, align 4
  ret <16 x i16> %0
}

define <8 x i32> @test_v8i32_unaligned(<8 x i32>* %V) {
; SSE-LABEL: test_v8i32_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i32_unaligned:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqu (%rdi), %ymm0
; AVX-NEXT:    retq
entry:
  %0 = load <8 x i32>, <8 x i32>* %V, align 4
  ret <8 x i32> %0
}

define <4 x i64> @test_v4i64_unaligned(<4 x i64>* %V) {
; SSE-LABEL: test_v4i64_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4i64_unaligned:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovdqu (%rdi), %ymm0
; AVX-NEXT:    retq
entry:
  %0 = load <4 x i64>, <4 x i64>* %V, align 4
  ret <4 x i64> %0
}

define <8 x float> @test_v8f32(<8 x float>* %V) {
; SSE-LABEL: test_v8f32:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f32:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
entry:
  %0 = load <8 x float>, <8 x float>* %V, align 16
  ret <8 x float> %0
}

define <4 x double> @test_v4f64(<4 x double>* %V) {
; SSE-LABEL: test_v4f64:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movapd (%rdi), %xmm0
; SSE-NEXT:    movapd 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovupd (%rdi), %ymm0
; AVX-NEXT:    retq
entry:
  %0 = load <4 x double>, <4 x double>* %V, align 16
  ret <4 x double> %0
}

define <8 x float> @test_v8f32_unaligned(<8 x float>* %V) {
; SSE-LABEL: test_v8f32_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f32_unaligned:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
entry:
  %0 = load <8 x float>, <8 x float>* %V, align 4
  ret <8 x float> %0
}

define <4 x double> @test_v4f64_unaligned(<4 x double>* %V) {
; SSE-LABEL: test_v4f64_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movupd (%rdi), %xmm0
; SSE-NEXT:    movupd 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_unaligned:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovupd (%rdi), %ymm0
; AVX-NEXT:    retq
entry:
  %0 = load <4 x double>, <4 x double>* %V, align 4
  ret <4 x double> %0
}

define <64 x i8> @test_v64i8(<64 x i8>* %V) {
; SSE-LABEL: test_v64i8:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    movaps 32(%rdi), %xmm2
; SSE-NEXT:    movaps 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVXONLY-LABEL: test_v64i8:
; AVXONLY:       # BB#0: # %entry
; AVXONLY-NEXT:    vmovaps (%rdi), %ymm0
; AVXONLY-NEXT:    vmovaps 32(%rdi), %ymm1
; AVXONLY-NEXT:    retq
;
; KNL-LABEL: test_v64i8:
; KNL:       # BB#0: # %entry
; KNL-NEXT:    vmovaps (%rdi), %ymm0
; KNL-NEXT:    vmovaps 32(%rdi), %ymm1
; KNL-NEXT:    retq
;
; SKX-LABEL: test_v64i8:
; SKX:       # BB#0: # %entry
; SKX-NEXT:    vmovdqu64 (%rdi), %zmm0
; SKX-NEXT:    retq
entry:
  %0 = load <64 x i8>, <64 x i8>* %V, align 32
  ret <64 x i8> %0
}

define <32 x i16> @test_v32i16(<32 x i16>* %V) {
; SSE-LABEL: test_v32i16:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    movaps 32(%rdi), %xmm2
; SSE-NEXT:    movaps 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVXONLY-LABEL: test_v32i16:
; AVXONLY:       # BB#0: # %entry
; AVXONLY-NEXT:    vmovaps (%rdi), %ymm0
; AVXONLY-NEXT:    vmovaps 32(%rdi), %ymm1
; AVXONLY-NEXT:    retq
;
; KNL-LABEL: test_v32i16:
; KNL:       # BB#0: # %entry
; KNL-NEXT:    vmovaps (%rdi), %ymm0
; KNL-NEXT:    vmovaps 32(%rdi), %ymm1
; KNL-NEXT:    retq
;
; SKX-LABEL: test_v32i16:
; SKX:       # BB#0: # %entry
; SKX-NEXT:    vmovdqu64 (%rdi), %zmm0
; SKX-NEXT:    retq
entry:
  %0 = load <32 x i16>, <32 x i16>* %V, align 32
  ret <32 x i16> %0
}

define <16 x i32> @test_v16i32(<16 x i32>* %V) {
; SSE-LABEL: test_v16i32:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    movaps 32(%rdi), %xmm2
; SSE-NEXT:    movaps 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVXONLY-LABEL: test_v16i32:
; AVXONLY:       # BB#0: # %entry
; AVXONLY-NEXT:    vmovups (%rdi), %ymm0
; AVXONLY-NEXT:    vmovups 32(%rdi), %ymm1
; AVXONLY-NEXT:    retq
;
; AVX512-LABEL: test_v16i32:
; AVX512:       # BB#0: # %entry
; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT:    retq
entry:
  %0 = load <16 x i32>, <16 x i32>* %V, align 16
  ret <16 x i32> %0
}

define <8 x i64> @test_v8i64(<8 x i64>* %V) {
; SSE-LABEL: test_v8i64:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    movaps 32(%rdi), %xmm2
; SSE-NEXT:    movaps 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVXONLY-LABEL: test_v8i64:
; AVXONLY:       # BB#0: # %entry
; AVXONLY-NEXT:    vmovaps (%rdi), %ymm0
; AVXONLY-NEXT:    vmovaps 32(%rdi), %ymm1
; AVXONLY-NEXT:    retq
;
; AVX512-LABEL: test_v8i64:
; AVX512:       # BB#0: # %entry
; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT:    retq
entry:
  %0 = load <8 x i64>, <8 x i64>* %V, align 32
  ret <8 x i64> %0
}

define <64 x i8> @test_v64i8_unaligned(<64 x i8>* %V) {
; SSE-LABEL: test_v64i8_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVXONLY-LABEL: test_v64i8_unaligned:
; AVXONLY:       # BB#0: # %entry
; AVXONLY-NEXT:    vmovups (%rdi), %ymm0
; AVXONLY-NEXT:    vmovups 32(%rdi), %ymm1
; AVXONLY-NEXT:    retq
;
; KNL-LABEL: test_v64i8_unaligned:
; KNL:       # BB#0: # %entry
; KNL-NEXT:    vmovups (%rdi), %ymm0
; KNL-NEXT:    vmovups 32(%rdi), %ymm1
; KNL-NEXT:    retq
;
; SKX-LABEL: test_v64i8_unaligned:
; SKX:       # BB#0: # %entry
; SKX-NEXT:    vmovdqu64 (%rdi), %zmm0
; SKX-NEXT:    retq
entry:
  %0 = load <64 x i8>, <64 x i8>* %V, align 4
  ret <64 x i8> %0
}

define <32 x i16> @test_v32i16_unaligned(<32 x i16>* %V) {
; SSE-LABEL: test_v32i16_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVXONLY-LABEL: test_v32i16_unaligned:
; AVXONLY:       # BB#0: # %entry
; AVXONLY-NEXT:    vmovups (%rdi), %ymm0
; AVXONLY-NEXT:    vmovups 32(%rdi), %ymm1
; AVXONLY-NEXT:    retq
;
; KNL-LABEL: test_v32i16_unaligned:
; KNL:       # BB#0: # %entry
; KNL-NEXT:    vmovups (%rdi), %ymm0
; KNL-NEXT:    vmovups 32(%rdi), %ymm1
; KNL-NEXT:    retq
;
; SKX-LABEL: test_v32i16_unaligned:
; SKX:       # BB#0: # %entry
; SKX-NEXT:    vmovdqu64 (%rdi), %zmm0
; SKX-NEXT:    retq
entry:
  %0 = load <32 x i16>, <32 x i16>* %V, align 4
  ret <32 x i16> %0
}

define <16 x i32> @test_v16i32_unaligned(<16 x i32>* %V) {
; SSE-LABEL: test_v16i32_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVXONLY-LABEL: test_v16i32_unaligned:
; AVXONLY:       # BB#0: # %entry
; AVXONLY-NEXT:    vmovups (%rdi), %ymm0
; AVXONLY-NEXT:    vmovups 32(%rdi), %ymm1
; AVXONLY-NEXT:    retq
;
; AVX512-LABEL: test_v16i32_unaligned:
; AVX512:       # BB#0: # %entry
; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT:    retq
entry:
  %0 = load <16 x i32>, <16 x i32>* %V, align 4
  ret <16 x i32> %0
}

define <8 x i64> @test_v8i64_unaligned(<8 x i64>* %V) {
; SSE-LABEL: test_v8i64_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVXONLY-LABEL: test_v8i64_unaligned:
; AVXONLY:       # BB#0: # %entry
; AVXONLY-NEXT:    vmovups (%rdi), %ymm0
; AVXONLY-NEXT:    vmovups 32(%rdi), %ymm1
; AVXONLY-NEXT:    retq
;
; AVX512-LABEL: test_v8i64_unaligned:
; AVX512:       # BB#0: # %entry
; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT:    retq
entry:
  %0 = load <8 x i64>, <8 x i64>* %V, align 4
  ret <8 x i64> %0
}

define <8 x float> @test_v16f32(<8 x float>* %V) {
; SSE-LABEL: test_v16f32:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f32:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vmovups (%rdi), %ymm0
; AVX-NEXT:    retq
entry:
  %0 = load <8 x float>, <8 x float>* %V, align 16
  ret <8 x float> %0
}

define <8 x double> @test_v8f64(<8 x double>* %V) {
; SSE-LABEL: test_v8f64:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movapd (%rdi), %xmm0
; SSE-NEXT:    movapd 16(%rdi), %xmm1
; SSE-NEXT:    movapd 32(%rdi), %xmm2
; SSE-NEXT:    movapd 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVXONLY-LABEL: test_v8f64:
; AVXONLY:       # BB#0: # %entry
; AVXONLY-NEXT:    vmovupd (%rdi), %ymm0
; AVXONLY-NEXT:    vmovupd 32(%rdi), %ymm1
; AVXONLY-NEXT:    retq
;
; AVX512-LABEL: test_v8f64:
; AVX512:       # BB#0: # %entry
; AVX512-NEXT:    vmovupd (%rdi), %zmm0
; AVX512-NEXT:    retq
entry:
  %0 = load <8 x double>, <8 x double>* %V, align 16
  ret <8 x double> %0
}

define <16 x float> @test_v16f32_unaligned(<16 x float>* %V) {
; SSE-LABEL: test_v16f32_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    movups 16(%rdi), %xmm1
; SSE-NEXT:    movups 32(%rdi), %xmm2
; SSE-NEXT:    movups 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVXONLY-LABEL: test_v16f32_unaligned:
; AVXONLY:       # BB#0: # %entry
; AVXONLY-NEXT:    vmovups (%rdi), %ymm0
; AVXONLY-NEXT:    vmovups 32(%rdi), %ymm1
; AVXONLY-NEXT:    retq
;
; AVX512-LABEL: test_v16f32_unaligned:
; AVX512:       # BB#0: # %entry
; AVX512-NEXT:    vmovups (%rdi), %zmm0
; AVX512-NEXT:    retq
entry:
  %0 = load <16 x float>, <16 x float>* %V, align 4
  ret <16 x float> %0
}

define <8 x double> @test_v8f64_unaligned(<8 x double>* %V) {
; SSE-LABEL: test_v8f64_unaligned:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movupd (%rdi), %xmm0
; SSE-NEXT:    movupd 16(%rdi), %xmm1
; SSE-NEXT:    movupd 32(%rdi), %xmm2
; SSE-NEXT:    movupd 48(%rdi), %xmm3
; SSE-NEXT:    retq
;
; AVXONLY-LABEL: test_v8f64_unaligned:
; AVXONLY:       # BB#0: # %entry
; AVXONLY-NEXT:    vmovupd (%rdi), %ymm0
; AVXONLY-NEXT:    vmovupd 32(%rdi), %ymm1
; AVXONLY-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_unaligned:
; AVX512:       # BB#0: # %entry
; AVX512-NEXT:    vmovupd (%rdi), %zmm0
; AVX512-NEXT:    retq
entry:
  %0 = load <8 x double>, <8 x double>* %V, align 4
  ret <8 x double> %0
}