xref: /llvm-project/llvm/test/CodeGen/X86/slow-unaligned-mem.ll (revision 02e4186d0b3508e79d78b0ec844518b13a3fe9ea)
1; Intel chips with slow unaligned memory accesses
2
3; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3      2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SCALAR
4; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium3m     2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SCALAR
5; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium-m     2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
6; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4      2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
7; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=pentium4m     2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
8; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=yonah         2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
9; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=prescott      2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
10; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nocona        2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
11; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=core2         2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
12; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=penryn        2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
13; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bonnell       2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
14
15; Intel chips with fast unaligned memory accesses
16
17; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont     2>&1 | FileCheck %s --check-prefixes=FAST,FAST-SSE
18; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=nehalem        2>&1 | FileCheck %s --check-prefixes=FAST,FAST-SSE
19; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=westmere       2>&1 | FileCheck %s --check-prefixes=FAST,FAST-SSE
20; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=sandybridge    2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX128
21; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=ivybridge      2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX128
22; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=haswell        2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX256
23; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=broadwell      2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX256
24; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=knl            2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX512
25; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=skylake-avx512 2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX256
26
27; AMD chips with slow unaligned memory accesses
28
29; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-4      2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SCALAR
30; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-xp     2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SCALAR
31; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=k8            2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
32; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=opteron       2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
33; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon64      2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
34; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon-fx     2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
35; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=k8-sse3       2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
36; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=opteron-sse3  2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
37; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=athlon64-sse3 2>&1 | FileCheck %s --check-prefixes=SLOW,SLOW-SSE
38
39; AMD chips with fast unaligned memory accesses
40
41; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=amdfam10      2>&1 | FileCheck %s --check-prefixes=FAST,FAST-SSE
42; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=barcelona     2>&1 | FileCheck %s --check-prefixes=FAST,FAST-SSE
43; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=btver1        2>&1 | FileCheck %s --check-prefixes=FAST,FAST-SSE
44; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=btver2        2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX256
45; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver1        2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX256
46; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver2        2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX256
47; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver3        2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX256
48; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=bdver4        2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX256
49; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver1        2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX256
50; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver2        2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX256
51; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver3        2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX256
52; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver4        2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX512
53; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver5        2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX512
54
55; Other chips with slow unaligned memory accesses
56
57; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=c3-2          2>&1 | FileCheck %s --check-prefixes=SLOW
58
59; Verify that the slow/fast unaligned memory attribute is set correctly for each CPU model.
60; Slow chips use 4-byte scalar stores or 8-byte movsd stores. Fast chips with SSE or later use 16-byte or wider vector stores.
61; Chips that don't have SSE use 4-byte stores either way, so they're not tested.
62
63; Also verify that SSE4.2 or SSE4a imply fast unaligned accesses.
64
65; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=sse4.2       2>&1 | FileCheck %s --check-prefixes=FAST,FAST-SSE
66; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=sse4a        2>&1 | FileCheck %s --check-prefixes=FAST,FAST-SSE
67
68; SLOW-NOT: not a recognized processor
69; FAST-NOT: not a recognized processor
; store_zeros: zero-fill 64 bytes at %a through the llvm.memset intrinsic.
; The pointer argument carries no align attribute; the directives inside the
; function pin down the exact store sequence llc must emit for each group of
; RUN lines (every group covers the same 64 bytes with a different store width).
70define void @store_zeros(ptr %a) {
; Slow unaligned access, scalar lowering: sixteen 4-byte immediate stores.
71; SLOW-SCALAR-LABEL: store_zeros:
72; SLOW-SCALAR:       # %bb.0:
73; SLOW-SCALAR-NEXT:    movl {{[0-9]+}}(%esp), %eax
74; SLOW-SCALAR-NEXT:    movl $0
75; SLOW-SCALAR-NEXT:    movl $0
76; SLOW-SCALAR-NEXT:    movl $0
77; SLOW-SCALAR-NEXT:    movl $0
78; SLOW-SCALAR-NEXT:    movl $0
79; SLOW-SCALAR-NEXT:    movl $0
80; SLOW-SCALAR-NEXT:    movl $0
81; SLOW-SCALAR-NEXT:    movl $0
82; SLOW-SCALAR-NEXT:    movl $0
83; SLOW-SCALAR-NEXT:    movl $0
84; SLOW-SCALAR-NEXT:    movl $0
85; SLOW-SCALAR-NEXT:    movl $0
86; SLOW-SCALAR-NEXT:    movl $0
87; SLOW-SCALAR-NEXT:    movl $0
88; SLOW-SCALAR-NEXT:    movl $0
89; SLOW-SCALAR-NEXT:    movl $0
90; SLOW-SCALAR-NOT:     movl
91;
; Slow unaligned access with SSE code: one xorps to zero %xmm0, then eight
; movsd stores (8 bytes each).
92; SLOW-SSE-LABEL: store_zeros:
93; SLOW-SSE:       # %bb.0:
94; SLOW-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
95; SLOW-SSE-NEXT:    xorps %xmm0, %xmm0
96; SLOW-SSE-NEXT:    movsd %xmm0
97; SLOW-SSE-NEXT:    movsd %xmm0
98; SLOW-SSE-NEXT:    movsd %xmm0
99; SLOW-SSE-NEXT:    movsd %xmm0
100; SLOW-SSE-NEXT:    movsd %xmm0
101; SLOW-SSE-NEXT:    movsd %xmm0
102; SLOW-SSE-NEXT:    movsd %xmm0
103; SLOW-SSE-NEXT:    movsd %xmm0
104; SLOW-SSE-NOT:     movsd
105;
; Fast unaligned access with SSE code: four unaligned movups stores of %xmm0.
106; FAST-SSE-LABEL: store_zeros:
107; FAST-SSE:       # %bb.0:
108; FAST-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
109; FAST-SSE-NEXT:    xorps %xmm0, %xmm0
110; FAST-SSE-NEXT:    movups %xmm0
111; FAST-SSE-NEXT:    movups %xmm0
112; FAST-SSE-NEXT:    movups %xmm0
113; FAST-SSE-NEXT:    movups %xmm0
114; FAST-SSE-NOT:     movups
115;
; Fast unaligned access, AVX encoding but 128-bit stores: four vmovups of %xmm0.
116; FAST-AVX128-LABEL: store_zeros:
117; FAST-AVX128:       # %bb.0:
118; FAST-AVX128-NEXT:    movl {{[0-9]+}}(%esp), %eax
119; FAST-AVX128-NEXT:    vxorps %xmm0, %xmm0, %xmm0
120; FAST-AVX128-NEXT:    vmovups %xmm0
121; FAST-AVX128-NEXT:    vmovups %xmm0
122; FAST-AVX128-NEXT:    vmovups %xmm0
123; FAST-AVX128-NEXT:    vmovups %xmm0
124; FAST-AVX128-NOT:     vmovups
125;
; Fast unaligned access, 256-bit stores: two vmovups of %ymm0.
126; FAST-AVX256-LABEL: store_zeros:
127; FAST-AVX256:       # %bb.0:
128; FAST-AVX256-NEXT:    movl {{[0-9]+}}(%esp), %eax
129; FAST-AVX256-NEXT:    vxorps %xmm0, %xmm0, %xmm0
130; FAST-AVX256-NEXT:    vmovups %ymm0
131; FAST-AVX256-NEXT:    vmovups %ymm0
132; FAST-AVX256-NOT:     vmovups
133;
; Fast unaligned access, 512-bit store: a single vmovups of %zmm0 to (%eax).
134; FAST-AVX512-LABEL: store_zeros:
135; FAST-AVX512:       # %bb.0:
136; FAST-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
137; FAST-AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
138; FAST-AVX512-NEXT:    vmovups %zmm0, (%eax)
139; FAST-AVX512-NOT:     vmovups
  ; Operands: destination %a, fill byte 0, length 64 bytes; the trailing i1 is
  ; false (the isvolatile flag in the intrinsic's LangRef signature).
140  call void @llvm.memset.p0.i64(ptr %a, i8 0, i64 64, i1 false)
141  ret void
142}
143
; Declaration of the memset intrinsic called by @store_zeros; the operand
; types match that call: (ptr, i8, i64, i1).
144declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1)
145
146