; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx < %s -o - | FileCheck %s --check-prefix=REGULAR
;
; <rdar://problem/14477220>

%class.Complex = type { float, float }


; Check that independent slices lead to independent loads, and that each slice
; is then fed into a different register file.
;
; The layout is:
; LSB 0 1 2 3 | 4 5 6 7 MSB
;       Low      High
; The base address points to 0 and is 8-byte aligned.
; Low slice starts at 0 (base) and is 8-byte aligned.
; High slice starts at 4 (base + 4 bytes) and is 4-byte aligned.
;
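; A plausible C++ source for @t1 (a sketch reconstructed from the IR below;
; the class, field, and parameter names are illustrative assumptions):
;   class Complex { public: float real; float imag; };
;   void t1(Complex *out, long out_start) {
;     out[out_start].real += out[out_start + 8].real;
;     out[out_start].imag += out[out_start + 8].imag;
;   }
;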
; STRESS-LABEL: _t1:
; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
; STRESS: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
; STRESS-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
; Add low slice: out[out_start].real, this is base + 0.
; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
; Add high slice: out[out_start].imm, this is base + 4.
; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
; Pack the results: vinsertps $16 inserts RES_Imm into element 1 of RES_Real.
; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
; Put the results back into out[out_start].
; STRESS-NEXT: vmovlps [[RES_Vec]], ([[BASE]])
;
; The same applies to REGULAR: we eliminate a register-bank copy for each slice.
; REGULAR-LABEL: _t1:
; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
; REGULAR: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
; REGULAR-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
; Add low slice: out[out_start].real, this is base + 0.
; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
; Add high slice: out[out_start].imm, this is base + 4.
; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
; Pack the results: vinsertps $16 inserts RES_Imm into element 1 of RES_Real.
; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
; Put the results back into out[out_start].
; REGULAR-NEXT: vmovlps [[RES_Vec]], ([[BASE]])
define void @t1(ptr nocapture %out, i64 %out_start) {
entry:
  %arrayidx = getelementptr inbounds %class.Complex, ptr %out, i64 %out_start
  %tmp1 = load i64, ptr %arrayidx, align 8
  %t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32
  %tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
  %t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32
  %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
  %tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
  %add = add i64 %out_start, 8
  %arrayidx2 = getelementptr inbounds %class.Complex, ptr %out, i64 %add
  %tmp4 = load float, ptr %arrayidx2, align 4
  %add.i = fadd float %tmp4, %tmp2
  %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
  %r.i = getelementptr inbounds %class.Complex, ptr %arrayidx2, i64 0, i32 1
  %tmp5 = load float, ptr %r.i, align 4
  %add5.i = fadd float %tmp5, %tmp3
  %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
  store <2 x float> %retval.sroa.0.4.vec.insert.i, ptr %arrayidx, align 4
  ret void
}

; Function Attrs: nounwind
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1)

; Function Attrs: nounwind
declare void @llvm.lifetime.start.p0(i64, ptr nocapture)

; Function Attrs: nounwind
declare void @llvm.lifetime.end.p0(i64, ptr nocapture)

; Check that we do not read outside the bits covered by the original load.
;
; The 64-bit load should have been split into one 32-bit slice and one 16-bit slice.
; The 16-bit slice should be zero-extended to match the final type.
;
; The memory layout is:
; LSB 0 1 2 3 | 4 5 | 6 7 MSB
;      Low            High
; The base address points to 0 and is 8-byte aligned.
; Low slice starts at 0 (base) and is 8-byte aligned.
; High slice starts at 6 (base + 6 bytes) and is 2-byte aligned.
;
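; A plausible C++ source for @t2 (a sketch reconstructed from the IR below,
; reusing the Complex class from the sketch above; names are assumptions):
;   unsigned t2(Complex *out, long out_start) {
;     unsigned long chunk = *(unsigned long *)&out[out_start];
;     return (unsigned)chunk + (unsigned)(chunk >> 48);
;   }
;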
; STRESS-LABEL: _t2:
; STRESS: movzwl 6([[BASE:[^)]+]]), %eax
; STRESS-NEXT: addl ([[BASE]]), %eax
; STRESS-NEXT: ret
;
; For the REGULAR heuristic, it is not profitable to slice chunks that are not
; adjacent in memory: here there is a hole at bytes #4-5.
; REGULAR-LABEL: _t2:
; REGULAR: shrq $48
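; Instead, REGULAR keeps the single 64-bit load and extracts the high 16 bits
; with a 48-bit right shift.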
define i32 @t2(ptr nocapture %out, i64 %out_start) {
  %arrayidx = getelementptr inbounds %class.Complex, ptr %out, i64 %out_start
  %chunk64 = load i64, ptr %arrayidx, align 8
  %slice32_low = trunc i64 %chunk64 to i32
  %shift48 = lshr i64 %chunk64, 48
  %slice32_high = trunc i64 %shift48 to i32
  %res = add i32 %slice32_high, %slice32_low
  ret i32 %res
}

; Check that we do not optimize overlapping slices.
;
; The 64-bit load should NOT have been split, because the slices overlap.
; First slice uses bytes numbered 0 to 3.
; Second slice uses bytes numbered 6 and 7.
; Third slice uses bytes numbered 4 to 7.
;
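; A plausible C++ source for @t3 (a sketch reconstructed from the IR below,
; reusing the Complex class from the sketch above; names are assumptions):
;   unsigned t3(Complex *out, long out_start) {
;     unsigned long chunk = *(unsigned long *)&out[out_start];
;     return (unsigned)chunk + (unsigned)(chunk >> 48) + (unsigned)(chunk >> 32);
;   }
;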
; STRESS-LABEL: _t3:
; STRESS: shrq $48
; STRESS: shrq $32
;
; REGULAR-LABEL: _t3:
; REGULAR: shrq $48
; REGULAR: shrq $32
define i32 @t3(ptr nocapture %out, i64 %out_start) {
  %arrayidx = getelementptr inbounds %class.Complex, ptr %out, i64 %out_start
  %chunk64 = load i64, ptr %arrayidx, align 8
  %slice32_low = trunc i64 %chunk64 to i32
  %shift48 = lshr i64 %chunk64, 48
  %slice32_high = trunc i64 %shift48 to i32
  %shift32 = lshr i64 %chunk64, 32
  %slice32_lowhigh = trunc i64 %shift32 to i32
  %tmpres = add i32 %slice32_high, %slice32_low
  %res = add i32 %slice32_lowhigh, %tmpres
  ret i32 %res
}