xref: /llvm-project/llvm/test/CodeGen/ARM/ParallelDSP/squaring.ll (revision bed1c7f061aa12417aa081e334afdba45767b938)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -mtriple=armv8-a-linux-gnueabihf -arm-parallel-dsp -dce --verify %s -S -o - | FileCheck %s
3
; Test @a: a single straight-line block (for.end) followed by a block that
; branches to itself (for.cond23).
; The input sign-extends i16 values loaded from @a at element offsets 0..4,
; squares elements 0 and 1, multiplies element 2 by element 1, and sums these
; together with element 3 into a stack slot.
; The CHECK lines below mirror the input instruction for instruction, i.e. the
; expected output contains no call to llvm.arm.smlad.
; NOTE(review): presumably a regression test for multiplies whose two operands
; are the same value -- confirm against the commit that introduced it.
4define dso_local void @a() align 2 {
5; CHECK-LABEL: @a(
6; CHECK-NEXT:  for.end:
7; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
8; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr @a, align 2
9; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
10; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV]], [[CONV]]
11; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr getelementptr (i16, ptr @a, i32 1), align 2
12; CHECK-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP1]] to i32
13; CHECK-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[CONV3]], [[CONV3]]
14; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[MUL6]], [[MUL]]
15; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr getelementptr (i16, ptr @a, i32 2), align 2
16; CHECK-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP2]] to i32
17; CHECK-NEXT:    [[MUL12:%.*]] = mul nsw i32 [[CONV11]], [[CONV3]]
18; CHECK-NEXT:    [[ADD14:%.*]] = add nsw i32 [[MUL12]], [[ADD]]
19; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr getelementptr (i16, ptr @a, i32 3), align 2
20; CHECK-NEXT:    [[CONV17:%.*]] = sext i16 [[TMP3]] to i32
21; CHECK-NEXT:    [[ADD19:%.*]] = add nsw i32 [[ADD14]], [[CONV17]]
22; CHECK-NEXT:    store i32 [[ADD19]], ptr [[B]], align 4
23; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr getelementptr (i16, ptr @a, i32 4), align 2
24; CHECK-NEXT:    [[CONV21:%.*]] = sext i16 [[TMP4]] to i32
25; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[CONV21]]
26; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr inbounds i32, ptr [[ADD_PTR]], i32 9
27; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX22]], align 4
28; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[TMP5]], 1
29; CHECK-NEXT:    store i32 [[SHL]], ptr [[ARRAYIDX22]], align 4
30; CHECK-NEXT:    br label [[FOR_COND23:%.*]]
31; CHECK:       for.cond23:
32; CHECK-NEXT:    br label [[FOR_COND23]]
33;
34for.end:
35  %b = alloca i32, align 4
; Element 0 multiplied by itself.
36  %0 = load i16, ptr @a, align 2
37  %conv = sext i16 %0 to i32
38  %mul = mul nsw i32 %conv, %conv
; Element 1 multiplied by itself.
39  %1 = load i16, ptr getelementptr (i16, ptr @a, i32 1), align 2
40  %conv3 = sext i16 %1 to i32
41  %mul6 = mul nsw i32 %conv3, %conv3
42  %add = add nuw nsw i32 %mul6, %mul
; Element 2 multiplied by element 1 (reuses %conv3).
43  %2 = load i16, ptr getelementptr (i16, ptr @a, i32 2), align 2
44  %conv11 = sext i16 %2 to i32
45  %mul12 = mul nsw i32 %conv11, %conv3
46  %add14 = add nsw i32 %mul12, %add
; Element 3 is added without a multiply; the running sum is stored to %b.
47  %3 = load i16, ptr getelementptr (i16, ptr @a, i32 3), align 2
48  %conv17 = sext i16 %3 to i32
49  %add19 = add nsw i32 %add14, %conv17
50  store i32 %add19, ptr %b, align 4
; Element 4 indexes off %b; the i32 at that address (+9 elements) is doubled in place.
51  %4 = load i16, ptr getelementptr (i16, ptr @a, i32 4), align 2
52  %conv21 = sext i16 %4 to i32
53  %add.ptr = getelementptr inbounds i32, ptr %b, i32 %conv21
54  %arrayidx22 = getelementptr inbounds i32, ptr %add.ptr, i32 9
55  %5 = load i32, ptr %arrayidx22, align 4
56  %shl = shl i32 %5, 1
57  store i32 %shl, ptr %arrayidx22, align 4
58  br label %for.cond23
59
60for.cond23:                                       ; preds = %for.cond23, %for.end
61  br label %for.cond23
62}
63
; Test @accumulate_square_a0: the element-0 product is a square of a[0]
; (%sext.a.0 * %sext.a.0), while elements 1 and 2 are a[i] * b[i] products.
; Expected output (see CHECK lines):
;   - the i16 loads at %addr.a.1 / %addr.b.1 become i32 loads with align 2,
;   - one llvm.arm.smlad call takes those two wide loads,
;   - its accumulator operand is mul.1 + (mul.0 + acc); the a[0] square stays
;     a scalar mul.
; %mul.1 feeds both %add and %add.1 in the input, so it is summed twice.
; %ld.b.0 / %sext.b.0 are unused in the input and do not appear in the CHECK
; lines (the RUN line also runs -dce).
64define i32 @accumulate_square_a0(ptr %a, ptr %b, i32 %acc) {
65; CHECK-LABEL: @accumulate_square_a0(
66; CHECK-NEXT:  entry:
67; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
68; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
69; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]]
70; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
71; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ADDR_A_1]], align 2
72; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
73; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
74; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ADDR_B_1]], align 2
75; CHECK-NEXT:    [[TMP6:%.*]] = trunc i32 [[TMP5]] to i16
76; CHECK-NEXT:    [[TMP7:%.*]] = sext i16 [[TMP6]] to i32
77; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[SEXT_A_0]], [[SEXT_A_0]]
78; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[MUL_0]], [[ACC:%.*]]
79; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP3]], [[TMP7]]
80; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[MUL_1]], [[TMP8]]
81; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP5]], i32 [[TMP9]])
82; CHECK-NEXT:    ret i32 [[TMP10]]
83;
84entry:
85  %addr.a.1 = getelementptr i16, ptr %a, i32 1
86  %addr.b.1 = getelementptr i16, ptr %b, i32 1
87  %ld.a.0 = load i16, ptr %a
88  %sext.a.0 = sext i16 %ld.a.0 to i32
89  %ld.b.0 = load i16, ptr %b
90  %ld.a.1 = load i16, ptr %addr.a.1
91  %ld.b.1 = load i16, ptr %addr.b.1
92  %sext.a.1 = sext i16 %ld.a.1 to i32
93  %sext.b.1 = sext i16 %ld.b.1 to i32
94  %sext.b.0 = sext i16 %ld.b.0 to i32
; Square of a[0]: both operands are the same value.
95  %mul.0 = mul i32 %sext.a.0, %sext.a.0
96  %mul.1 = mul i32 %sext.a.1, %sext.b.1
97  %addr.a.2 = getelementptr i16, ptr %a, i32 2
98  %addr.b.2 = getelementptr i16, ptr %b, i32 2
99  %ld.a.2 = load i16, ptr %addr.a.2
100  %ld.b.2 = load i16, ptr %addr.b.2
101  %sext.a.2 = sext i16 %ld.a.2 to i32
102  %sext.b.2 = sext i16 %ld.b.2 to i32
103  %mul.2 = mul i32 %sext.a.2, %sext.b.2
; Reduction: (mul.0 + mul.1) + (mul.1 + mul.2) + acc.
104  %add = add i32 %mul.0, %mul.1
105  %add.1 = add i32 %mul.1, %mul.2
106  %add.2 = add i32 %add.1, %add
107  %res = add i32 %add.2, %acc
108  ret i32 %res
109}
110
; Test @accumulate_square_a2: elements 0 and 1 are a[i] * b[i] products, while
; the element-2 product is a square of a[2] (%sext.a.2 * %sext.a.2).
; Expected output (see CHECK lines):
;   - i32 loads with align 2 directly from %a and %b,
;   - mul.1 recomputed from the upper halves of those loads (lshr 16, trunc,
;     sext),
;   - one llvm.arm.smlad call on the two wide loads, with accumulator
;     mul.1 + (mul.2 + acc); the a[2] square stays a scalar mul,
;   - the final add of %sext.b.2 applied to the smlad result.
; %mul.1 feeds both %add and %add.1 in the input, so it is summed twice.
111define i32 @accumulate_square_a2(ptr %a, ptr %b, i32 %acc) {
112; CHECK-LABEL: @accumulate_square_a2(
113; CHECK-NEXT:  entry:
114; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A:%.*]], align 2
115; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
116; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
117; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP3]] to i32
118; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[B:%.*]], align 2
119; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
120; CHECK-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16
121; CHECK-NEXT:    [[TMP9:%.*]] = sext i16 [[TMP8]] to i32
122; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP4]], [[TMP9]]
123; CHECK-NEXT:    [[ADDR_A_2:%.*]] = getelementptr i16, ptr [[A]], i32 2
124; CHECK-NEXT:    [[ADDR_B_2:%.*]] = getelementptr i16, ptr [[B]], i32 2
125; CHECK-NEXT:    [[LD_A_2:%.*]] = load i16, ptr [[ADDR_A_2]]
126; CHECK-NEXT:    [[LD_B_2:%.*]] = load i16, ptr [[ADDR_B_2]]
127; CHECK-NEXT:    [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
128; CHECK-NEXT:    [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32
129; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[SEXT_A_2]], [[SEXT_A_2]]
130; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[MUL_2]], [[ACC:%.*]]
131; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[MUL_1]], [[TMP10]]
132; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP6]], i32 [[TMP11]])
133; CHECK-NEXT:    [[RES:%.*]] = add i32 [[TMP12]], [[SEXT_B_2]]
134; CHECK-NEXT:    ret i32 [[RES]]
135;
136entry:
137  %addr.a.1 = getelementptr i16, ptr %a, i32 1
138  %addr.b.1 = getelementptr i16, ptr %b, i32 1
139  %ld.a.0 = load i16, ptr %a
140  %sext.a.0 = sext i16 %ld.a.0 to i32
141  %ld.b.0 = load i16, ptr %b
142  %ld.a.1 = load i16, ptr %addr.a.1
143  %ld.b.1 = load i16, ptr %addr.b.1
144  %sext.a.1 = sext i16 %ld.a.1 to i32
145  %sext.b.1 = sext i16 %ld.b.1 to i32
146  %sext.b.0 = sext i16 %ld.b.0 to i32
147  %mul.0 = mul i32 %sext.a.0, %sext.b.0
148  %mul.1 = mul i32 %sext.a.1, %sext.b.1
149  %addr.a.2 = getelementptr i16, ptr %a, i32 2
150  %addr.b.2 = getelementptr i16, ptr %b, i32 2
151  %ld.a.2 = load i16, ptr %addr.a.2
152  %ld.b.2 = load i16, ptr %addr.b.2
153  %sext.a.2 = sext i16 %ld.a.2 to i32
154  %sext.b.2 = sext i16 %ld.b.2 to i32
; Square of a[2]: both operands are the same value.
155  %mul.2 = mul i32 %sext.a.2, %sext.a.2
; Reduction: (mul.0 + mul.1) + (mul.1 + mul.2) + acc + sext(b[2]).
156  %add = add i32 %mul.0, %mul.1
157  %add.1 = add i32 %mul.1, %mul.2
158  %add.2 = add i32 %add.1, %add
159  %add.3 = add i32 %add.2, %acc
160  %res = add i32 %add.3, %sext.b.2
161  ret i32 %res
162}
163
; Test @accumulate_square_b2: the element-0 product is a square of a[0], the
; element-1 product is a[1] * b[1], and the element-2 product is a square of
; b[2] (%sext.b.2 * %sext.b.2).
; Expected output (see CHECK lines): no llvm.arm.smlad call; the scalar
; mul/add chain of the input is retained.
; %ld.b.0 / %sext.b.0 and the a[2] chain (%addr.a.2, %ld.a.2, %sext.a.2,
; %add.3) are unused in the input and do not appear in the CHECK lines (the
; RUN line also runs -dce).
; NOTE(review): %res is computed from %add.2, which leaves %add.3 dead. This
; may be unintentional (compare @accumulate_square_a2, where %res uses
; %add.3) -- confirm before changing, since the CHECK lines match the input
; as written.
164define i32 @accumulate_square_b2(ptr %a, ptr %b, i32 %acc) {
165; CHECK-LABEL: @accumulate_square_b2(
166; CHECK-NEXT:  entry:
167; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
168; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
169; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]]
170; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
171; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]]
172; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]]
173; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
174; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
175; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[SEXT_A_0]], [[SEXT_A_0]]
176; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[SEXT_A_1]], [[SEXT_B_1]]
177; CHECK-NEXT:    [[ADDR_B_2:%.*]] = getelementptr i16, ptr [[B]], i32 2
178; CHECK-NEXT:    [[LD_B_2:%.*]] = load i16, ptr [[ADDR_B_2]]
179; CHECK-NEXT:    [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32
180; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[SEXT_B_2]], [[SEXT_B_2]]
181; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
182; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[MUL_1]], [[MUL_2]]
183; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_1]], [[ADD]]
184; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD_2]], [[ACC:%.*]]
185; CHECK-NEXT:    ret i32 [[RES]]
186;
187entry:
188  %addr.a.1 = getelementptr i16, ptr %a, i32 1
189  %addr.b.1 = getelementptr i16, ptr %b, i32 1
190  %ld.a.0 = load i16, ptr %a
191  %sext.a.0 = sext i16 %ld.a.0 to i32
192  %ld.b.0 = load i16, ptr %b
193  %ld.a.1 = load i16, ptr %addr.a.1
194  %ld.b.1 = load i16, ptr %addr.b.1
195  %sext.a.1 = sext i16 %ld.a.1 to i32
196  %sext.b.1 = sext i16 %ld.b.1 to i32
197  %sext.b.0 = sext i16 %ld.b.0 to i32
; Square of a[0]: both operands are the same value.
198  %mul.0 = mul i32 %sext.a.0, %sext.a.0
199  %mul.1 = mul i32 %sext.a.1, %sext.b.1
200  %addr.a.2 = getelementptr i16, ptr %a, i32 2
201  %addr.b.2 = getelementptr i16, ptr %b, i32 2
202  %ld.a.2 = load i16, ptr %addr.a.2
203  %ld.b.2 = load i16, ptr %addr.b.2
204  %sext.a.2 = sext i16 %ld.a.2 to i32
205  %sext.b.2 = sext i16 %ld.b.2 to i32
; Square of b[2]: both operands are the same value.
206  %mul.2 = mul i32 %sext.b.2, %sext.b.2
207  %add = add i32 %mul.0, %mul.1
208  %add.1 = add i32 %mul.1, %mul.2
209  %add.2 = add i32 %add.1, %add
; %add.3 has no users: the return value below is built from %add.2.
210  %add.3 = add i32 %add.2, %sext.a.2
211  %res = add i32 %add.2, %acc
212  ret i32 %res
213}
214
; Test @accumulate_square_a1: the element-0 and element-1 products are squares
; of a[0] and a[1]; only the element-2 product is a[2] * b[2].
; Expected output (see CHECK lines): no llvm.arm.smlad call; the scalar
; mul/add chain of the input is retained.
; %ld.b.0 / %sext.b.0 are unused in the input and do not appear in the CHECK
; lines (the RUN line also runs -dce).
215define i32 @accumulate_square_a1(ptr %a, ptr %b, i32 %acc) {
216; CHECK-LABEL: @accumulate_square_a1(
217; CHECK-NEXT:  entry:
218; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
219; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
220; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]]
221; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
222; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]]
223; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]]
224; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
225; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
226; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[SEXT_A_0]], [[SEXT_A_0]]
227; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[SEXT_A_1]], [[SEXT_A_1]]
228; CHECK-NEXT:    [[ADDR_A_2:%.*]] = getelementptr i16, ptr [[A]], i32 2
229; CHECK-NEXT:    [[ADDR_B_2:%.*]] = getelementptr i16, ptr [[B]], i32 2
230; CHECK-NEXT:    [[LD_A_2:%.*]] = load i16, ptr [[ADDR_A_2]]
231; CHECK-NEXT:    [[LD_B_2:%.*]] = load i16, ptr [[ADDR_B_2]]
232; CHECK-NEXT:    [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
233; CHECK-NEXT:    [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32
234; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[SEXT_A_2]], [[SEXT_B_2]]
235; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_1]], [[SEXT_B_1]]
236; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[MUL_0]], [[ADD]]
237; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[MUL_1]], [[MUL_2]]
238; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 [[ADD_2]], [[ADD_1]]
239; CHECK-NEXT:    [[ADD_4:%.*]] = add i32 [[ADD_3]], [[SEXT_A_2]]
240; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD_4]], [[ACC:%.*]]
241; CHECK-NEXT:    ret i32 [[RES]]
242;
243entry:
244  %addr.a.1 = getelementptr i16, ptr %a, i32 1
245  %addr.b.1 = getelementptr i16, ptr %b, i32 1
246  %ld.a.0 = load i16, ptr %a
247  %sext.a.0 = sext i16 %ld.a.0 to i32
248  %ld.b.0 = load i16, ptr %b
249  %ld.a.1 = load i16, ptr %addr.a.1
250  %ld.b.1 = load i16, ptr %addr.b.1
251  %sext.a.1 = sext i16 %ld.a.1 to i32
252  %sext.b.1 = sext i16 %ld.b.1 to i32
253  %sext.b.0 = sext i16 %ld.b.0 to i32
; Squares of a[0] and a[1]: each mul has the same value for both operands.
254  %mul.0 = mul i32 %sext.a.0, %sext.a.0
255  %mul.1 = mul i32 %sext.a.1, %sext.a.1
256  %addr.a.2 = getelementptr i16, ptr %a, i32 2
257  %addr.b.2 = getelementptr i16, ptr %b, i32 2
258  %ld.a.2 = load i16, ptr %addr.a.2
259  %ld.b.2 = load i16, ptr %addr.b.2
260  %sext.a.2 = sext i16 %ld.a.2 to i32
261  %sext.b.2 = sext i16 %ld.b.2 to i32
262  %mul.2 = mul i32 %sext.a.2, %sext.b.2
; b[1] and a[2] enter the sum as plain sign-extended values, not via a mul.
263  %add = add i32 %mul.1, %sext.b.1
264  %add.1 = add i32 %mul.0, %add
265  %add.2 = add i32 %mul.1, %mul.2
266  %add.3 = add i32 %add.2, %add.1
267  %add.4 = add i32 %add.3, %sext.a.2
268  %res = add i32 %add.4, %acc
269  ret i32 %res
270}
271