xref: /llvm-project/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll (revision 701890164d567866900f3087ffd2ad4da963111c)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
3
; exchange_1: computes a[0]*b[1] + a[1]*b[0] + acc on sign-extended i16
; elements. The assertions below expect each pointer's pair of i16 loads to
; gain a single i32 load, and the returned value to be the result of one
; llvm.arm.smladx call taking (load of a, load of b, acc). The original
; mul/add chain is still listed in the expected output, but the return no
; longer uses it.
4define i32 @exchange_1(ptr %a, ptr %b, i32 %acc) {
5; CHECK-LABEL: @exchange_1(
6; CHECK-NEXT:  entry:
7; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
8; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
9; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
10; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
11; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
12; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
13; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
14; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
15; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
16; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
17; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
18; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
19; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
20; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]])
21; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
22; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
23; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
24; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
25; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
26; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
27; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
28; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
29; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
30; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP14]]
31; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP11]]
32; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
33; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD]], [[ACC]]
34; CHECK-NEXT:    ret i32 [[TMP10]]
35;
36entry:
37  %addr.a.1 = getelementptr i16, ptr %a, i32 1
38  %addr.b.1 = getelementptr i16, ptr %b, i32 1
39  %ld.a.0 = load i16, ptr %a
40  %sext.a.0 = sext i16 %ld.a.0 to i32
41  %ld.b.0 = load i16, ptr %b
42  %ld.a.1 = load i16, ptr %addr.a.1
43  %ld.b.1 = load i16, ptr %addr.b.1
44  %sext.a.1 = sext i16 %ld.a.1 to i32
45  %sext.b.1 = sext i16 %ld.b.1 to i32
46  %sext.b.0 = sext i16 %ld.b.0 to i32
; Cross pairing: element 0 of a with element 1 of b, and element 1 of a with
; element 0 of b.
47  %mul.0 = mul i32 %sext.a.0, %sext.b.1
48  %mul.1 = mul i32 %sext.a.1, %sext.b.0
49  %add = add i32 %mul.0, %mul.1
50  %res = add i32 %add, %acc
51  ret i32 %res
52}
53
; exchange_2: same cross-paired sum as exchange_1 (a[0]*b[1] + a[1]*b[0] + acc)
; but each mul lists the b operand first. The assertions below still expect a
; single llvm.arm.smladx call taking (load of a, load of b, acc) whose result
; is returned.
54define i32 @exchange_2(ptr %a, ptr %b, i32 %acc) {
55; CHECK-LABEL: @exchange_2(
56; CHECK-NEXT:  entry:
57; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
58; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
59; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
60; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
61; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
62; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
63; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
64; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
65; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
66; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
67; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
68; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
69; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
70; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]])
71; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
72; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
73; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
74; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
75; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
76; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
77; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
78; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
79; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
80; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP14]], [[TMP3]]
81; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP11]], [[TMP6]]
82; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
83; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD]], [[ACC]]
84; CHECK-NEXT:    ret i32 [[TMP10]]
85;
86entry:
87  %addr.a.1 = getelementptr i16, ptr %a, i32 1
88  %addr.b.1 = getelementptr i16, ptr %b, i32 1
89  %ld.a.0 = load i16, ptr %a
90  %sext.a.0 = sext i16 %ld.a.0 to i32
91  %ld.b.0 = load i16, ptr %b
92  %ld.a.1 = load i16, ptr %addr.a.1
93  %ld.b.1 = load i16, ptr %addr.b.1
94  %sext.a.1 = sext i16 %ld.a.1 to i32
95  %sext.b.1 = sext i16 %ld.b.1 to i32
96  %sext.b.0 = sext i16 %ld.b.0 to i32
; Cross pairing with the b operand written first in each mul.
97  %mul.0 = mul i32 %sext.b.1, %sext.a.0
98  %mul.1 = mul i32 %sext.b.0, %sext.a.1
99  %add = add i32 %mul.0, %mul.1
100  %res = add i32 %add, %acc
101  ret i32 %res
102}
103
; exchange_3: same cross-paired products as exchange_1, but the add sums them
; in the opposite order (%mul.1 + %mul.0). The assertions below expect one
; llvm.arm.smladx call whose first operand is the i32 load of b and whose
; second is the i32 load of a; its result is returned.
104define i32 @exchange_3(ptr %a, ptr %b, i32 %acc) {
105; CHECK-LABEL: @exchange_3(
106; CHECK-NEXT:  entry:
107; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
108; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
109; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
110; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
111; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
112; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
113; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
114; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
115; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
116; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
117; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
118; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
119; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
120; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP8]], i32 [[TMP1]], i32 [[ACC:%.*]])
121; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
122; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
123; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
124; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
125; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
126; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
127; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
128; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
129; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
130; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP14]]
131; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP11]]
132; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_1]], [[MUL_0]]
133; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD]], [[ACC]]
134; CHECK-NEXT:    ret i32 [[TMP10]]
135;
136entry:
137  %addr.a.1 = getelementptr i16, ptr %a, i32 1
138  %addr.b.1 = getelementptr i16, ptr %b, i32 1
139  %ld.a.0 = load i16, ptr %a
140  %sext.a.0 = sext i16 %ld.a.0 to i32
141  %ld.b.0 = load i16, ptr %b
142  %ld.a.1 = load i16, ptr %addr.a.1
143  %ld.b.1 = load i16, ptr %addr.b.1
144  %sext.a.1 = sext i16 %ld.a.1 to i32
145  %sext.b.1 = sext i16 %ld.b.1 to i32
146  %sext.b.0 = sext i16 %ld.b.0 to i32
147  %mul.0 = mul i32 %sext.a.0, %sext.b.1
148  %mul.1 = mul i32 %sext.a.1, %sext.b.0
; The two products are summed with %mul.1 as the first operand.
149  %add = add i32 %mul.1, %mul.0
150  %res = add i32 %add, %acc
151  ret i32 %res
152}
153
; exchange_4: combines the variations of exchange_2 and exchange_3 - each mul
; lists the b operand first and the add sums %mul.1 + %mul.0. The assertions
; below expect one llvm.arm.smladx call taking (load of b, load of a, acc)
; whose result is returned.
154define i32 @exchange_4(ptr %a, ptr %b, i32 %acc) {
155; CHECK-LABEL: @exchange_4(
156; CHECK-NEXT:  entry:
157; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
158; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
159; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
160; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
161; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
162; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
163; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
164; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
165; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
166; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
167; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
168; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
169; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
170; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP8]], i32 [[TMP1]], i32 [[ACC:%.*]])
171; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
172; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
173; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
174; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
175; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
176; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
177; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
178; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
179; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
180; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP14]], [[TMP3]]
181; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP11]], [[TMP6]]
182; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_1]], [[MUL_0]]
183; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD]], [[ACC]]
184; CHECK-NEXT:    ret i32 [[TMP10]]
185;
186entry:
187  %addr.a.1 = getelementptr i16, ptr %a, i32 1
188  %addr.b.1 = getelementptr i16, ptr %b, i32 1
189  %ld.a.0 = load i16, ptr %a
190  %sext.a.0 = sext i16 %ld.a.0 to i32
191  %ld.b.0 = load i16, ptr %b
192  %ld.a.1 = load i16, ptr %addr.a.1
193  %ld.b.1 = load i16, ptr %addr.b.1
194  %sext.a.1 = sext i16 %ld.a.1 to i32
195  %sext.b.1 = sext i16 %ld.b.1 to i32
196  %sext.b.0 = sext i16 %ld.b.0 to i32
; b operand first in each mul, and %mul.1 first in the add.
197  %mul.0 = mul i32 %sext.b.1, %sext.a.0
198  %mul.1 = mul i32 %sext.b.0, %sext.a.1
199  %add = add i32 %mul.1, %mul.0
200  %res = add i32 %add, %acc
201  ret i32 %res
202}
203
; exchange_multi_use_1: the sign-extended b[0] and b[1] values are each used
; by two multiplies. The first pair is cross-matched (a[0]*b[1] + a[1]*b[0]);
; the second pair is a[3]*b[1] + a[2]*b[0]. The assertions below expect an
; llvm.arm.smladx call on (load of a, load of b, acc) whose result feeds the
; accumulator of an llvm.arm.smlad call on (load of a+2, load of b); the
; smlad result is returned.
204define i32 @exchange_multi_use_1(ptr %a, ptr %b, i32 %acc) {
205; CHECK-LABEL: @exchange_multi_use_1(
206; CHECK-NEXT:  entry:
207; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
208; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
209; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
210; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
211; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
212; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
213; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
214; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
215; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
216; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
217; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
218; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
219; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
220; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]])
221; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
222; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
223; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
224; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
225; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
226; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
227; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
228; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
229; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
230; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP14]]
231; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP11]]
232; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
233; CHECK-NEXT:    [[ADDR_A_2:%.*]] = getelementptr i16, ptr [[A]], i32 2
234; CHECK-NEXT:    [[ADDR_A_3:%.*]] = getelementptr i16, ptr [[A]], i32 3
235; CHECK-NEXT:    [[LD_A_2:%.*]] = load i16, ptr [[ADDR_A_2]], align 2
236; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ADDR_A_2]], align 2
237; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
238; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP16]], i32 [[TMP8]], i32 [[TMP10]])
239; CHECK-NEXT:    [[TMP19:%.*]] = sext i16 [[TMP17]] to i32
240; CHECK-NEXT:    [[TMP20:%.*]] = lshr i32 [[TMP16]], 16
241; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
242; CHECK-NEXT:    [[TMP22:%.*]] = sext i16 [[TMP21]] to i32
243; CHECK-NEXT:    [[LD_A_3:%.*]] = load i16, ptr [[ADDR_A_3]], align 2
244; CHECK-NEXT:    [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
245; CHECK-NEXT:    [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32
246; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[TMP22]], [[TMP14]]
247; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[TMP19]], [[TMP11]]
248; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]]
249; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD]], [[ADD_1]]
250; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD_2]], [[ACC]]
251; CHECK-NEXT:    ret i32 [[TMP18]]
252;
253entry:
254  %addr.a.1 = getelementptr i16, ptr %a, i32 1
255  %addr.b.1 = getelementptr i16, ptr %b, i32 1
256  %ld.a.0 = load i16, ptr %a
257  %sext.a.0 = sext i16 %ld.a.0 to i32
258  %ld.b.0 = load i16, ptr %b
259  %ld.a.1 = load i16, ptr %addr.a.1
260  %ld.b.1 = load i16, ptr %addr.b.1
261  %sext.a.1 = sext i16 %ld.a.1 to i32
262  %sext.b.1 = sext i16 %ld.b.1 to i32
263  %sext.b.0 = sext i16 %ld.b.0 to i32
; First pair: cross-matched elements of a[0..1] and b[0..1].
264  %mul.0 = mul i32 %sext.a.0, %sext.b.1
265  %mul.1 = mul i32 %sext.a.1, %sext.b.0
266  %add = add i32 %mul.0, %mul.1
267  %addr.a.2 = getelementptr i16, ptr %a, i32 2
268  %addr.a.3 = getelementptr i16, ptr %a, i32 3
269  %ld.a.2 = load i16, ptr %addr.a.2
270  %ld.a.3 = load i16, ptr %addr.a.3
271  %sext.a.2 = sext i16 %ld.a.2 to i32
272  %sext.a.3 = sext i16 %ld.a.3 to i32
; Second pair reuses %sext.b.1 and %sext.b.0: a[3]*b[1] and a[2]*b[0].
273  %mul.2 = mul i32 %sext.a.3, %sext.b.1
274  %mul.3 = mul i32 %sext.a.2, %sext.b.0
275  %add.1 = add i32 %mul.2, %mul.3
276  %add.2 = add i32 %add, %add.1
277  %res = add i32 %add.2, %acc
278  ret i32 %res
279}
280
; exchange_multi_use_64_1: same two multiply pairs as exchange_multi_use_1,
; but the combined i32 sum is sign-extended to i64 before being added to an
; i64 accumulator. The assertions below expect an llvm.arm.smlaldx call on
; (load of a, load of b, acc) feeding the accumulator of an llvm.arm.smlald
; call on (load of a+2, load of b); the smlald result is returned.
281define i64 @exchange_multi_use_64_1(ptr %a, ptr %b, i64 %acc) {
282; CHECK-LABEL: @exchange_multi_use_64_1(
283; CHECK-NEXT:  entry:
284; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
285; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
286; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
287; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
288; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
289; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
290; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
291; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
292; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
293; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
294; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
295; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
296; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
297; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[TMP1]], i32 [[TMP8]], i64 [[ACC:%.*]])
298; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
299; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
300; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
301; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
302; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
303; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
304; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
305; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
306; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
307; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP14]]
308; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP11]]
309; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
310; CHECK-NEXT:    [[ADDR_A_2:%.*]] = getelementptr i16, ptr [[A]], i32 2
311; CHECK-NEXT:    [[ADDR_A_3:%.*]] = getelementptr i16, ptr [[A]], i32 3
312; CHECK-NEXT:    [[LD_A_2:%.*]] = load i16, ptr [[ADDR_A_2]], align 2
313; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ADDR_A_2]], align 2
314; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
315; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP16]], i32 [[TMP8]], i64 [[TMP10]])
316; CHECK-NEXT:    [[TMP19:%.*]] = sext i16 [[TMP17]] to i32
317; CHECK-NEXT:    [[TMP20:%.*]] = lshr i32 [[TMP16]], 16
318; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
319; CHECK-NEXT:    [[TMP22:%.*]] = sext i16 [[TMP21]] to i32
320; CHECK-NEXT:    [[LD_A_3:%.*]] = load i16, ptr [[ADDR_A_3]], align 2
321; CHECK-NEXT:    [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
322; CHECK-NEXT:    [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32
323; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[TMP22]], [[TMP14]]
324; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[TMP19]], [[TMP11]]
325; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]]
326; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD]], [[ADD_1]]
327; CHECK-NEXT:    [[SEXT_ADD_2:%.*]] = sext i32 [[ADD_2]] to i64
328; CHECK-NEXT:    [[RES:%.*]] = add i64 [[SEXT_ADD_2]], [[ACC]]
329; CHECK-NEXT:    ret i64 [[TMP18]]
330;
331entry:
332  %addr.a.1 = getelementptr i16, ptr %a, i32 1
333  %addr.b.1 = getelementptr i16, ptr %b, i32 1
334  %ld.a.0 = load i16, ptr %a
335  %sext.a.0 = sext i16 %ld.a.0 to i32
336  %ld.b.0 = load i16, ptr %b
337  %ld.a.1 = load i16, ptr %addr.a.1
338  %ld.b.1 = load i16, ptr %addr.b.1
339  %sext.a.1 = sext i16 %ld.a.1 to i32
340  %sext.b.1 = sext i16 %ld.b.1 to i32
341  %sext.b.0 = sext i16 %ld.b.0 to i32
342  %mul.0 = mul i32 %sext.a.0, %sext.b.1
343  %mul.1 = mul i32 %sext.a.1, %sext.b.0
344  %add = add i32 %mul.0, %mul.1
345  %addr.a.2 = getelementptr i16, ptr %a, i32 2
346  %addr.a.3 = getelementptr i16, ptr %a, i32 3
347  %ld.a.2 = load i16, ptr %addr.a.2
348  %ld.a.3 = load i16, ptr %addr.a.3
349  %sext.a.2 = sext i16 %ld.a.2 to i32
350  %sext.a.3 = sext i16 %ld.a.3 to i32
351  %mul.2 = mul i32 %sext.a.3, %sext.b.1
352  %mul.3 = mul i32 %sext.a.2, %sext.b.0
353  %add.1 = add i32 %mul.2, %mul.3
354  %add.2 = add i32 %add, %add.1
; The two pair sums are combined in i32 and widened once before the i64 add.
355  %sext.add.2 = sext i32 %add.2 to i64
356  %res = add i64 %sext.add.2, %acc
357  ret i64 %res
358}
359
; exchange_multi_use_64_2: variant of exchange_multi_use_64_1 in which each
; pair sum is sign-extended to i64 separately and the two i64 values are then
; added together and to acc. The assertions below expect the same
; llvm.arm.smlaldx -> llvm.arm.smlald chain, with the smlald result returned.
360define i64 @exchange_multi_use_64_2(ptr %a, ptr %b, i64 %acc) {
361; CHECK-LABEL: @exchange_multi_use_64_2(
362; CHECK-NEXT:  entry:
363; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
364; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
365; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
366; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
367; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
368; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
369; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
370; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
371; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
372; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
373; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
374; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
375; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
376; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[TMP1]], i32 [[TMP8]], i64 [[ACC:%.*]])
377; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
378; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
379; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
380; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
381; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
382; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
383; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
384; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
385; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
386; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP14]]
387; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP11]]
388; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
389; CHECK-NEXT:    [[SEXT_ADD:%.*]] = sext i32 [[ADD]] to i64
390; CHECK-NEXT:    [[ADDR_A_2:%.*]] = getelementptr i16, ptr [[A]], i32 2
391; CHECK-NEXT:    [[ADDR_A_3:%.*]] = getelementptr i16, ptr [[A]], i32 3
392; CHECK-NEXT:    [[LD_A_2:%.*]] = load i16, ptr [[ADDR_A_2]], align 2
393; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ADDR_A_2]], align 2
394; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
395; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP16]], i32 [[TMP8]], i64 [[TMP10]])
396; CHECK-NEXT:    [[TMP19:%.*]] = sext i16 [[TMP17]] to i32
397; CHECK-NEXT:    [[TMP20:%.*]] = lshr i32 [[TMP16]], 16
398; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
399; CHECK-NEXT:    [[TMP22:%.*]] = sext i16 [[TMP21]] to i32
400; CHECK-NEXT:    [[LD_A_3:%.*]] = load i16, ptr [[ADDR_A_3]], align 2
401; CHECK-NEXT:    [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
402; CHECK-NEXT:    [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32
403; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[TMP22]], [[TMP14]]
404; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[TMP19]], [[TMP11]]
405; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]]
406; CHECK-NEXT:    [[SEXT_ADD_1:%.*]] = sext i32 [[ADD_1]] to i64
407; CHECK-NEXT:    [[ADD_2:%.*]] = add i64 [[SEXT_ADD]], [[SEXT_ADD_1]]
408; CHECK-NEXT:    [[RES:%.*]] = add i64 [[ADD_2]], [[ACC]]
409; CHECK-NEXT:    ret i64 [[TMP18]]
410;
411entry:
412  %addr.a.1 = getelementptr i16, ptr %a, i32 1
413  %addr.b.1 = getelementptr i16, ptr %b, i32 1
414  %ld.a.0 = load i16, ptr %a
415  %sext.a.0 = sext i16 %ld.a.0 to i32
416  %ld.b.0 = load i16, ptr %b
417  %ld.a.1 = load i16, ptr %addr.a.1
418  %ld.b.1 = load i16, ptr %addr.b.1
419  %sext.a.1 = sext i16 %ld.a.1 to i32
420  %sext.b.1 = sext i16 %ld.b.1 to i32
421  %sext.b.0 = sext i16 %ld.b.0 to i32
422  %mul.0 = mul i32 %sext.a.0, %sext.b.1
423  %mul.1 = mul i32 %sext.a.1, %sext.b.0
424  %add = add i32 %mul.0, %mul.1
; The first pair sum is widened to i64 on its own.
425  %sext.add = sext i32 %add to i64
426  %addr.a.2 = getelementptr i16, ptr %a, i32 2
427  %addr.a.3 = getelementptr i16, ptr %a, i32 3
428  %ld.a.2 = load i16, ptr %addr.a.2
429  %ld.a.3 = load i16, ptr %addr.a.3
430  %sext.a.2 = sext i16 %ld.a.2 to i32
431  %sext.a.3 = sext i16 %ld.a.3 to i32
432  %mul.2 = mul i32 %sext.a.3, %sext.b.1
433  %mul.3 = mul i32 %sext.a.2, %sext.b.0
434  %add.1 = add i32 %mul.2, %mul.3
; The second pair sum is widened separately, then both are added in i64.
435  %sext.add.1 = sext i32 %add.1 to i64
436  %add.2 = add i64 %sext.add, %sext.add.1
437  %res = add i64 %add.2, %acc
438  ret i64 %res
439}
440
; exchange_multi_use_2: here the first pair is matched in order
; (a[0]*b[0] + a[1]*b[1]) and the second pair is cross-matched with the b
; operand first (b[0]*a[3] + b[1]*a[2]). The assertions below expect an
; llvm.arm.smlad call on (load of a, load of b, acc) feeding the accumulator
; of an llvm.arm.smladx call on (load of b, load of a+2); the smladx result is
; returned.
441define i32 @exchange_multi_use_2(ptr %a, ptr %b, i32 %acc) {
442; CHECK-LABEL: @exchange_multi_use_2(
443; CHECK-NEXT:  entry:
444; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
445; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
446; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
447; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
448; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
449; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
450; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
451; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
452; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
453; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
454; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
455; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
456; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
457; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]])
458; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
459; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
460; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
461; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
462; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
463; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
464; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
465; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
466; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
467; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP11]]
468; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP14]]
469; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
470; CHECK-NEXT:    [[ADDR_A_2:%.*]] = getelementptr i16, ptr [[A]], i32 2
471; CHECK-NEXT:    [[ADDR_A_3:%.*]] = getelementptr i16, ptr [[A]], i32 3
472; CHECK-NEXT:    [[LD_A_2:%.*]] = load i16, ptr [[ADDR_A_2]], align 2
473; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ADDR_A_2]], align 2
474; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
475; CHECK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP8]], i32 [[TMP16]], i32 [[TMP10]])
476; CHECK-NEXT:    [[TMP19:%.*]] = sext i16 [[TMP17]] to i32
477; CHECK-NEXT:    [[TMP20:%.*]] = lshr i32 [[TMP16]], 16
478; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
479; CHECK-NEXT:    [[TMP22:%.*]] = sext i16 [[TMP21]] to i32
480; CHECK-NEXT:    [[LD_A_3:%.*]] = load i16, ptr [[ADDR_A_3]], align 2
481; CHECK-NEXT:    [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
482; CHECK-NEXT:    [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32
483; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[TMP11]], [[TMP22]]
484; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[TMP14]], [[TMP19]]
485; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]]
486; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD]], [[ADD_1]]
487; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD_2]], [[ACC]]
488; CHECK-NEXT:    ret i32 [[TMP18]]
489;
490entry:
491  %addr.a.1 = getelementptr i16, ptr %a, i32 1
492  %addr.b.1 = getelementptr i16, ptr %b, i32 1
493  %ld.a.0 = load i16, ptr %a
494  %sext.a.0 = sext i16 %ld.a.0 to i32
495  %ld.b.0 = load i16, ptr %b
496  %ld.a.1 = load i16, ptr %addr.a.1
497  %ld.b.1 = load i16, ptr %addr.b.1
498  %sext.a.1 = sext i16 %ld.a.1 to i32
499  %sext.b.1 = sext i16 %ld.b.1 to i32
500  %sext.b.0 = sext i16 %ld.b.0 to i32
; First pair: elements matched in order (index 0 with 0, index 1 with 1).
501  %mul.0 = mul i32 %sext.a.0, %sext.b.0
502  %mul.1 = mul i32 %sext.a.1, %sext.b.1
503  %add = add i32 %mul.0, %mul.1
504  %addr.a.2 = getelementptr i16, ptr %a, i32 2
505  %addr.a.3 = getelementptr i16, ptr %a, i32 3
506  %ld.a.2 = load i16, ptr %addr.a.2
507  %ld.a.3 = load i16, ptr %addr.a.3
508  %sext.a.2 = sext i16 %ld.a.2 to i32
509  %sext.a.3 = sext i16 %ld.a.3 to i32
; Second pair: cross-matched, with the reused b values as the first operands.
510  %mul.2 = mul i32 %sext.b.0, %sext.a.3
511  %mul.3 = mul i32 %sext.b.1, %sext.a.2
512  %add.1 = add i32 %mul.2, %mul.3
513  %add.2 = add i32 %add, %add.1
514  %res = add i32 %add.2, %acc
515  ret i32 %res
516}
517
518; TODO: Why aren't two intrinsics generated?
; exchange_multi_use_3: the cross-matched pair sum (b[0]*a[3] + b[1]*a[2]) is
; subtracted from the in-order pair sum (a[0]*b[0] + a[1]*b[1]) and the
; difference is added to acc. The assertions below expect only one intrinsic:
; an llvm.arm.smladx call on (load of b, load of a+2) with a zero accumulator,
; which replaces %add.1 as the sub operand. The in-order pair stays as scalar
; multiplies and the original %res is still returned.
519define i32 @exchange_multi_use_3(ptr %a, ptr %b, i32 %acc) {
520; CHECK-LABEL: @exchange_multi_use_3(
521; CHECK-NEXT:  entry:
522; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
523; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
524; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
525; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
526; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
527; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B]], align 2
528; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
529; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
530; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
531; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
532; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
533; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
534; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
535; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
536; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
537; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
538; CHECK-NEXT:    [[ADDR_A_2:%.*]] = getelementptr i16, ptr [[A]], i32 2
539; CHECK-NEXT:    [[ADDR_A_3:%.*]] = getelementptr i16, ptr [[A]], i32 3
540; CHECK-NEXT:    [[LD_A_2:%.*]] = load i16, ptr [[ADDR_A_2]], align 2
541; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[ADDR_A_2]], align 2
542; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
543; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP1]], i32 [[TMP8]], i32 0)
544; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
545; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
546; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
547; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
548; CHECK-NEXT:    [[LD_A_3:%.*]] = load i16, ptr [[ADDR_A_3]], align 2
549; CHECK-NEXT:    [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
550; CHECK-NEXT:    [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32
551; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[TMP3]], [[TMP14]]
552; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[TMP6]], [[TMP11]]
553; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[SEXT_A_0]], [[TMP3]]
554; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[SEXT_A_1]], [[TMP6]]
555; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
556; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]]
557; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[ADD]], [[TMP10]]
558; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ACC:%.*]], [[SUB]]
559; CHECK-NEXT:    ret i32 [[RES]]
560;
561entry:
562  %addr.a.1 = getelementptr i16, ptr %a, i32 1
563  %addr.b.1 = getelementptr i16, ptr %b, i32 1
564  %ld.a.0 = load i16, ptr %a
565  %sext.a.0 = sext i16 %ld.a.0 to i32
566  %ld.b.0 = load i16, ptr %b
567  %ld.a.1 = load i16, ptr %addr.a.1
568  %ld.b.1 = load i16, ptr %addr.b.1
569  %sext.a.1 = sext i16 %ld.a.1 to i32
570  %sext.b.1 = sext i16 %ld.b.1 to i32
571  %sext.b.0 = sext i16 %ld.b.0 to i32
572  %addr.a.2 = getelementptr i16, ptr %a, i32 2
573  %addr.a.3 = getelementptr i16, ptr %a, i32 3
574  %ld.a.2 = load i16, ptr %addr.a.2
575  %ld.a.3 = load i16, ptr %addr.a.3
576  %sext.a.2 = sext i16 %ld.a.2 to i32
577  %sext.a.3 = sext i16 %ld.a.3 to i32
; Cross-matched pair (b[0]*a[3], b[1]*a[2]) is defined before the in-order pair.
578  %mul.2 = mul i32 %sext.b.0, %sext.a.3
579  %mul.3 = mul i32 %sext.b.1, %sext.a.2
580  %mul.0 = mul i32 %sext.a.0, %sext.b.0
581  %mul.1 = mul i32 %sext.a.1, %sext.b.1
582  %add = add i32 %mul.0, %mul.1
583  %add.1 = add i32 %mul.2, %mul.3
; The cross-matched sum is subtracted rather than added.
584  %sub = sub i32 %add, %add.1
585  %res = add i32 %acc, %sub
586  ret i32 %res
587}
588
589; TODO: Would it be better to generate a smlad and then sign extend it?
; Input IR: loads a[0..3] and b[0..1] as i16, sign-extends each to i32, and
; forms add = a[0]*b[0] + a[1]*b[1] and add.1 = b[0]*a[3] + b[1]*a[2]. Both
; sums are sign-extended to i64, added together, and subtracted from %acc.
; Expected output: a call to @llvm.arm.smlaldx on the i32 loads of b and of
; a[2..3] with a zero accumulator, chained into @llvm.arm.smlald on the i32
; loads of a and b; that chained result is what is subtracted from %acc.
590define i64 @exchange_multi_use_64_3(ptr %a, ptr %b, i64 %acc) {
591; CHECK-LABEL: @exchange_multi_use_64_3(
592; CHECK-NEXT:  entry:
593; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
594; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
595; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
596; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
597; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
598; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
599; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
600; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
601; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
602; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
603; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
604; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
605; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
606; CHECK-NEXT:    [[TMP10:%.*]] = sext i16 [[TMP9]] to i32
607; CHECK-NEXT:    [[TMP11:%.*]] = lshr i32 [[TMP8]], 16
608; CHECK-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i16
609; CHECK-NEXT:    [[TMP13:%.*]] = sext i16 [[TMP12]] to i32
610; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
611; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
612; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
613; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
614; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
615; CHECK-NEXT:    [[ADDR_A_2:%.*]] = getelementptr i16, ptr [[A]], i32 2
616; CHECK-NEXT:    [[ADDR_A_3:%.*]] = getelementptr i16, ptr [[A]], i32 3
617; CHECK-NEXT:    [[LD_A_2:%.*]] = load i16, ptr [[ADDR_A_2]], align 2
618; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ADDR_A_2]], align 2
619; CHECK-NEXT:    [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16
620; CHECK-NEXT:    [[TMP17:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[TMP8]], i32 [[TMP15]], i64 0)
621; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP1]], i32 [[TMP8]], i64 [[TMP17]])
622; CHECK-NEXT:    [[TMP19:%.*]] = sext i16 [[TMP16]] to i32
623; CHECK-NEXT:    [[TMP20:%.*]] = lshr i32 [[TMP15]], 16
624; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
625; CHECK-NEXT:    [[TMP22:%.*]] = sext i16 [[TMP21]] to i32
626; CHECK-NEXT:    [[LD_A_3:%.*]] = load i16, ptr [[ADDR_A_3]], align 2
627; CHECK-NEXT:    [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
628; CHECK-NEXT:    [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32
629; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[TMP10]], [[TMP22]]
630; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[TMP13]], [[TMP19]]
631; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP10]]
632; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP13]]
633; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
634; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]]
635; CHECK-NEXT:    [[SEXT_ADD:%.*]] = sext i32 [[ADD]] to i64
636; CHECK-NEXT:    [[SEXT_ADD_1:%.*]] = sext i32 [[ADD_1]] to i64
637; CHECK-NEXT:    [[ADD_2:%.*]] = add i64 [[SEXT_ADD]], [[SEXT_ADD_1]]
638; CHECK-NEXT:    [[RES:%.*]] = sub i64 [[ACC:%.*]], [[TMP18]]
639; CHECK-NEXT:    ret i64 [[RES]]
640;
641entry:
642  %addr.a.1 = getelementptr i16, ptr %a, i32 1
643  %addr.b.1 = getelementptr i16, ptr %b, i32 1
644  %ld.a.0 = load i16, ptr %a
645  %sext.a.0 = sext i16 %ld.a.0 to i32
646  %ld.b.0 = load i16, ptr %b
647  %ld.a.1 = load i16, ptr %addr.a.1
648  %ld.b.1 = load i16, ptr %addr.b.1
649  %sext.a.1 = sext i16 %ld.a.1 to i32
650  %sext.b.1 = sext i16 %ld.b.1 to i32
651  %sext.b.0 = sext i16 %ld.b.0 to i32
652  %addr.a.2 = getelementptr i16, ptr %a, i32 2
653  %addr.a.3 = getelementptr i16, ptr %a, i32 3
654  %ld.a.2 = load i16, ptr %addr.a.2
655  %ld.a.3 = load i16, ptr %addr.a.3
656  %sext.a.2 = sext i16 %ld.a.2 to i32
657  %sext.a.3 = sext i16 %ld.a.3 to i32
  ; Crossed pair: b[0]*a[3] and b[1]*a[2]. The sext'd b values are shared with
  ; the straight pair below, so each has more than one use.
658  %mul.2 = mul i32 %sext.b.0, %sext.a.3
659  %mul.3 = mul i32 %sext.b.1, %sext.a.2
  ; Straight pair: a[0]*b[0] and a[1]*b[1].
660  %mul.0 = mul i32 %sext.a.0, %sext.b.0
661  %mul.1 = mul i32 %sext.a.1, %sext.b.1
662  %add = add i32 %mul.0, %mul.1
663  %add.1 = add i32 %mul.2, %mul.3
  ; Both 32-bit sums are widened to i64 before being combined and subtracted
  ; from the 64-bit accumulator.
664  %sext.add = sext i32 %add to i64
665  %sext.add.1 = sext i32 %add.1 to i64
666  %add.2 = add i64 %sext.add, %sext.add.1
667  %res = sub i64 %acc, %add.2
668  ret i64 %res
669}
670
671; TODO: Why isn't smladx generated too?
; Input IR: same loads as the 64-bit variant (a[0..3], b[0..1] as sign-extended
; i16), but everything stays i32: add = a[0]*b[0] + a[1]*b[1],
; add.1 = b[0]*a[3] + b[1]*a[2], and the result is %acc + (add - add.1).
; Expected output: only the straight pair is replaced, by a call to
; @llvm.arm.smlad on the i32 loads of a and b with a zero accumulator. a[2] and
; a[3] are still loaded as separate i16 values and the crossed products remain
; plain mul instructions feeding the sub.
672define i32 @exchange_multi_use_4(ptr %a, ptr %b, i32 %acc) {
673; CHECK-LABEL: @exchange_multi_use_4(
674; CHECK-NEXT:  entry:
675; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
676; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
677; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
678; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
679; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
680; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
681; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
682; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
683; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
684; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
685; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
686; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
687; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
688; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP8]], i32 0)
689; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
690; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
691; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
692; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
693; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
694; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
695; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
696; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
697; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
698; CHECK-NEXT:    [[ADDR_A_2:%.*]] = getelementptr i16, ptr [[A]], i32 2
699; CHECK-NEXT:    [[ADDR_A_3:%.*]] = getelementptr i16, ptr [[A]], i32 3
700; CHECK-NEXT:    [[LD_A_2:%.*]] = load i16, ptr [[ADDR_A_2]], align 2
701; CHECK-NEXT:    [[LD_A_3:%.*]] = load i16, ptr [[ADDR_A_3]], align 2
702; CHECK-NEXT:    [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
703; CHECK-NEXT:    [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32
704; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[TMP11]], [[SEXT_A_3]]
705; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[TMP14]], [[SEXT_A_2]]
706; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP11]]
707; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP14]]
708; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]]
709; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
710; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[TMP10]], [[ADD_1]]
711; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ACC:%.*]], [[SUB]]
712; CHECK-NEXT:    ret i32 [[RES]]
713;
714entry:
715  %addr.a.1 = getelementptr i16, ptr %a, i32 1
716  %addr.b.1 = getelementptr i16, ptr %b, i32 1
717  %ld.a.0 = load i16, ptr %a
718  %sext.a.0 = sext i16 %ld.a.0 to i32
719  %ld.b.0 = load i16, ptr %b
720  %ld.a.1 = load i16, ptr %addr.a.1
721  %ld.b.1 = load i16, ptr %addr.b.1
722  %sext.a.1 = sext i16 %ld.a.1 to i32
723  %sext.b.1 = sext i16 %ld.b.1 to i32
724  %sext.b.0 = sext i16 %ld.b.0 to i32
725  %addr.a.2 = getelementptr i16, ptr %a, i32 2
726  %addr.a.3 = getelementptr i16, ptr %a, i32 3
727  %ld.a.2 = load i16, ptr %addr.a.2
728  %ld.a.3 = load i16, ptr %addr.a.3
729  %sext.a.2 = sext i16 %ld.a.2 to i32
730  %sext.a.3 = sext i16 %ld.a.3 to i32
  ; Crossed pair (b[0]*a[3], b[1]*a[2]) followed by the straight pair
  ; (a[0]*b[0], a[1]*b[1]); both pairs reuse the same sext'd b values.
731  %mul.2 = mul i32 %sext.b.0, %sext.a.3
732  %mul.3 = mul i32 %sext.b.1, %sext.a.2
733  %mul.0 = mul i32 %sext.a.0, %sext.b.0
734  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  ; Here %add.1 is defined before %add.
735  %add.1 = add i32 %mul.2, %mul.3
736  %add = add i32 %mul.0, %mul.1
737  %sub = sub i32 %add, %add.1
738  %res = add i32 %acc, %sub
739  ret i32 %res
740}
741
; Input IR: computes (a[1]*b[0] + a[0]*b[1]) + %acc from sign-extended i16
; loads, with the a operand first in each mul and %mul.0 first in the add.
; Expected output: a single call to @llvm.arm.smladx whose first operand is the
; i32 load of b and whose second is the i32 load of a, with %acc as the
; accumulator; the function returns that call's result directly. The original
; mul/add instructions are still listed but no longer feed the return.
742define i32 @exchange_swap(ptr %a, ptr %b, i32 %acc) {
743; CHECK-LABEL: @exchange_swap(
744; CHECK-NEXT:  entry:
745; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
746; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
747; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
748; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
749; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
750; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
751; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
752; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
753; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
754; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
755; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
756; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
757; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
758; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP8]], i32 [[TMP1]], i32 [[ACC:%.*]])
759; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
760; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
761; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
762; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
763; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
764; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
765; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
766; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
767; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
768; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP6]], [[TMP11]]
769; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP3]], [[TMP14]]
770; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
771; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD]], [[ACC]]
772; CHECK-NEXT:    ret i32 [[TMP10]]
773;
774entry:
775  %addr.a.1 = getelementptr i16, ptr %a, i32 1
776  %addr.b.1 = getelementptr i16, ptr %b, i32 1
777  %ld.a.0 = load i16, ptr %a
778  %sext.a.0 = sext i16 %ld.a.0 to i32
779  %ld.b.0 = load i16, ptr %b
780  %ld.a.1 = load i16, ptr %addr.a.1
781  %ld.b.1 = load i16, ptr %addr.b.1
782  %sext.a.1 = sext i16 %ld.a.1 to i32
783  %sext.b.1 = sext i16 %ld.b.1 to i32
784  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; The product using the high a element (a[1]*b[0]) is %mul.0 and is the
  ; first operand of the add.
785  %mul.0 = mul i32 %sext.a.1, %sext.b.0
786  %mul.1 = mul i32 %sext.a.0, %sext.b.1
787  %add = add i32 %mul.0, %mul.1
788  %res = add i32 %add, %acc
789  ret i32 %res
790}
791
; Input IR: the same two crossed products as exchange_swap (a[1]*b[0] and
; a[0]*b[1]), but the add takes %mul.1 first and %mul.0 second.
; Expected output: a single call to @llvm.arm.smladx, this time with the i32
; load of a as the first operand and the i32 load of b as the second, and %acc
; as the accumulator; the function returns that call's result directly.
792define i32 @exchange_swap_2(ptr %a, ptr %b, i32 %acc) {
793; CHECK-LABEL: @exchange_swap_2(
794; CHECK-NEXT:  entry:
795; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
796; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
797; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
798; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
799; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
800; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
801; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
802; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
803; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
804; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
805; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
806; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
807; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
808; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]])
809; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
810; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
811; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
812; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
813; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
814; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
815; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
816; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
817; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
818; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP6]], [[TMP11]]
819; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP3]], [[TMP14]]
820; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_1]], [[MUL_0]]
821; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD]], [[ACC]]
822; CHECK-NEXT:    ret i32 [[TMP10]]
823;
824entry:
825  %addr.a.1 = getelementptr i16, ptr %a, i32 1
826  %addr.b.1 = getelementptr i16, ptr %b, i32 1
827  %ld.a.0 = load i16, ptr %a
828  %sext.a.0 = sext i16 %ld.a.0 to i32
829  %ld.b.0 = load i16, ptr %b
830  %ld.a.1 = load i16, ptr %addr.a.1
831  %ld.b.1 = load i16, ptr %addr.b.1
832  %sext.a.1 = sext i16 %ld.a.1 to i32
833  %sext.b.1 = sext i16 %ld.b.1 to i32
834  %sext.b.0 = sext i16 %ld.b.0 to i32
835  %mul.0 = mul i32 %sext.a.1, %sext.b.0
836  %mul.1 = mul i32 %sext.a.0, %sext.b.1
  ; Only difference from exchange_swap: the add operands are commuted
  ; (%mul.1 first).
837  %add = add i32 %mul.1, %mul.0
838  %res = add i32 %add, %acc
839  ret i32 %res
840}
841
; Input IR: crossed products again, but with the b operand written first in
; each mul (b[0]*a[1] and b[1]*a[0]) and the add taking %mul.1 before %mul.0.
; Expected output: a single call to @llvm.arm.smladx with the i32 load of a as
; the first operand, the i32 load of b as the second, and %acc as the
; accumulator; the function returns that call's result directly.
842define i32 @exchange_swap_3(ptr %a, ptr %b, i32 %acc) {
843; CHECK-LABEL: @exchange_swap_3(
844; CHECK-NEXT:  entry:
845; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
846; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
847; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
848; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
849; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
850; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
851; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
852; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
853; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
854; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
855; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
856; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
857; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
858; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]])
859; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
860; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
861; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
862; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
863; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
864; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
865; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
866; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
867; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
868; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP11]], [[TMP6]]
869; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP14]], [[TMP3]]
870; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_1]], [[MUL_0]]
871; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD]], [[ACC]]
872; CHECK-NEXT:    ret i32 [[TMP10]]
873;
874entry:
875  %addr.a.1 = getelementptr i16, ptr %a, i32 1
876  %addr.b.1 = getelementptr i16, ptr %b, i32 1
877  %ld.a.0 = load i16, ptr %a
878  %sext.a.0 = sext i16 %ld.a.0 to i32
879  %ld.b.0 = load i16, ptr %b
880  %ld.a.1 = load i16, ptr %addr.a.1
881  %ld.b.1 = load i16, ptr %addr.b.1
882  %sext.a.1 = sext i16 %ld.a.1 to i32
883  %sext.b.1 = sext i16 %ld.b.1 to i32
884  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; Both the mul operands (b before a) and the add operands (%mul.1 before
  ; %mul.0) are commuted relative to exchange_swap.
885  %mul.0 = mul i32 %sext.b.0, %sext.a.1
886  %mul.1 = mul i32 %sext.b.1, %sext.a.0
887  %add = add i32 %mul.1, %mul.0
888  %res = add i32 %add, %acc
889  ret i32 %res
890}
891