xref: /llvm-project/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll (revision 63fe80fb18c9660e678d24c184021841cb02d82f)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
3; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names \
4; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s
5; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
6; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names \
7; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
8
9; assemble_acc
10declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
; Builds a <512 x i1> value by passing %vc as all four operands of
; llvm.ppc.mma.assemble.acc and stores the result to %ptr.
; The expected code fills vs0-vs3 with xxlor copies that originate from v2 and
; writes them out with four stxv stores. The CHECK (LE) and CHECK-BE sequences
; differ in the order of the stores and in which register goes to which
; 16-byte offset from r3.
11define void @ass_acc(ptr %ptr, <16 x i8> %vc) {
12; CHECK-LABEL: ass_acc:
13; CHECK:       # %bb.0: # %entry
14; CHECK-NEXT:    xxlor vs3, v2, v2
15; CHECK-NEXT:    xxlor vs2, v2, v2
16; CHECK-NEXT:    xxlor vs0, vs2, vs2
17; CHECK-NEXT:    xxlor vs1, vs3, vs3
18; CHECK-NEXT:    stxv vs0, 48(r3)
19; CHECK-NEXT:    stxv vs1, 32(r3)
20; CHECK-NEXT:    stxv vs2, 16(r3)
21; CHECK-NEXT:    stxv vs3, 0(r3)
22; CHECK-NEXT:    blr
23;
24; CHECK-BE-LABEL: ass_acc:
25; CHECK-BE:       # %bb.0: # %entry
26; CHECK-BE-NEXT:    xxlor vs3, v2, v2
27; CHECK-BE-NEXT:    xxlor vs2, v2, v2
28; CHECK-BE-NEXT:    xxlor vs0, vs2, vs2
29; CHECK-BE-NEXT:    xxlor vs1, vs3, vs3
30; CHECK-BE-NEXT:    stxv vs1, 16(r3)
31; CHECK-BE-NEXT:    stxv vs0, 0(r3)
32; CHECK-BE-NEXT:    stxv vs3, 48(r3)
33; CHECK-BE-NEXT:    stxv vs2, 32(r3)
34; CHECK-BE-NEXT:    blr
35entry:
36  %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc)
37  store <512 x i1> %0, ptr %ptr, align 64
38  ret void
39}
40
41; xxmtacc
42declare <512 x i1> @llvm.ppc.mma.xxmtacc(<512 x i1>)
; Assembles a <512 x i1> value from four copies of %vc, passes it through
; llvm.ppc.mma.xxmtacc and stores the result to %ptr.
; The expected code is four xxlor copies into vs0-vs3, one "xxmtacc acc0", and
; four stxv stores (store order/offsets differ between CHECK and CHECK-BE).
43define void @int_xxmtacc(ptr %ptr, <16 x i8> %vc) {
44; CHECK-LABEL: int_xxmtacc:
45; CHECK:       # %bb.0: # %entry
46; CHECK-NEXT:    xxlor vs3, v2, v2
47; CHECK-NEXT:    xxlor vs2, v2, v2
48; CHECK-NEXT:    xxlor vs0, vs2, vs2
49; CHECK-NEXT:    xxlor vs1, vs3, vs3
50; CHECK-NEXT:    xxmtacc acc0
51; CHECK-NEXT:    stxv vs0, 48(r3)
52; CHECK-NEXT:    stxv vs1, 32(r3)
53; CHECK-NEXT:    stxv vs2, 16(r3)
54; CHECK-NEXT:    stxv vs3, 0(r3)
55; CHECK-NEXT:    blr
56;
57; CHECK-BE-LABEL: int_xxmtacc:
58; CHECK-BE:       # %bb.0: # %entry
59; CHECK-BE-NEXT:    xxlor vs3, v2, v2
60; CHECK-BE-NEXT:    xxlor vs2, v2, v2
61; CHECK-BE-NEXT:    xxlor vs0, vs2, vs2
62; CHECK-BE-NEXT:    xxlor vs1, vs3, vs3
63; CHECK-BE-NEXT:    xxmtacc acc0
64; CHECK-BE-NEXT:    stxv vs1, 16(r3)
65; CHECK-BE-NEXT:    stxv vs0, 0(r3)
66; CHECK-BE-NEXT:    stxv vs3, 48(r3)
67; CHECK-BE-NEXT:    stxv vs2, 32(r3)
68; CHECK-BE-NEXT:    blr
69entry:
70; One xxmtacc is generated from the call to assemble.acc then one xxmtacc is
71; generated from the call to xxmtacc then one xxmfacc is generated for the store
; NOTE(review): the CHECK/CHECK-BE lines above expect a single xxmtacc and no
; xxmfacc, which does not match the instruction counts described in the comment
; just above - confirm whether that comment is stale.
72  %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc)
73  %1 = tail call <512 x i1> @llvm.ppc.mma.xxmtacc(<512 x i1> %0)
74  store <512 x i1> %1, ptr %ptr, align 64
75  ret void
76}
77
78; xxmfacc
79declare <512 x i1> @llvm.ppc.mma.xxmfacc(<512 x i1>)
; Assembles a <512 x i1> value from four copies of %vc, passes it through
; llvm.ppc.mma.xxmfacc and stores the result to %ptr.
; The expected code is four xxlor copies into vs0-vs3 followed directly by
; four stxv stores (store order/offsets differ between CHECK and CHECK-BE).
80define void @int_xxmfacc(ptr %ptr, <16 x i8> %vc) {
81; CHECK-LABEL: int_xxmfacc:
82; CHECK:       # %bb.0: # %entry
83; CHECK-NEXT:    xxlor vs3, v2, v2
84; CHECK-NEXT:    xxlor vs2, v2, v2
85; CHECK-NEXT:    xxlor vs0, vs2, vs2
86; CHECK-NEXT:    xxlor vs1, vs3, vs3
87; CHECK-NEXT:    stxv vs0, 48(r3)
88; CHECK-NEXT:    stxv vs1, 32(r3)
89; CHECK-NEXT:    stxv vs2, 16(r3)
90; CHECK-NEXT:    stxv vs3, 0(r3)
91; CHECK-NEXT:    blr
92;
93; CHECK-BE-LABEL: int_xxmfacc:
94; CHECK-BE:       # %bb.0: # %entry
95; CHECK-BE-NEXT:    xxlor vs3, v2, v2
96; CHECK-BE-NEXT:    xxlor vs2, v2, v2
97; CHECK-BE-NEXT:    xxlor vs0, vs2, vs2
98; CHECK-BE-NEXT:    xxlor vs1, vs3, vs3
99; CHECK-BE-NEXT:    stxv vs1, 16(r3)
100; CHECK-BE-NEXT:    stxv vs0, 0(r3)
101; CHECK-BE-NEXT:    stxv vs3, 48(r3)
102; CHECK-BE-NEXT:    stxv vs2, 32(r3)
103; CHECK-BE-NEXT:    blr
104entry:
105; One xxmtacc is generated from the call to assemble.acc then one xxmfacc is
106; generated from the call to xxmfacc then one xxmfacc is generated for the store
; NOTE(review): the CHECK/CHECK-BE lines above expect neither xxmtacc nor
; xxmfacc, which does not match the instruction counts described in the comment
; just above - confirm whether that comment is stale.
107  %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc)
108  %1 = tail call <512 x i1> @llvm.ppc.mma.xxmfacc(<512 x i1> %0)
109  store <512 x i1> %1, ptr %ptr, align 64
110  ret void
111}
112
113; xxsetaccz
114declare <512 x i1> @llvm.ppc.mma.xxsetaccz()
; Stores the result of llvm.ppc.mma.xxsetaccz to %ptr.
; The expected code is "xxsetaccz acc0", "xxmfacc acc0", then four stxv stores
; of vs0-vs3 (store order/offsets differ between CHECK and CHECK-BE).
115define void @int_xxsetaccz(ptr %ptr) {
116; CHECK-LABEL: int_xxsetaccz:
117; CHECK:       # %bb.0: # %entry
118; CHECK-NEXT:    xxsetaccz acc0
119; CHECK-NEXT:    xxmfacc acc0
120; CHECK-NEXT:    stxv vs0, 48(r3)
121; CHECK-NEXT:    stxv vs1, 32(r3)
122; CHECK-NEXT:    stxv vs2, 16(r3)
123; CHECK-NEXT:    stxv vs3, 0(r3)
124; CHECK-NEXT:    blr
125;
126; CHECK-BE-LABEL: int_xxsetaccz:
127; CHECK-BE:       # %bb.0: # %entry
128; CHECK-BE-NEXT:    xxsetaccz acc0
129; CHECK-BE-NEXT:    xxmfacc acc0
130; CHECK-BE-NEXT:    stxv vs1, 16(r3)
131; CHECK-BE-NEXT:    stxv vs0, 0(r3)
132; CHECK-BE-NEXT:    stxv vs3, 48(r3)
133; CHECK-BE-NEXT:    stxv vs2, 32(r3)
134; CHECK-BE-NEXT:    blr
135entry:
136  %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
137  store <512 x i1> %0, ptr %ptr, align 64
138  ret void
139}
140
141; disassemble_acc
142declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1>)
; Splits the result of llvm.ppc.mma.xxsetaccz into four <16 x i8> values with
; llvm.ppc.mma.disassemble.acc and stores element 0..3 to %ptr1..%ptr4.
; The expected code is "xxsetaccz acc0", "xxmfacc acc0", then one stxv per
; pointer: CHECK (LE) stores vs3, vs2, vs1, vs0 to r3..r6, while CHECK-BE
; stores vs0, vs1, vs2, vs3 to r3..r6.
143define void @disass_acc(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) {
144; CHECK-LABEL: disass_acc:
145; CHECK:       # %bb.0: # %entry
146; CHECK-NEXT:    xxsetaccz acc0
147; CHECK-NEXT:    xxmfacc acc0
148; CHECK-NEXT:    stxv vs3, 0(r3)
149; CHECK-NEXT:    stxv vs2, 0(r4)
150; CHECK-NEXT:    stxv vs1, 0(r5)
151; CHECK-NEXT:    stxv vs0, 0(r6)
152; CHECK-NEXT:    blr
153;
154; CHECK-BE-LABEL: disass_acc:
155; CHECK-BE:       # %bb.0: # %entry
156; CHECK-BE-NEXT:    xxsetaccz acc0
157; CHECK-BE-NEXT:    xxmfacc acc0
158; CHECK-BE-NEXT:    stxv vs0, 0(r3)
159; CHECK-BE-NEXT:    stxv vs1, 0(r4)
160; CHECK-BE-NEXT:    stxv vs2, 0(r5)
161; CHECK-BE-NEXT:    stxv vs3, 0(r6)
162; CHECK-BE-NEXT:    blr
163entry:
164  %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
165  %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %0)
166  %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 0
167  %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 1
168  %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 2
169  %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 3
170  store <16 x i8> %2, ptr %ptr1, align 16
171  store <16 x i8> %3, ptr %ptr2, align 16
172  store <16 x i8> %4, ptr %ptr3, align 16
173  store <16 x i8> %5, ptr %ptr4, align 16
174  ret void
175}
176
177declare <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1>, <16 x i8>, <16 x i8>)
; A <512 x i1> value that reaches a store through a phi:
;   - %val != 0 (if.then): the value comes from llvm.ppc.mma.xxsetaccz;
;   - %val == 0 (if.else): the value is loaded from %ptr and fed through
;     llvm.ppc.mma.xvi4ger8pp with %vc as both vector operands.
; if.end stores the phi back to %ptr. In the expected code both paths leave the
; value in acc0 and the join block contains a single "xxmfacc acc0" followed by
; the four stxv stores.
178define void @testBranch(ptr %ptr, <16 x i8> %vc, i32 %val) {
179; CHECK-LABEL: testBranch:
180; CHECK:       # %bb.0: # %entry
181; CHECK-NEXT:    cmplwi r7, 0
182; CHECK-NEXT:    beq cr0, .LBB5_2
183; CHECK-NEXT:  # %bb.1: # %if.then
184; CHECK-NEXT:    xxsetaccz acc0
185; CHECK-NEXT:    b .LBB5_3
186; CHECK-NEXT:  .LBB5_2: # %if.else
187; CHECK-NEXT:    lxv vs1, 32(r3)
188; CHECK-NEXT:    lxv vs0, 48(r3)
189; CHECK-NEXT:    lxv vs3, 0(r3)
190; CHECK-NEXT:    lxv vs2, 16(r3)
191; CHECK-NEXT:    xxmtacc acc0
192; CHECK-NEXT:    xvi4ger8pp acc0, v2, v2
193; CHECK-NEXT:  .LBB5_3: # %if.end
194; CHECK-NEXT:    xxmfacc acc0
195; CHECK-NEXT:    stxv vs0, 48(r3)
196; CHECK-NEXT:    stxv vs1, 32(r3)
197; CHECK-NEXT:    stxv vs2, 16(r3)
198; CHECK-NEXT:    stxv vs3, 0(r3)
199; CHECK-NEXT:    blr
200;
201; CHECK-BE-LABEL: testBranch:
202; CHECK-BE:       # %bb.0: # %entry
203; CHECK-BE-NEXT:    cmplwi r7, 0
204; CHECK-BE-NEXT:    beq cr0, .LBB5_2
205; CHECK-BE-NEXT:  # %bb.1: # %if.then
206; CHECK-BE-NEXT:    xxsetaccz acc0
207; CHECK-BE-NEXT:    b .LBB5_3
208; CHECK-BE-NEXT:  .LBB5_2: # %if.else
209; CHECK-BE-NEXT:    lxv vs1, 16(r3)
210; CHECK-BE-NEXT:    lxv vs0, 0(r3)
211; CHECK-BE-NEXT:    lxv vs3, 48(r3)
212; CHECK-BE-NEXT:    lxv vs2, 32(r3)
213; CHECK-BE-NEXT:    xxmtacc acc0
214; CHECK-BE-NEXT:    xvi4ger8pp acc0, v2, v2
215; CHECK-BE-NEXT:  .LBB5_3: # %if.end
216; CHECK-BE-NEXT:    xxmfacc acc0
217; CHECK-BE-NEXT:    stxv vs1, 16(r3)
218; CHECK-BE-NEXT:    stxv vs0, 0(r3)
219; CHECK-BE-NEXT:    stxv vs3, 48(r3)
220; CHECK-BE-NEXT:    stxv vs2, 32(r3)
221; CHECK-BE-NEXT:    blr
222entry:
223  %tobool = icmp eq i32 %val, 0
224  br i1 %tobool, label %if.else, label %if.then
225
226if.then:
227  %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
228  br label %if.end
229
230if.else:
231  %1 = load <512 x i1>, ptr %ptr, align 64
232  %2 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
233  br label %if.end
234
235if.end:
236  %vq1.0 = phi <512 x i1> [ %0, %if.then ], [ %2, %if.else ]
237  store <512 x i1> %vq1.0, ptr %ptr, align 64
238  ret void
239}
240
241; The following test cases check that the xxsetaccz instruction is correctly rematerialized
242declare <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1>, <16 x i8>, <16 x i8>)
243declare <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1>, <16 x i8>, <16 x i8>)
244declare <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1>, <16 x i8>, <16 x i8>)
245
; Two separate xxsetaccz calls each feed an xvf32gerpp call with the same
; vector operands (%vc, %vc); the two results are stored to %res and to the
; next <512 x i1> slot after %res.
; The expected code contains only one xxsetaccz / xvf32gerpp / xxmfacc sequence
; on acc0, and vs0-vs3 are stored twice (byte offsets 0-48 and 64-112 from r3).
246define void @testcse(ptr %res, <16 x i8> %vc) {
247; CHECK-LABEL: testcse:
248; CHECK:       # %bb.0: # %entry
249; CHECK-NEXT:    xxsetaccz acc0
250; CHECK-NEXT:    xvf32gerpp acc0, v2, v2
251; CHECK-NEXT:    xxmfacc acc0
252; CHECK-NEXT:    stxv vs0, 48(r3)
253; CHECK-NEXT:    stxv vs1, 32(r3)
254; CHECK-NEXT:    stxv vs2, 16(r3)
255; CHECK-NEXT:    stxv vs3, 0(r3)
256; CHECK-NEXT:    stxv vs0, 112(r3)
257; CHECK-NEXT:    stxv vs1, 96(r3)
258; CHECK-NEXT:    stxv vs2, 80(r3)
259; CHECK-NEXT:    stxv vs3, 64(r3)
260; CHECK-NEXT:    blr
261;
262; CHECK-BE-LABEL: testcse:
263; CHECK-BE:       # %bb.0: # %entry
264; CHECK-BE-NEXT:    xxsetaccz acc0
265; CHECK-BE-NEXT:    xvf32gerpp acc0, v2, v2
266; CHECK-BE-NEXT:    xxmfacc acc0
267; CHECK-BE-NEXT:    stxv vs1, 16(r3)
268; CHECK-BE-NEXT:    stxv vs0, 0(r3)
269; CHECK-BE-NEXT:    stxv vs3, 48(r3)
270; CHECK-BE-NEXT:    stxv vs2, 32(r3)
271; CHECK-BE-NEXT:    stxv vs1, 80(r3)
272; CHECK-BE-NEXT:    stxv vs0, 64(r3)
273; CHECK-BE-NEXT:    stxv vs3, 112(r3)
274; CHECK-BE-NEXT:    stxv vs2, 96(r3)
275; CHECK-BE-NEXT:    blr
276entry:
277  %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
278  %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
279  %2 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
280  %3 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
281  %4 = getelementptr inbounds <512 x i1>, ptr %res, i64 1
282  store <512 x i1> %2, ptr %res, align 64
283  store <512 x i1> %3, ptr %4, align 64
284  ret void
285}
286
; Two separate xxsetaccz calls feed two different operations: %0 goes to
; xvf32gerpp and %1 goes to xvf32gerpn (both with %vc, %vc). The results are
; stored to %res and to the next <512 x i1> slot after %res.
; The expected code keeps two accumulators: "xxsetaccz acc0" and
; "xxsetaccz acc1", with xvf32gerpp on acc1 (stored at offsets 0-48) and
; xvf32gerpn on acc0 (stored at offsets 64-112).
287define void @testcse2(ptr %res, <16 x i8> %vc) {
288; CHECK-LABEL: testcse2:
289; CHECK:       # %bb.0: # %entry
290; CHECK-NEXT:    xxsetaccz acc0
291; CHECK-NEXT:    xxsetaccz acc1
292; CHECK-NEXT:    xvf32gerpp acc1, v2, v2
293; CHECK-NEXT:    xvf32gerpn acc0, v2, v2
294; CHECK-NEXT:    xxmfacc acc1
295; CHECK-NEXT:    xxmfacc acc0
296; CHECK-NEXT:    stxv vs4, 48(r3)
297; CHECK-NEXT:    stxv vs5, 32(r3)
298; CHECK-NEXT:    stxv vs6, 16(r3)
299; CHECK-NEXT:    stxv vs7, 0(r3)
300; CHECK-NEXT:    stxv vs0, 112(r3)
301; CHECK-NEXT:    stxv vs1, 96(r3)
302; CHECK-NEXT:    stxv vs2, 80(r3)
303; CHECK-NEXT:    stxv vs3, 64(r3)
304; CHECK-NEXT:    blr
305;
306; CHECK-BE-LABEL: testcse2:
307; CHECK-BE:       # %bb.0: # %entry
308; CHECK-BE-NEXT:    xxsetaccz acc0
309; CHECK-BE-NEXT:    xxsetaccz acc1
310; CHECK-BE-NEXT:    xvf32gerpp acc1, v2, v2
311; CHECK-BE-NEXT:    xvf32gerpn acc0, v2, v2
312; CHECK-BE-NEXT:    xxmfacc acc1
313; CHECK-BE-NEXT:    xxmfacc acc0
314; CHECK-BE-NEXT:    stxv vs5, 16(r3)
315; CHECK-BE-NEXT:    stxv vs4, 0(r3)
316; CHECK-BE-NEXT:    stxv vs7, 48(r3)
317; CHECK-BE-NEXT:    stxv vs6, 32(r3)
318; CHECK-BE-NEXT:    stxv vs1, 80(r3)
319; CHECK-BE-NEXT:    stxv vs0, 64(r3)
320; CHECK-BE-NEXT:    stxv vs3, 112(r3)
321; CHECK-BE-NEXT:    stxv vs2, 96(r3)
322; CHECK-BE-NEXT:    blr
323entry:
324  %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
325  %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
326  %2 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
327  %3 = call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
328  %4 = getelementptr inbounds <512 x i1>, ptr %res, i64 1
329  store <512 x i1> %2, ptr %res, align 64
330  store <512 x i1> %3, ptr %4, align 64
331  ret void
332}
333
; A single xxsetaccz result (%0) is used as the accumulator input of both an
; xvf32gerpp call and an xvf32gerpn call (both with %vc, %vc). The results are
; stored to %res and to the next <512 x i1> slot after %res.
; Although the IR has only one xxsetaccz call, the expected code contains two
; xxsetaccz instructions, one for acc0 and one for acc1.
334define void @testcse3(ptr %res, <16 x i8> %vc) {
335; CHECK-LABEL: testcse3:
336; CHECK:       # %bb.0: # %entry
337; CHECK-NEXT:    xxsetaccz acc0
338; CHECK-NEXT:    xxsetaccz acc1
339; CHECK-NEXT:    xvf32gerpp acc1, v2, v2
340; CHECK-NEXT:    xvf32gerpn acc0, v2, v2
341; CHECK-NEXT:    xxmfacc acc1
342; CHECK-NEXT:    xxmfacc acc0
343; CHECK-NEXT:    stxv vs4, 48(r3)
344; CHECK-NEXT:    stxv vs5, 32(r3)
345; CHECK-NEXT:    stxv vs6, 16(r3)
346; CHECK-NEXT:    stxv vs7, 0(r3)
347; CHECK-NEXT:    stxv vs0, 112(r3)
348; CHECK-NEXT:    stxv vs1, 96(r3)
349; CHECK-NEXT:    stxv vs2, 80(r3)
350; CHECK-NEXT:    stxv vs3, 64(r3)
351; CHECK-NEXT:    blr
352;
353; CHECK-BE-LABEL: testcse3:
354; CHECK-BE:       # %bb.0: # %entry
355; CHECK-BE-NEXT:    xxsetaccz acc0
356; CHECK-BE-NEXT:    xxsetaccz acc1
357; CHECK-BE-NEXT:    xvf32gerpp acc1, v2, v2
358; CHECK-BE-NEXT:    xvf32gerpn acc0, v2, v2
359; CHECK-BE-NEXT:    xxmfacc acc1
360; CHECK-BE-NEXT:    xxmfacc acc0
361; CHECK-BE-NEXT:    stxv vs5, 16(r3)
362; CHECK-BE-NEXT:    stxv vs4, 0(r3)
363; CHECK-BE-NEXT:    stxv vs7, 48(r3)
364; CHECK-BE-NEXT:    stxv vs6, 32(r3)
365; CHECK-BE-NEXT:    stxv vs1, 80(r3)
366; CHECK-BE-NEXT:    stxv vs0, 64(r3)
367; CHECK-BE-NEXT:    stxv vs3, 112(r3)
368; CHECK-BE-NEXT:    stxv vs2, 96(r3)
369; CHECK-BE-NEXT:    blr
370entry:
371  %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
372  %1 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
373  %2 = call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
374  %3 = getelementptr inbounds <512 x i1>, ptr %res, i64 1
375  store <512 x i1> %1, ptr %res, align 64
376  store <512 x i1> %2, ptr %3, align 64
377  ret void
378}
379
; Loop version of the xxsetaccz tests. The loop runs while %lim > 0, with
; %indvars.iv counting from 0 up to zext(%lim). Each iteration:
;   - calls xxsetaccz three times (%0, %1, %2);
;   - loads six <16 x i8> values from %vc at element indices %mul .. %mul+5,
;     where %mul = trunc(%indvars.iv) * 6;
;   - feeds them pairwise into xvf32gerpp (%0), xvf32gerpn (%1) and
;     xvf32gernp (%2);
;   - stores the three results to consecutive <512 x i1> slots starting at
;     %res[(%indvars.iv * 3) & 0xffffffff].
; The expected loop body contains three xxsetaccz instructions (acc2, acc1,
; acc0) and one xxmfacc per accumulator before its stores.
380define void @testcse4(ptr %res, i32 %lim, ptr %vc) {
381; CHECK-LABEL: testcse4:
382; CHECK:       # %bb.0: # %entry
383; CHECK-NEXT:    cmpwi r4, 1
384; CHECK-NEXT:    bltlr cr0
385; CHECK-NEXT:  # %bb.1: # %for.body.preheader
386; CHECK-NEXT:    clrldi r4, r4, 32
387; CHECK-NEXT:    li r6, 0
388; CHECK-NEXT:    mtctr r4
389; CHECK-NEXT:    li r4, 0
390; CHECK-NEXT:    .p2align 4
391; CHECK-NEXT:  .LBB9_2: # %for.body
392; CHECK-NEXT:    #
393; CHECK-NEXT:    rldic r7, r6, 4, 28
394; CHECK-NEXT:    xxsetaccz acc2
395; CHECK-NEXT:    xxsetaccz acc1
396; CHECK-NEXT:    addi r6, r6, 6
397; CHECK-NEXT:    lxvx vs0, r5, r7
398; CHECK-NEXT:    add r7, r5, r7
399; CHECK-NEXT:    lxv vs1, 16(r7)
400; CHECK-NEXT:    xvf32gerpp acc2, vs0, vs1
401; CHECK-NEXT:    lxv vs0, 32(r7)
402; CHECK-NEXT:    lxv vs1, 48(r7)
403; CHECK-NEXT:    xvf32gerpn acc1, vs0, vs1
404; CHECK-NEXT:    lxv vs12, 64(r7)
405; CHECK-NEXT:    lxv vs13, 80(r7)
406; CHECK-NEXT:    xxsetaccz acc0
407; CHECK-NEXT:    rldic r7, r4, 6, 26
408; CHECK-NEXT:    addi r4, r4, 3
409; CHECK-NEXT:    add r8, r3, r7
410; CHECK-NEXT:    xxmfacc acc2
411; CHECK-NEXT:    xvf32gernp acc0, vs12, vs13
412; CHECK-NEXT:    stxvx vs11, r3, r7
413; CHECK-NEXT:    stxv vs8, 48(r8)
414; CHECK-NEXT:    xxmfacc acc1
415; CHECK-NEXT:    stxv vs9, 32(r8)
416; CHECK-NEXT:    stxv vs10, 16(r8)
417; CHECK-NEXT:    stxv vs4, 112(r8)
418; CHECK-NEXT:    stxv vs5, 96(r8)
419; CHECK-NEXT:    xxmfacc acc0
420; CHECK-NEXT:    stxv vs6, 80(r8)
421; CHECK-NEXT:    stxv vs7, 64(r8)
422; CHECK-NEXT:    stxv vs0, 176(r8)
423; CHECK-NEXT:    stxv vs1, 160(r8)
424; CHECK-NEXT:    stxv vs2, 144(r8)
425; CHECK-NEXT:    stxv vs3, 128(r8)
426; CHECK-NEXT:    bdnz .LBB9_2
427; CHECK-NEXT:  # %bb.3: # %for.cond.cleanup
428; CHECK-NEXT:    blr
429;
430; CHECK-BE-LABEL: testcse4:
431; CHECK-BE:       # %bb.0: # %entry
432; CHECK-BE-NEXT:    cmpwi r4, 1
433; CHECK-BE-NEXT:    bltlr cr0
434; CHECK-BE-NEXT:  # %bb.1: # %for.body.preheader
435; CHECK-BE-NEXT:    clrldi r4, r4, 32
436; CHECK-BE-NEXT:    li r6, 0
437; CHECK-BE-NEXT:    mtctr r4
438; CHECK-BE-NEXT:    li r4, 0
439; CHECK-BE-NEXT:    .p2align 4
440; CHECK-BE-NEXT:  .LBB9_2: # %for.body
441; CHECK-BE-NEXT:    #
442; CHECK-BE-NEXT:    rldic r7, r6, 4, 28
443; CHECK-BE-NEXT:    xxsetaccz acc2
444; CHECK-BE-NEXT:    xxsetaccz acc1
445; CHECK-BE-NEXT:    addi r6, r6, 6
446; CHECK-BE-NEXT:    lxvx vs0, r5, r7
447; CHECK-BE-NEXT:    add r7, r5, r7
448; CHECK-BE-NEXT:    lxv vs1, 16(r7)
449; CHECK-BE-NEXT:    xvf32gerpp acc2, vs0, vs1
450; CHECK-BE-NEXT:    lxv vs0, 32(r7)
451; CHECK-BE-NEXT:    lxv vs1, 48(r7)
452; CHECK-BE-NEXT:    xvf32gerpn acc1, vs0, vs1
453; CHECK-BE-NEXT:    lxv vs12, 64(r7)
454; CHECK-BE-NEXT:    lxv vs13, 80(r7)
455; CHECK-BE-NEXT:    xxsetaccz acc0
456; CHECK-BE-NEXT:    rldic r7, r4, 6, 26
457; CHECK-BE-NEXT:    addi r4, r4, 3
458; CHECK-BE-NEXT:    add r8, r3, r7
459; CHECK-BE-NEXT:    xxmfacc acc2
460; CHECK-BE-NEXT:    xvf32gernp acc0, vs12, vs13
461; CHECK-BE-NEXT:    stxvx vs8, r3, r7
462; CHECK-BE-NEXT:    stxv vs9, 16(r8)
463; CHECK-BE-NEXT:    xxmfacc acc1
464; CHECK-BE-NEXT:    stxv vs11, 48(r8)
465; CHECK-BE-NEXT:    stxv vs10, 32(r8)
466; CHECK-BE-NEXT:    stxv vs5, 80(r8)
467; CHECK-BE-NEXT:    stxv vs4, 64(r8)
468; CHECK-BE-NEXT:    xxmfacc acc0
469; CHECK-BE-NEXT:    stxv vs7, 112(r8)
470; CHECK-BE-NEXT:    stxv vs6, 96(r8)
471; CHECK-BE-NEXT:    stxv vs1, 144(r8)
472; CHECK-BE-NEXT:    stxv vs0, 128(r8)
473; CHECK-BE-NEXT:    stxv vs3, 176(r8)
474; CHECK-BE-NEXT:    stxv vs2, 160(r8)
475; CHECK-BE-NEXT:    bdnz .LBB9_2
476; CHECK-BE-NEXT:  # %bb.3: # %for.cond.cleanup
477; CHECK-BE-NEXT:    blr
478entry:
479  %cmp55 = icmp sgt i32 %lim, 0
480  br i1 %cmp55, label %for.body.preheader, label %for.cond.cleanup
481
482for.body.preheader:                               ; preds = %entry
483  %wide.trip.count = zext i32 %lim to i64
484  br label %for.body
485
486for.cond.cleanup:                                 ; preds = %for.body, %entry
487  ret void
488
489for.body:                                         ; preds = %for.body, %for.body.preheader
490  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
491  %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
492  %1 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
493  %2 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
494  %3 = trunc i64 %indvars.iv to i32
495  %mul = mul nsw i32 %3, 6
496  %idxprom = zext i32 %mul to i64
497  %arrayidx = getelementptr inbounds <16 x i8>, ptr %vc, i64 %idxprom
498  %4 = load <16 x i8>, ptr %arrayidx, align 16
499  %add2 = or disjoint i32 %mul, 1
500  %idxprom3 = zext i32 %add2 to i64
501  %arrayidx4 = getelementptr inbounds <16 x i8>, ptr %vc, i64 %idxprom3
502  %5 = load <16 x i8>, ptr %arrayidx4, align 16
503  %6 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %4, <16 x i8> %5)
504  %add6 = add nuw nsw i32 %mul, 2
505  %idxprom7 = zext i32 %add6 to i64
506  %arrayidx8 = getelementptr inbounds <16 x i8>, ptr %vc, i64 %idxprom7
507  %7 = load <16 x i8>, ptr %arrayidx8, align 16
508  %add10 = add nuw nsw i32 %mul, 3
509  %idxprom11 = zext i32 %add10 to i64
510  %arrayidx12 = getelementptr inbounds <16 x i8>, ptr %vc, i64 %idxprom11
511  %8 = load <16 x i8>, ptr %arrayidx12, align 16
512  %9 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %1, <16 x i8> %7, <16 x i8> %8)
513  %add14 = add nuw nsw i32 %mul, 4
514  %idxprom15 = zext i32 %add14 to i64
515  %arrayidx16 = getelementptr inbounds <16 x i8>, ptr %vc, i64 %idxprom15
516  %10 = load <16 x i8>, ptr %arrayidx16, align 16
517  %add18 = add nuw nsw i32 %mul, 5
518  %idxprom19 = zext i32 %add18 to i64
519  %arrayidx20 = getelementptr inbounds <16 x i8>, ptr %vc, i64 %idxprom19
520  %11 = load <16 x i8>, ptr %arrayidx20, align 16
521  %12 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %2, <16 x i8> %10, <16 x i8> %11)
522  %mul21 = mul i64 %indvars.iv, 3
523  %idx.ext = and i64 %mul21, 4294967295
524  %add.ptr = getelementptr inbounds <512 x i1>, ptr %res, i64 %idx.ext
525  store <512 x i1> %6, ptr %add.ptr, align 64
526  %add.ptr26 = getelementptr inbounds <512 x i1>, ptr %add.ptr, i64 1
527  store <512 x i1> %9, ptr %add.ptr26, align 64
528  %add.ptr30 = getelementptr inbounds <512 x i1>, ptr %add.ptr, i64 2
529  store <512 x i1> %12, ptr %add.ptr30, align 64
530  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
531  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
532  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
533}
534
535declare i32 @testRedundantPrimeUnprimeF()
; An xxsetaccz result (%0) is stored to %dst and also used as the accumulator
; input of xvf32gerpp. The xvf32gerpp result (%1) is live across the call to
; @testRedundantPrimeUnprimeF and is stored to the next <512 x i1> slot after
; %dst once the call returns.
; In the expected code the xvf32gerpp result is moved out with "xxmfacc acc1"
; and written to the stack (four stxv to r1-relative offsets) before the bl.
; After the call it is reloaded with two lxvp instructions and stored through
; r30; no xxmtacc or xxmfacc appears after the reload.
536define void @testRedundantPrimeUnprime(ptr %dst, <16 x i8> %vc) nounwind {
537; CHECK-LABEL: testRedundantPrimeUnprime:
538; CHECK:       # %bb.0: # %entry
539; CHECK-NEXT:    mflr r0
540; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
541; CHECK-NEXT:    std r0, 16(r1)
542; CHECK-NEXT:    stdu r1, -112(r1)
543; CHECK-NEXT:    xxsetaccz acc0
544; CHECK-NEXT:    xxsetaccz acc1
545; CHECK-NEXT:    mr r30, r3
546; CHECK-NEXT:    xxmfacc acc0
547; CHECK-NEXT:    stxv vs0, 48(r3)
548; CHECK-NEXT:    stxv vs1, 32(r3)
549; CHECK-NEXT:    stxv vs2, 16(r3)
550; CHECK-NEXT:    stxv vs3, 0(r3)
551; CHECK-NEXT:    xvf32gerpp acc1, v2, v2
552; CHECK-NEXT:    xxmfacc acc1
553; CHECK-NEXT:    stxv vs4, 80(r1)
554; CHECK-NEXT:    stxv vs5, 64(r1)
555; CHECK-NEXT:    stxv vs6, 48(r1)
556; CHECK-NEXT:    stxv vs7, 32(r1)
557; CHECK-NEXT:    bl testRedundantPrimeUnprimeF@notoc
558; CHECK-NEXT:    lxvp vsp0, 64(r1)
559; CHECK-NEXT:    lxvp vsp2, 32(r1)
560; CHECK-NEXT:    stxv vs0, 112(r30)
561; CHECK-NEXT:    stxv vs1, 96(r30)
562; CHECK-NEXT:    stxv vs2, 80(r30)
563; CHECK-NEXT:    stxv vs3, 64(r30)
564; CHECK-NEXT:    addi r1, r1, 112
565; CHECK-NEXT:    ld r0, 16(r1)
566; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
567; CHECK-NEXT:    mtlr r0
568; CHECK-NEXT:    blr
569;
570; CHECK-BE-LABEL: testRedundantPrimeUnprime:
571; CHECK-BE:       # %bb.0: # %entry
572; CHECK-BE-NEXT:    mflr r0
573; CHECK-BE-NEXT:    std r0, 16(r1)
574; CHECK-BE-NEXT:    stdu r1, -192(r1)
575; CHECK-BE-NEXT:    xxsetaccz acc0
576; CHECK-BE-NEXT:    xxsetaccz acc1
577; CHECK-BE-NEXT:    std r30, 176(r1) # 8-byte Folded Spill
578; CHECK-BE-NEXT:    mr r30, r3
579; CHECK-BE-NEXT:    xxmfacc acc0
580; CHECK-BE-NEXT:    stxv vs1, 16(r3)
581; CHECK-BE-NEXT:    stxv vs0, 0(r3)
582; CHECK-BE-NEXT:    stxv vs3, 48(r3)
583; CHECK-BE-NEXT:    stxv vs2, 32(r3)
584; CHECK-BE-NEXT:    xvf32gerpp acc1, v2, v2
585; CHECK-BE-NEXT:    xxmfacc acc1
586; CHECK-BE-NEXT:    stxv vs4, 112(r1)
587; CHECK-BE-NEXT:    stxv vs5, 128(r1)
588; CHECK-BE-NEXT:    stxv vs6, 144(r1)
589; CHECK-BE-NEXT:    stxv vs7, 160(r1)
590; CHECK-BE-NEXT:    bl testRedundantPrimeUnprimeF
591; CHECK-BE-NEXT:    nop
592; CHECK-BE-NEXT:    lxvp vsp0, 112(r1)
593; CHECK-BE-NEXT:    lxvp vsp2, 144(r1)
594; CHECK-BE-NEXT:    stxv vs3, 112(r30)
595; CHECK-BE-NEXT:    stxv vs2, 96(r30)
596; CHECK-BE-NEXT:    stxv vs1, 80(r30)
597; CHECK-BE-NEXT:    stxv vs0, 64(r30)
598; CHECK-BE-NEXT:    ld r30, 176(r1) # 8-byte Folded Reload
599; CHECK-BE-NEXT:    addi r1, r1, 192
600; CHECK-BE-NEXT:    ld r0, 16(r1)
601; CHECK-BE-NEXT:    mtlr r0
602; CHECK-BE-NEXT:    blr
603entry:
604  %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
605  store <512 x i1> %0, ptr %dst, align 64
606  %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
607  %call = tail call signext i32 @testRedundantPrimeUnprimeF()
608  %add.ptr1 = getelementptr inbounds <512 x i1>, ptr %dst, i64 1
609  store <512 x i1> %1, ptr %add.ptr1, align 64
610  ret void
611}
612
613declare <256 x i1> @llvm.ppc.vsx.lxvp(ptr)
614declare void @llvm.ppc.vsx.stxvp(<256 x i1>, ptr)
615
; Loads a <512 x i1> value from %vqp and a <256 x i1> pair via
; llvm.ppc.vsx.lxvp from %vpp + 8 bytes, combines them with %vc using
; llvm.ppc.mma.pmxvf64gernn (both trailing i32 operands are 0), and stores the
; result to %resp.
; The expected code loads the pair with "plxvp vsp36, 8(r4), 0" and brackets
; the pmxvf64gernn with xxmtacc / xxmfacc on acc0.
616; Function Attrs: nofree nounwind
617define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, ptr nocapture %resp)  {
618; CHECK-LABEL: test_ldst_1:
619; CHECK:       # %bb.0: # %entry
620; CHECK-NEXT:    lxv vs1, 32(r3)
621; CHECK-NEXT:    lxv vs0, 48(r3)
622; CHECK-NEXT:    lxv vs3, 0(r3)
623; CHECK-NEXT:    lxv vs2, 16(r3)
624; CHECK-NEXT:    plxvp vsp36, 8(r4), 0
625; CHECK-NEXT:    xxmtacc acc0
626; CHECK-NEXT:    pmxvf64gernn acc0, vsp36, v2, 0, 0
627; CHECK-NEXT:    xxmfacc acc0
628; CHECK-NEXT:    stxv vs0, 48(r7)
629; CHECK-NEXT:    stxv vs1, 32(r7)
630; CHECK-NEXT:    stxv vs2, 16(r7)
631; CHECK-NEXT:    stxv vs3, 0(r7)
632; CHECK-NEXT:    blr
633;
634; CHECK-BE-LABEL: test_ldst_1:
635; CHECK-BE:       # %bb.0: # %entry
636; CHECK-BE-NEXT:    lxv vs1, 16(r3)
637; CHECK-BE-NEXT:    lxv vs0, 0(r3)
638; CHECK-BE-NEXT:    lxv vs3, 48(r3)
639; CHECK-BE-NEXT:    lxv vs2, 32(r3)
640; CHECK-BE-NEXT:    plxvp vsp36, 8(r4), 0
641; CHECK-BE-NEXT:    xxmtacc acc0
642; CHECK-BE-NEXT:    pmxvf64gernn acc0, vsp36, v2, 0, 0
643; CHECK-BE-NEXT:    xxmfacc acc0
644; CHECK-BE-NEXT:    stxv vs1, 16(r7)
645; CHECK-BE-NEXT:    stxv vs0, 0(r7)
646; CHECK-BE-NEXT:    stxv vs3, 48(r7)
647; CHECK-BE-NEXT:    stxv vs2, 32(r7)
648; CHECK-BE-NEXT:    blr
649entry:
650  %0 = load <512 x i1>, ptr %vqp, align 64
651  %1 = getelementptr i8, ptr %vpp, i64 8
652  %2 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %1)
653  %3 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %0, <256 x i1> %2, <16 x i8> %vc, i32 0, i32 0)
654  store <512 x i1> %3, ptr %resp, align 64
655  ret void
656}
657
; Loads a <512 x i1> value from %vqp and a <256 x i1> pair via
; llvm.ppc.vsx.lxvp directly from %vpp, combines them with %vc using
; llvm.ppc.mma.xvf64gernp, and stores the result to %resp.
; The expected code loads the pair with "lxvp vsp36, 0(r4)" and brackets the
; xvf64gernp with xxmtacc / xxmfacc on acc0.
658; Function Attrs: nofree nounwind
659define void @test_ldst_2(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, ptr nocapture %resp)  {
660; CHECK-LABEL: test_ldst_2:
661; CHECK:       # %bb.0: # %entry
662; CHECK-NEXT:    lxv vs1, 32(r3)
663; CHECK-NEXT:    lxv vs0, 48(r3)
664; CHECK-NEXT:    lxv vs3, 0(r3)
665; CHECK-NEXT:    lxv vs2, 16(r3)
666; CHECK-NEXT:    xxmtacc acc0
667; CHECK-NEXT:    lxvp vsp36, 0(r4)
668; CHECK-NEXT:    xvf64gernp acc0, vsp36, v2
669; CHECK-NEXT:    xxmfacc acc0
670; CHECK-NEXT:    stxv vs0, 48(r7)
671; CHECK-NEXT:    stxv vs1, 32(r7)
672; CHECK-NEXT:    stxv vs2, 16(r7)
673; CHECK-NEXT:    stxv vs3, 0(r7)
674; CHECK-NEXT:    blr
675;
676; CHECK-BE-LABEL: test_ldst_2:
677; CHECK-BE:       # %bb.0: # %entry
678; CHECK-BE-NEXT:    lxv vs1, 16(r3)
679; CHECK-BE-NEXT:    lxv vs0, 0(r3)
680; CHECK-BE-NEXT:    lxv vs3, 48(r3)
681; CHECK-BE-NEXT:    lxv vs2, 32(r3)
682; CHECK-BE-NEXT:    xxmtacc acc0
683; CHECK-BE-NEXT:    lxvp vsp36, 0(r4)
684; CHECK-BE-NEXT:    xvf64gernp acc0, vsp36, v2
685; CHECK-BE-NEXT:    xxmfacc acc0
686; CHECK-BE-NEXT:    stxv vs1, 16(r7)
687; CHECK-BE-NEXT:    stxv vs0, 0(r7)
688; CHECK-BE-NEXT:    stxv vs3, 48(r7)
689; CHECK-BE-NEXT:    stxv vs2, 32(r7)
690; CHECK-BE-NEXT:    blr
691entry:
692  %0 = load <512 x i1>, ptr %vqp, align 64
693  %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp)
694  %2 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %0, <256 x i1> %1, <16 x i8> %vc)
695  store <512 x i1> %2, ptr %resp, align 64
696  ret void
697}
698
; Loads a <512 x i1> value from %vqp and a <256 x i1> pair via
; llvm.ppc.vsx.lxvp directly from %vpp, combines them with %vc using
; llvm.ppc.mma.xvf64gernp, and stores the result to %resp.
; The signature carries an extra i64 %offs parameter; with it present the
; expected code loads the pair from r5 ("lxvp vsp36, 0(r5)") and stores
; through r9.
; NOTE(review): %offs is never referenced in the body below - presumably this
; test was meant to index %vpp by %offs; confirm before changing, since the
; CHECK lines depend on the current argument layout.
699; Function Attrs: nofree nounwind
700define void @test_ldst_3(ptr nocapture readonly %vqp, i64 %offs, ptr %vpp, <16 x i8> %vc, ptr nocapture %resp)  {
701; CHECK-LABEL: test_ldst_3:
702; CHECK:       # %bb.0: # %entry
703; CHECK-NEXT:    lxv vs1, 32(r3)
704; CHECK-NEXT:    lxv vs0, 48(r3)
705; CHECK-NEXT:    lxv vs3, 0(r3)
706; CHECK-NEXT:    lxv vs2, 16(r3)
707; CHECK-NEXT:    xxmtacc acc0
708; CHECK-NEXT:    lxvp vsp36, 0(r5)
709; CHECK-NEXT:    xvf64gernp acc0, vsp36, v2
710; CHECK-NEXT:    xxmfacc acc0
711; CHECK-NEXT:    stxv vs0, 48(r9)
712; CHECK-NEXT:    stxv vs1, 32(r9)
713; CHECK-NEXT:    stxv vs2, 16(r9)
714; CHECK-NEXT:    stxv vs3, 0(r9)
715; CHECK-NEXT:    blr
716;
717; CHECK-BE-LABEL: test_ldst_3:
718; CHECK-BE:       # %bb.0: # %entry
719; CHECK-BE-NEXT:    lxv vs1, 16(r3)
720; CHECK-BE-NEXT:    lxv vs0, 0(r3)
721; CHECK-BE-NEXT:    lxv vs3, 48(r3)
722; CHECK-BE-NEXT:    lxv vs2, 32(r3)
723; CHECK-BE-NEXT:    xxmtacc acc0
724; CHECK-BE-NEXT:    lxvp vsp36, 0(r5)
725; CHECK-BE-NEXT:    xvf64gernp acc0, vsp36, v2
726; CHECK-BE-NEXT:    xxmfacc acc0
727; CHECK-BE-NEXT:    stxv vs1, 16(r9)
728; CHECK-BE-NEXT:    stxv vs0, 0(r9)
729; CHECK-BE-NEXT:    stxv vs3, 48(r9)
730; CHECK-BE-NEXT:    stxv vs2, 32(r9)
731; CHECK-BE-NEXT:    blr
732entry:
733  %0 = load <512 x i1>, ptr %vqp, align 64
734  %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp)
735  %2 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %0, <256 x i1> %1, <16 x i8> %vc)
736  store <512 x i1> %2, ptr %resp, align 64
737  ret void
738}
739
; Declarations of the MMA intrinsics called by test_ldst_1 (pmxvf64gernn) and
; by test_ldst_2 / test_ldst_3 (xvf64gernp). Each takes a <512 x i1> value, a
; <256 x i1> pair and a <16 x i8> vector; pmxvf64gernn takes two extra i32
; operands.
740declare <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1>, <256 x i1>, <16 x i8>, i32, i32)
741declare <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1>, <256 x i1>, <16 x i8>)
742