xref: /llvm-project/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll (revision 475a39fbc3c780fe418bcd4d049177504522f235)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
3; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names \
4; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s
5; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
6; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names \
7; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
8
; Intrinsics shared by the tests below. Going by the signatures, <256 x i1>
; is the vector-pair type and <512 x i1> is the accumulator type.
; Combines two <16 x i8> vectors into one <256 x i1> pair.
9declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>)
; Takes no operands and returns a <512 x i1> accumulator.
10declare <512 x i1> @llvm.ppc.mma.xxsetaccz()
; Takes an accumulator, a vector pair and a vector; returns an accumulator.
11declare <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1>, <256 x i1>, <16 x i8>)
; Returns the accumulator's contents as a struct of four <16 x i8> vectors.
12declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1>)
; testPHI1: an accumulator produced by xxsetaccz in the entry block is carried
; through <512 x i1> PHI nodes in both the loop (%for.body) and the exit block
; (%for.cond.cleanup). The loop is entered only when %Len > 2, starts at index
; 2, and passes the pair built from Src[0]/Src[1] plus Src[i] to xvf64gerpp.
; The final accumulator is disassembled and its four vectors are stored to
; Dst[0..3].
; The autogenerated assertions expect the value to stay in acc0 for the whole
; function, with one xxmfacc before the stores. The little-endian run stores
; vs3, vs2, vs1, vs0 at offsets 0, 16, 32, 48; the big-endian run stores
; vs0, vs1, vs2, vs3 at those offsets.
13define void @testPHI1(ptr %Dst, ptr %Src, i32 signext %Len) {
14; CHECK-LABEL: testPHI1:
15; CHECK:       # %bb.0: # %entry
16; CHECK-NEXT:    xxsetaccz acc0
17; CHECK-NEXT:    cmpwi r5, 3
18; CHECK-NEXT:    blt cr0, .LBB0_3
19; CHECK-NEXT:  # %bb.1: # %for.body.preheader
20; CHECK-NEXT:    lxv v2, 0(r4)
21; CHECK-NEXT:    lxv v3, 16(r4)
22; CHECK-NEXT:    clrldi r5, r5, 32
23; CHECK-NEXT:    addi r4, r4, 32
24; CHECK-NEXT:    addi r5, r5, -2
25; CHECK-NEXT:    mtctr r5
26; CHECK-NEXT:    .p2align 4
27; CHECK-NEXT:  .LBB0_2: # %for.body
28; CHECK-NEXT:    #
29; CHECK-NEXT:    lxv vs4, 0(r4)
30; CHECK-NEXT:    addi r4, r4, 16
31; CHECK-NEXT:    xvf64gerpp acc0, vsp34, vs4
32; CHECK-NEXT:    bdnz .LBB0_2
33; CHECK-NEXT:  .LBB0_3: # %for.cond.cleanup
34; CHECK-NEXT:    xxmfacc acc0
35; CHECK-NEXT:    stxv vs3, 0(r3)
36; CHECK-NEXT:    stxv vs2, 16(r3)
37; CHECK-NEXT:    stxv vs1, 32(r3)
38; CHECK-NEXT:    stxv vs0, 48(r3)
39; CHECK-NEXT:    blr
40;
41; CHECK-BE-LABEL: testPHI1:
42; CHECK-BE:       # %bb.0: # %entry
43; CHECK-BE-NEXT:    xxsetaccz acc0
44; CHECK-BE-NEXT:    cmpwi r5, 3
45; CHECK-BE-NEXT:    blt cr0, .LBB0_3
46; CHECK-BE-NEXT:  # %bb.1: # %for.body.preheader
47; CHECK-BE-NEXT:    lxv v2, 0(r4)
48; CHECK-BE-NEXT:    lxv v3, 16(r4)
49; CHECK-BE-NEXT:    clrldi r5, r5, 32
50; CHECK-BE-NEXT:    addi r4, r4, 32
51; CHECK-BE-NEXT:    addi r5, r5, -2
52; CHECK-BE-NEXT:    mtctr r5
53; CHECK-BE-NEXT:    .p2align 4
54; CHECK-BE-NEXT:  .LBB0_2: # %for.body
55; CHECK-BE-NEXT:    #
56; CHECK-BE-NEXT:    lxv vs4, 0(r4)
57; CHECK-BE-NEXT:    addi r4, r4, 16
58; CHECK-BE-NEXT:    xvf64gerpp acc0, vsp34, vs4
59; CHECK-BE-NEXT:    bdnz .LBB0_2
60; CHECK-BE-NEXT:  .LBB0_3: # %for.cond.cleanup
61; CHECK-BE-NEXT:    xxmfacc acc0
62; CHECK-BE-NEXT:    stxv vs0, 0(r3)
63; CHECK-BE-NEXT:    stxv vs1, 16(r3)
64; CHECK-BE-NEXT:    stxv vs2, 32(r3)
65; CHECK-BE-NEXT:    stxv vs3, 48(r3)
66; CHECK-BE-NEXT:    blr
67entry:
  ; Load Src[0] and Src[1] and combine them into the vector pair %2, which is
  ; defined once here and reused by every loop iteration.
68  %0 = load <16 x i8>, ptr %Src, align 16
69  %arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1
70  %1 = load <16 x i8>, ptr %arrayidx1, align 16
71  %2 = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %0, <16 x i8> %1)
  ; %3 is the initial accumulator value fed to both PHI nodes.
72  %3 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
  ; Skip the loop entirely unless %Len > 2.
73  %cmp11 = icmp sgt i32 %Len, 2
74  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
75
76for.body.preheader:
  ; Zero-extended %Len is the bound the induction variable is compared against.
77  %wide.trip.count = zext i32 %Len to i64
78  br label %for.body
79
80for.cond.cleanup:
  ; Accumulator PHI at the exit: %3 when the loop was skipped, otherwise the
  ; last xvf64gerpp result.
81  %Acc.0.lcssa = phi <512 x i1> [ %3, %entry ], [ %13, %for.body ]
  ; Split the accumulator into four vectors and store them to Dst[0..3].
82  %4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %Acc.0.lcssa)
83  %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, 0
84  store <16 x i8> %5, ptr %Dst, align 16
85  %6 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, 1
86  %7 = getelementptr inbounds <16 x i8>, ptr %Dst, i64 1
87  store <16 x i8> %6, ptr %7, align 16
88  %8 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, 2
89  %9 = getelementptr inbounds <16 x i8>, ptr %Dst, i64 2
90  store <16 x i8> %8, ptr %9, align 16
91  %10 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, 3
92  %11 = getelementptr inbounds <16 x i8>, ptr %Dst, i64 3
93  store <16 x i8> %10, ptr %11, align 16
94  ret void
95
96for.body:
  ; The index starts at 2, the first element after the two loaded in entry.
97  %indvars.iv = phi i64 [ 2, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  ; Loop-carried accumulator PHI: %3 on entry, the previous iteration's result
  ; on the back edge.
98  %Acc.012 = phi <512 x i1> [ %3, %for.body.preheader ], [ %13, %for.body ]
99  %arrayidx2 = getelementptr inbounds <16 x i8>, ptr %Src, i64 %indvars.iv
100  %12 = load <16 x i8>, ptr %arrayidx2, align 16
101  %13 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %Acc.012, <256 x i1> %2, <16 x i8> %12)
  ; Advance the index and exit once it equals the zero-extended %Len.
102  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
103  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
104  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
105}
106
; Takes a vector pair and a vector and returns a <512 x i1> accumulator; unlike
; xvf64gerpp it has no accumulator operand.
107declare <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1>, <16 x i8>)
; testPHI2: same shape as a loop that accumulates into a <512 x i1> PHI, but
; the initial accumulator comes from an xvf64ger call on the pair built from
; Src[0]/Src[1] and the vector Src[2]. The loop is entered only when %Len > 3
; and starts at index 3. The final accumulator is disassembled and stored to
; Dst[0..3].
; The autogenerated assertions expect acc0 to be written by xvf64ger in the
; entry block, updated by xvf64gerpp in the loop, and read back with a single
; xxmfacc before the stores.
108define dso_local void @testPHI2(ptr %Dst, ptr %Src, i32 signext %Len) {
109; CHECK-LABEL: testPHI2:
110; CHECK:       # %bb.0: # %entry
111; CHECK-NEXT:    lxv v2, 0(r4)
112; CHECK-NEXT:    lxv v3, 16(r4)
113; CHECK-NEXT:    cmpwi r5, 4
114; CHECK-NEXT:    lxv vs4, 32(r4)
115; CHECK-NEXT:    xvf64ger acc0, vsp34, vs4
116; CHECK-NEXT:    blt cr0, .LBB1_3
117; CHECK-NEXT:  # %bb.1: # %for.body.preheader
118; CHECK-NEXT:    clrldi r5, r5, 32
119; CHECK-NEXT:    addi r4, r4, 48
120; CHECK-NEXT:    addi r5, r5, -3
121; CHECK-NEXT:    mtctr r5
122; CHECK-NEXT:    .p2align 4
123; CHECK-NEXT:  .LBB1_2: # %for.body
124; CHECK-NEXT:    #
125; CHECK-NEXT:    lxv vs4, 0(r4)
126; CHECK-NEXT:    addi r4, r4, 16
127; CHECK-NEXT:    xvf64gerpp acc0, vsp34, vs4
128; CHECK-NEXT:    bdnz .LBB1_2
129; CHECK-NEXT:  .LBB1_3: # %for.cond.cleanup
130; CHECK-NEXT:    xxmfacc acc0
131; CHECK-NEXT:    stxv vs3, 0(r3)
132; CHECK-NEXT:    stxv vs2, 16(r3)
133; CHECK-NEXT:    stxv vs1, 32(r3)
134; CHECK-NEXT:    stxv vs0, 48(r3)
135; CHECK-NEXT:    blr
136;
137; CHECK-BE-LABEL: testPHI2:
138; CHECK-BE:       # %bb.0: # %entry
139; CHECK-BE-NEXT:    lxv v2, 0(r4)
140; CHECK-BE-NEXT:    lxv v3, 16(r4)
141; CHECK-BE-NEXT:    cmpwi r5, 4
142; CHECK-BE-NEXT:    lxv vs4, 32(r4)
143; CHECK-BE-NEXT:    xvf64ger acc0, vsp34, vs4
144; CHECK-BE-NEXT:    blt cr0, .LBB1_3
145; CHECK-BE-NEXT:  # %bb.1: # %for.body.preheader
146; CHECK-BE-NEXT:    clrldi r5, r5, 32
147; CHECK-BE-NEXT:    addi r4, r4, 48
148; CHECK-BE-NEXT:    addi r5, r5, -3
149; CHECK-BE-NEXT:    mtctr r5
150; CHECK-BE-NEXT:    .p2align 4
151; CHECK-BE-NEXT:  .LBB1_2: # %for.body
152; CHECK-BE-NEXT:    #
153; CHECK-BE-NEXT:    lxv vs4, 0(r4)
154; CHECK-BE-NEXT:    addi r4, r4, 16
155; CHECK-BE-NEXT:    xvf64gerpp acc0, vsp34, vs4
156; CHECK-BE-NEXT:    bdnz .LBB1_2
157; CHECK-BE-NEXT:  .LBB1_3: # %for.cond.cleanup
158; CHECK-BE-NEXT:    xxmfacc acc0
159; CHECK-BE-NEXT:    stxv vs0, 0(r3)
160; CHECK-BE-NEXT:    stxv vs1, 16(r3)
161; CHECK-BE-NEXT:    stxv vs2, 32(r3)
162; CHECK-BE-NEXT:    stxv vs3, 48(r3)
163; CHECK-BE-NEXT:    blr
164entry:
  ; Build the vector pair %2 from Src[0] and Src[1]; it is reused in the loop.
165  %0 = load <16 x i8>, ptr %Src, align 16
166  %arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1
167  %1 = load <16 x i8>, ptr %arrayidx1, align 16
168  %2 = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %0, <16 x i8> %1)
  ; The initial accumulator %4 is computed from the pair and Src[2].
169  %arrayidx2 = getelementptr inbounds <16 x i8>, ptr %Src, i64 2
170  %3 = load <16 x i8>, ptr %arrayidx2, align 16
171  %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> %2, <16 x i8> %3)
  ; Skip the loop entirely unless %Len > 3.
172  %cmp14 = icmp sgt i32 %Len, 3
173  br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
174
175for.body.preheader:
  ; Zero-extended %Len is the bound the induction variable is compared against.
176  %wide.trip.count = zext i32 %Len to i64
177  br label %for.body
178
179for.cond.cleanup:
  ; Accumulator PHI at the exit: %4 when the loop was skipped, otherwise the
  ; last xvf64gerpp result.
180  %Acc.0.lcssa = phi <512 x i1> [ %4, %entry ], [ %14, %for.body ]
  ; Split the accumulator into four vectors and store them to Dst[0..3].
181  %5 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %Acc.0.lcssa)
182  %6 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %5, 0
183  store <16 x i8> %6, ptr %Dst, align 16
184  %7 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %5, 1
185  %8 = getelementptr inbounds <16 x i8>, ptr %Dst, i64 1
186  store <16 x i8> %7, ptr %8, align 16
187  %9 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %5, 2
188  %10 = getelementptr inbounds <16 x i8>, ptr %Dst, i64 2
189  store <16 x i8> %9, ptr %10, align 16
190  %11 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %5, 3
191  %12 = getelementptr inbounds <16 x i8>, ptr %Dst, i64 3
192  store <16 x i8> %11, ptr %12, align 16
193  ret void
194
195for.body:
  ; The index starts at 3, the first element after the three loaded in entry.
196  %indvars.iv = phi i64 [ 3, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  ; Loop-carried accumulator PHI: %4 on entry, the previous iteration's result
  ; on the back edge.
197  %Acc.015 = phi <512 x i1> [ %4, %for.body.preheader ], [ %14, %for.body ]
198  %arrayidx3 = getelementptr inbounds <16 x i8>, ptr %Src, i64 %indvars.iv
199  %13 = load <16 x i8>, ptr %arrayidx3, align 16
200  %14 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %Acc.015, <256 x i1> %2, <16 x i8> %13)
  ; Advance the index and exit once it equals the zero-extended %Len.
201  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
202  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
203  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
204}
205
206; This test uses an unprimed accumulator PHI node with two operands: an
207; implicitly defined unprimed accumulator and the unprimed result of the call
208; to xvf64gerpp. The compiler should replace this PHI node by a primed
209; accumulator PHI node.
; testImplicitDef: %label1 branches on an undef condition either straight to
; %label3 or through %label2, which calls xvf64gerpp with undef operands. The
; PHI in %label3 merges an undef accumulator with that call's result; element
; 3 of the disassembled accumulator is then stored to %ptr.
; The autogenerated assertions expect an implicit-def of acc0 in the entry
; block, one xxmfacc in %label3, and a single store (vs0 in the little-endian
; run, vs3 in the big-endian run).
210define void @testImplicitDef(ptr %ptr) {
211; CHECK-LABEL: testImplicitDef:
212; CHECK:       # %bb.0: # %label1
213; CHECK-NEXT:    # implicit-def: $acc0
214; CHECK-NEXT:    bc 12, 4*cr5+lt, .LBB2_2
215; CHECK-NEXT:  # %bb.1: # %label2
216; CHECK-NEXT:    xvf64gerpp acc0, vsp34, vs0
217; CHECK-NEXT:  .LBB2_2: # %label3
218; CHECK-NEXT:    xxmfacc acc0
219; CHECK-NEXT:    stxv vs0, 0(r3)
220; CHECK-NEXT:    blr
221;
222; CHECK-BE-LABEL: testImplicitDef:
223; CHECK-BE:       # %bb.0: # %label1
224; CHECK-BE-NEXT:    # implicit-def: $acc0
225; CHECK-BE-NEXT:    bc 12, 4*cr5+lt, .LBB2_2
226; CHECK-BE-NEXT:  # %bb.1: # %label2
227; CHECK-BE-NEXT:    xvf64gerpp acc0, vsp34, vs0
228; CHECK-BE-NEXT:  .LBB2_2: # %label3
229; CHECK-BE-NEXT:    xxmfacc acc0
230; CHECK-BE-NEXT:    stxv vs3, 0(r3)
231; CHECK-BE-NEXT:    blr
232label1:
  ; The branch condition is undef, so either successor may be taken.
233  br i1 undef, label %label3, label %label2
234
235label2:
  ; Every operand of this call (accumulator, pair and vector) is undef.
236  %0 = call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> undef, <256 x i1> undef, <16 x i8> undef)
237  br label %label3
238
239label3:
  ; Accumulator PHI: undef when arriving from %label1, the xvf64gerpp result
  ; when arriving from %label2.
240  %1 = phi <512 x i1> [ undef, %label1 ], [ %0, %label2 ]
241  %2 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %1)
  ; Only element 3 of the disassembled accumulator is used.
242  %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %2, 3
243  store <16 x i8> %3, ptr %ptr, align 16
244  ret void
245}
246
247; This test uses an unprimed accumulator PHI node with an unprimed accumulator
248; PHI node operand. The compiler should replace these PHI nodes by primed
249; accumulator PHI nodes.
; Takes an accumulator and two <16 x i8> vectors; returns an accumulator.
250declare <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1>, <16 x i8>, <16 x i8>)
; testNestedPHI: %if.end holds an accumulator PHI (%vq.0) that is either undef
; or the result of an xvf32gernp call made when %cond != 0. That PHI is itself
; an incoming value of two further accumulator PHIs: the loop-carried one in
; %for.body and the exit one in %for.cond.cleanup. The loop runs only when
; %count > 0 and ends when its counter equals %count. The final accumulator is
; stored to %ptr as a whole <512 x i1> value and the function returns 0.
; The autogenerated assertions expect acc0 on every path, including an
; implicit-def of acc0 on the path that skips %if.then, and one xxmfacc before
; the four stores.
251define dso_local signext i32 @testNestedPHI(i32 signext %cond, i32 signext %count, ptr nocapture %ptr, <16 x i8> %vc) {
252; CHECK-LABEL: testNestedPHI:
253; CHECK:       # %bb.0: # %entry
254; CHECK-NEXT:    cmplwi r3, 0
255; CHECK-NEXT:    beq cr0, .LBB3_2
256; CHECK-NEXT:  # %bb.1: # %if.then
257; CHECK-NEXT:    xvf32gernp acc0, v2, v2
258; CHECK-NEXT:    cmpwi r4, 1
259; CHECK-NEXT:    bge cr0, .LBB3_3
260; CHECK-NEXT:    b .LBB3_5
261; CHECK-NEXT:  .LBB3_2:
262; CHECK-NEXT:    # implicit-def: $acc0
263; CHECK-NEXT:    cmpwi r4, 1
264; CHECK-NEXT:    blt cr0, .LBB3_5
265; CHECK-NEXT:  .LBB3_3: # %for.body.preheader
266; CHECK-NEXT:    addi r3, r4, -1
267; CHECK-NEXT:    clrldi r3, r3, 32
268; CHECK-NEXT:    addi r3, r3, 1
269; CHECK-NEXT:    mtctr r3
270; CHECK-NEXT:    .p2align 4
271; CHECK-NEXT:  .LBB3_4: # %for.body
272; CHECK-NEXT:    #
273; CHECK-NEXT:    xvf32gernp acc0, v2, v2
274; CHECK-NEXT:    bdnz .LBB3_4
275; CHECK-NEXT:  .LBB3_5: # %for.cond.cleanup
276; CHECK-NEXT:    xxmfacc acc0
277; CHECK-NEXT:    li r3, 0
278; CHECK-NEXT:    stxv vs0, 48(r5)
279; CHECK-NEXT:    stxv vs1, 32(r5)
280; CHECK-NEXT:    stxv vs2, 16(r5)
281; CHECK-NEXT:    stxv vs3, 0(r5)
282; CHECK-NEXT:    blr
283;
284; CHECK-BE-LABEL: testNestedPHI:
285; CHECK-BE:       # %bb.0: # %entry
286; CHECK-BE-NEXT:    cmplwi r3, 0
287; CHECK-BE-NEXT:    beq cr0, .LBB3_2
288; CHECK-BE-NEXT:  # %bb.1: # %if.then
289; CHECK-BE-NEXT:    xvf32gernp acc0, v2, v2
290; CHECK-BE-NEXT:    cmpwi r4, 1
291; CHECK-BE-NEXT:    bge cr0, .LBB3_3
292; CHECK-BE-NEXT:    b .LBB3_5
293; CHECK-BE-NEXT:  .LBB3_2:
294; CHECK-BE-NEXT:    # implicit-def: $acc0
295; CHECK-BE-NEXT:    cmpwi r4, 1
296; CHECK-BE-NEXT:    blt cr0, .LBB3_5
297; CHECK-BE-NEXT:  .LBB3_3: # %for.body.preheader
298; CHECK-BE-NEXT:    addi r3, r4, -1
299; CHECK-BE-NEXT:    clrldi r3, r3, 32
300; CHECK-BE-NEXT:    addi r3, r3, 1
301; CHECK-BE-NEXT:    mtctr r3
302; CHECK-BE-NEXT:    .p2align 4
303; CHECK-BE-NEXT:  .LBB3_4: # %for.body
304; CHECK-BE-NEXT:    #
305; CHECK-BE-NEXT:    xvf32gernp acc0, v2, v2
306; CHECK-BE-NEXT:    bdnz .LBB3_4
307; CHECK-BE-NEXT:  .LBB3_5: # %for.cond.cleanup
308; CHECK-BE-NEXT:    xxmfacc acc0
309; CHECK-BE-NEXT:    li r3, 0
310; CHECK-BE-NEXT:    stxv vs1, 16(r5)
311; CHECK-BE-NEXT:    stxv vs0, 0(r5)
312; CHECK-BE-NEXT:    stxv vs3, 48(r5)
313; CHECK-BE-NEXT:    stxv vs2, 32(r5)
314; CHECK-BE-NEXT:    blr
315entry:
  ; %if.then is executed only when %cond is non-zero.
316  %tobool.not = icmp eq i32 %cond, 0
317  br i1 %tobool.not, label %if.end, label %if.then
318
319if.then:
  ; The accumulator operand is undef; both vector operands are %vc.
320  %0 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> undef, <16 x i8> %vc, <16 x i8> %vc)
321  br label %if.end
322
323if.end:
  ; First-level accumulator PHI: the call result from %if.then, or undef when
  ; arriving directly from %entry.
324  %vq.0 = phi <512 x i1> [ %0, %if.then ], [ undef, %entry ]
  ; Skip the loop unless %count > 0.
325  %cmp9 = icmp sgt i32 %count, 0
326  br i1 %cmp9, label %for.body, label %for.cond.cleanup
327
328for.cond.cleanup:
  ; Exit PHI whose incoming values are the PHI %vq.0 (loop skipped) and the
  ; loop's last xvf32gernp result.
329  %vq.1.lcssa = phi <512 x i1> [ %vq.0, %if.end ], [ %1, %for.body ]
  ; The accumulator is stored directly as a <512 x i1> value.
330  store <512 x i1> %vq.1.lcssa, ptr %ptr, align 64
331  ret i32 0
332
333for.body:
  ; Loop counter, starting at 0.
334  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %if.end ]
  ; Loop-carried accumulator PHI: %vq.0 on entry, the previous iteration's
  ; result on the back edge.
335  %vq.110 = phi <512 x i1> [ %1, %for.body ], [ %vq.0, %if.end ]
336  %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %vq.110, <16 x i8> %vc, <16 x i8> %vc)
  ; Increment the counter and exit once it equals %count.
337  %inc = add nuw nsw i32 %i.011, 1
338  %exitcond.not = icmp eq i32 %inc, %count
339  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
340}
341