; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names \
; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s
; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names \
; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE

; Tests for MMA accumulator values (<512 x i1>) that flow through PHI nodes.
; The file is compiled twice with llc for pwr10: once for the little-endian
; triple and once for the big-endian triple, each with its own check prefix.
; In the expectations below the two configurations agree on the code up to
; and including xxmfacc; they differ only in which vs register is stored at
; which offset afterwards.

declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>)
declare <512 x i1> @llvm.ppc.mma.xxsetaccz()
declare <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1>, <256 x i1>, <16 x i8>)
declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1>)

; In this test the accumulator is zero-initialised with xxsetaccz in the entry
; block and then updated by xvf64gerpp inside a loop. It is therefore carried
; by a PHI in %for.body and merged by a second PHI in %for.cond.cleanup before
; being disassembled and stored to %Dst. The expected assembly keeps the value
; in acc0 across the loop and has a single xxmfacc after the loop.
define void @testPHI1(ptr %Dst, ptr %Src, i32 signext %Len) {
; CHECK-LABEL: testPHI1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xxsetaccz acc0
; CHECK-NEXT: cmpwi r5, 3
; CHECK-NEXT: blt cr0, .LBB0_3
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: lxv v2, 0(r4)
; CHECK-NEXT: lxv v3, 16(r4)
; CHECK-NEXT: clrldi r5, r5, 32
; CHECK-NEXT: addi r4, r4, 32
; CHECK-NEXT: addi r5, r5, -2
; CHECK-NEXT: mtctr r5
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: lxv vs4, 0(r4)
; CHECK-NEXT: addi r4, r4, 16
; CHECK-NEXT: xvf64gerpp acc0, vsp34, vs4
; CHECK-NEXT: bdnz .LBB0_2
; CHECK-NEXT: .LBB0_3: # %for.cond.cleanup
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs3, 0(r3)
; CHECK-NEXT: stxv vs2, 16(r3)
; CHECK-NEXT: stxv vs1, 32(r3)
; CHECK-NEXT: stxv vs0, 48(r3)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: testPHI1:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: xxsetaccz acc0
; CHECK-BE-NEXT: cmpwi r5, 3
; CHECK-BE-NEXT: blt cr0, .LBB0_3
; CHECK-BE-NEXT: # %bb.1: # %for.body.preheader
; CHECK-BE-NEXT: lxv v2, 0(r4)
; CHECK-BE-NEXT: lxv v3, 16(r4)
; CHECK-BE-NEXT: clrldi r5, r5, 32
; CHECK-BE-NEXT: addi r4, r4, 32
; CHECK-BE-NEXT: addi r5, r5, -2
; CHECK-BE-NEXT: mtctr r5
; CHECK-BE-NEXT: .p2align 4
; CHECK-BE-NEXT: .LBB0_2: # %for.body
; CHECK-BE-NEXT: #
; CHECK-BE-NEXT: lxv vs4, 0(r4)
; CHECK-BE-NEXT: addi r4, r4, 16
; CHECK-BE-NEXT: xvf64gerpp acc0, vsp34, vs4
; CHECK-BE-NEXT: bdnz .LBB0_2
; CHECK-BE-NEXT: .LBB0_3: # %for.cond.cleanup
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs0, 0(r3)
; CHECK-BE-NEXT: stxv vs1, 16(r3)
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: blr
entry:
  ; The first two vectors of %Src form the VSX pair operand used by every
  ; xvf64gerpp call in the loop.
  %0 = load <16 x i8>, ptr %Src, align 16
  %arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1
  %1 = load <16 x i8>, ptr %arrayidx1, align 16
  %2 = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %0, <16 x i8> %1)
  %3 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
  %cmp11 = icmp sgt i32 %Len, 2
  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  %wide.trip.count = zext i32 %Len to i64
  br label %for.body

for.cond.cleanup:
  ; Merges the zero accumulator from %entry with the loop result.
  %Acc.0.lcssa = phi <512 x i1> [ %3, %entry ], [ %13, %for.body ]
  %4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %Acc.0.lcssa)
  %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, 0
  store <16 x i8> %5, ptr %Dst, align 16
  %6 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, 1
  %7 = getelementptr inbounds <16 x i8>, ptr %Dst, i64 1
  store <16 x i8> %6, ptr %7, align 16
  %8 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, 2
  %9 = getelementptr inbounds <16 x i8>, ptr %Dst, i64 2
  store <16 x i8> %8, ptr %9, align 16
  %10 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, 3
  %11 = getelementptr inbounds <16 x i8>, ptr %Dst, i64 3
  store <16 x i8> %10, ptr %11, align 16
  ret void

for.body:
  ; The induction variable starts at 2 because %Src[0] and %Src[1] were
  ; already consumed to build the pair in %entry.
  %indvars.iv = phi i64 [ 2, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %Acc.012 = phi <512 x i1> [ %3, %for.body.preheader ], [ %13, %for.body ]
  %arrayidx2 = getelementptr inbounds <16 x i8>, ptr %Src, i64 %indvars.iv
  %12 = load <16 x i8>, ptr %arrayidx2, align 16
  %13 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %Acc.012, <256 x i1> %2, <16 x i8> %12)
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

declare <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1>, <16 x i8>)

; Same loop shape as testPHI1, but the initial accumulator is produced by
; xvf64ger on the pair and the third vector of %Src instead of by xxsetaccz,
; so the loop starts at index 3. The expected assembly again keeps the value
; in acc0 from the xvf64ger through the loop to the single xxmfacc.
define dso_local void @testPHI2(ptr %Dst, ptr %Src, i32 signext %Len) {
; CHECK-LABEL: testPHI2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxv v2, 0(r4)
; CHECK-NEXT: lxv v3, 16(r4)
; CHECK-NEXT: cmpwi r5, 4
; CHECK-NEXT: lxv vs4, 32(r4)
; CHECK-NEXT: xvf64ger acc0, vsp34, vs4
; CHECK-NEXT: blt cr0, .LBB1_3
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: clrldi r5, r5, 32
; CHECK-NEXT: addi r4, r4, 48
; CHECK-NEXT: addi r5, r5, -3
; CHECK-NEXT: mtctr r5
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: lxv vs4, 0(r4)
; CHECK-NEXT: addi r4, r4, 16
; CHECK-NEXT: xvf64gerpp acc0, vsp34, vs4
; CHECK-NEXT: bdnz .LBB1_2
; CHECK-NEXT: .LBB1_3: # %for.cond.cleanup
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs3, 0(r3)
; CHECK-NEXT: stxv vs2, 16(r3)
; CHECK-NEXT: stxv vs1, 32(r3)
; CHECK-NEXT: stxv vs0, 48(r3)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: testPHI2:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: lxv v2, 0(r4)
; CHECK-BE-NEXT: lxv v3, 16(r4)
; CHECK-BE-NEXT: cmpwi r5, 4
; CHECK-BE-NEXT: lxv vs4, 32(r4)
; CHECK-BE-NEXT: xvf64ger acc0, vsp34, vs4
; CHECK-BE-NEXT: blt cr0, .LBB1_3
; CHECK-BE-NEXT: # %bb.1: # %for.body.preheader
; CHECK-BE-NEXT: clrldi r5, r5, 32
; CHECK-BE-NEXT: addi r4, r4, 48
; CHECK-BE-NEXT: addi r5, r5, -3
; CHECK-BE-NEXT: mtctr r5
; CHECK-BE-NEXT: .p2align 4
; CHECK-BE-NEXT: .LBB1_2: # %for.body
; CHECK-BE-NEXT: #
; CHECK-BE-NEXT: lxv vs4, 0(r4)
; CHECK-BE-NEXT: addi r4, r4, 16
; CHECK-BE-NEXT: xvf64gerpp acc0, vsp34, vs4
; CHECK-BE-NEXT: bdnz .LBB1_2
; CHECK-BE-NEXT: .LBB1_3: # %for.cond.cleanup
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs0, 0(r3)
; CHECK-BE-NEXT: stxv vs1, 16(r3)
; CHECK-BE-NEXT: stxv vs2, 32(r3)
; CHECK-BE-NEXT: stxv vs3, 48(r3)
; CHECK-BE-NEXT: blr
entry:
  %0 = load <16 x i8>, ptr %Src, align 16
  %arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1
  %1 = load <16 x i8>, ptr %arrayidx1, align 16
  %2 = tail call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %0, <16 x i8> %1)
  %arrayidx2 = getelementptr inbounds <16 x i8>, ptr %Src, i64 2
  %3 = load <16 x i8>, ptr %arrayidx2, align 16
  ; Initial accumulator value: defined by xvf64ger rather than zeroed.
  %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> %2, <16 x i8> %3)
  %cmp14 = icmp sgt i32 %Len, 3
  br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  %wide.trip.count = zext i32 %Len to i64
  br label %for.body

for.cond.cleanup:
  %Acc.0.lcssa = phi <512 x i1> [ %4, %entry ], [ %14, %for.body ]
  %5 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %Acc.0.lcssa)
  %6 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %5, 0
  store <16 x i8> %6, ptr %Dst, align 16
  %7 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %5, 1
  %8 = getelementptr inbounds <16 x i8>, ptr %Dst, i64 1
  store <16 x i8> %7, ptr %8, align 16
  %9 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %5, 2
  %10 = getelementptr inbounds <16 x i8>, ptr %Dst, i64 2
  store <16 x i8> %9, ptr %10, align 16
  %11 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %5, 3
  %12 = getelementptr inbounds <16 x i8>, ptr %Dst, i64 3
  store <16 x i8> %11, ptr %12, align 16
  ret void

for.body:
  %indvars.iv = phi i64 [ 3, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %Acc.015 = phi <512 x i1> [ %4, %for.body.preheader ], [ %14, %for.body ]
  %arrayidx3 = getelementptr inbounds <16 x i8>, ptr %Src, i64 %indvars.iv
  %13 = load <16 x i8>, ptr %arrayidx3, align 16
  %14 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %Acc.015, <256 x i1> %2, <16 x i8> %13)
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; This test uses an unprimed accumulator PHI node with two operands: an
; implicitly defined unprimed accumulator and the unprimed result of the call
; to xvf64gerpp. The compiler should replace this PHI node by a primed
; accumulator PHI node.
; Only element 3 of the disassembled accumulator is stored to %ptr, so each
; configuration expects a single stxv after xxmfacc.
define void @testImplicitDef(ptr %ptr) {
; CHECK-LABEL: testImplicitDef:
; CHECK: # %bb.0: # %label1
; CHECK-NEXT: # implicit-def: $acc0
; CHECK-NEXT: bc 12, 4*cr5+lt, .LBB2_2
; CHECK-NEXT: # %bb.1: # %label2
; CHECK-NEXT: xvf64gerpp acc0, vsp34, vs0
; CHECK-NEXT: .LBB2_2: # %label3
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: stxv vs0, 0(r3)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: testImplicitDef:
; CHECK-BE: # %bb.0: # %label1
; CHECK-BE-NEXT: # implicit-def: $acc0
; CHECK-BE-NEXT: bc 12, 4*cr5+lt, .LBB2_2
; CHECK-BE-NEXT: # %bb.1: # %label2
; CHECK-BE-NEXT: xvf64gerpp acc0, vsp34, vs0
; CHECK-BE-NEXT: .LBB2_2: # %label3
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: stxv vs3, 0(r3)
; CHECK-BE-NEXT: blr
label1:
  br i1 undef, label %label3, label %label2

label2:
  %0 = call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> undef, <256 x i1> undef, <16 x i8> undef)
  br label %label3

label3:
  ; The undef incoming value is the implicitly defined accumulator operand.
  %1 = phi <512 x i1> [ undef, %label1 ], [ %0, %label2 ]
  %2 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %1)
  %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %2, 3
  store <16 x i8> %3, ptr %ptr, align 16
  ret void
}

; This test uses an unprimed accumulator PHI node with an unprimed accumulator
; PHI node operand. The compiler should replace these PHI nodes by primed
; accumulator PHI nodes.
; Unlike the tests above, the final accumulator is stored to %ptr directly as
; a <512 x i1> value rather than through disassemble.acc.
declare <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1>, <16 x i8>, <16 x i8>)
define dso_local signext i32 @testNestedPHI(i32 signext %cond, i32 signext %count, ptr nocapture %ptr, <16 x i8> %vc) {
; CHECK-LABEL: testNestedPHI:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmplwi r3, 0
; CHECK-NEXT: beq cr0, .LBB3_2
; CHECK-NEXT: # %bb.1: # %if.then
; CHECK-NEXT: xvf32gernp acc0, v2, v2
; CHECK-NEXT: cmpwi r4, 1
; CHECK-NEXT: bge cr0, .LBB3_3
; CHECK-NEXT: b .LBB3_5
; CHECK-NEXT: .LBB3_2:
; CHECK-NEXT: # implicit-def: $acc0
; CHECK-NEXT: cmpwi r4, 1
; CHECK-NEXT: blt cr0, .LBB3_5
; CHECK-NEXT: .LBB3_3: # %for.body.preheader
; CHECK-NEXT: addi r3, r4, -1
; CHECK-NEXT: clrldi r3, r3, 32
; CHECK-NEXT: addi r3, r3, 1
; CHECK-NEXT: mtctr r3
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB3_4: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: xvf32gernp acc0, v2, v2
; CHECK-NEXT: bdnz .LBB3_4
; CHECK-NEXT: .LBB3_5: # %for.cond.cleanup
; CHECK-NEXT: xxmfacc acc0
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: stxv vs0, 48(r5)
; CHECK-NEXT: stxv vs1, 32(r5)
; CHECK-NEXT: stxv vs2, 16(r5)
; CHECK-NEXT: stxv vs3, 0(r5)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: testNestedPHI:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: cmplwi r3, 0
; CHECK-BE-NEXT: beq cr0, .LBB3_2
; CHECK-BE-NEXT: # %bb.1: # %if.then
; CHECK-BE-NEXT: xvf32gernp acc0, v2, v2
; CHECK-BE-NEXT: cmpwi r4, 1
; CHECK-BE-NEXT: bge cr0, .LBB3_3
; CHECK-BE-NEXT: b .LBB3_5
; CHECK-BE-NEXT: .LBB3_2:
; CHECK-BE-NEXT: # implicit-def: $acc0
; CHECK-BE-NEXT: cmpwi r4, 1
; CHECK-BE-NEXT: blt cr0, .LBB3_5
; CHECK-BE-NEXT: .LBB3_3: # %for.body.preheader
; CHECK-BE-NEXT: addi r3, r4, -1
; CHECK-BE-NEXT: clrldi r3, r3, 32
; CHECK-BE-NEXT: addi r3, r3, 1
; CHECK-BE-NEXT: mtctr r3
; CHECK-BE-NEXT: .p2align 4
; CHECK-BE-NEXT: .LBB3_4: # %for.body
; CHECK-BE-NEXT: #
; CHECK-BE-NEXT: xvf32gernp acc0, v2, v2
; CHECK-BE-NEXT: bdnz .LBB3_4
; CHECK-BE-NEXT: .LBB3_5: # %for.cond.cleanup
; CHECK-BE-NEXT: xxmfacc acc0
; CHECK-BE-NEXT: li r3, 0
; CHECK-BE-NEXT: stxv vs1, 16(r5)
; CHECK-BE-NEXT: stxv vs0, 0(r5)
; CHECK-BE-NEXT: stxv vs3, 48(r5)
; CHECK-BE-NEXT: stxv vs2, 32(r5)
; CHECK-BE-NEXT: blr
entry:
  %tobool.not = icmp eq i32 %cond, 0
  br i1 %tobool.not, label %if.end, label %if.then

if.then:
  %0 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> undef, <16 x i8> %vc, <16 x i8> %vc)
  br label %if.end

if.end:
  ; First-level PHI: either the xvf32gernp result or an undef accumulator.
  %vq.0 = phi <512 x i1> [ %0, %if.then ], [ undef, %entry ]
  %cmp9 = icmp sgt i32 %count, 0
  br i1 %cmp9, label %for.body, label %for.cond.cleanup

for.cond.cleanup:
  ; This PHI takes the PHI %vq.0 as one of its incoming values.
  %vq.1.lcssa = phi <512 x i1> [ %vq.0, %if.end ], [ %1, %for.body ]
  store <512 x i1> %vq.1.lcssa, ptr %ptr, align 64
  ret i32 0

for.body:
  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %if.end ]
  ; Loop-carried accumulator PHI, also fed by the PHI %vq.0.
  %vq.110 = phi <512 x i1> [ %1, %for.body ], [ %vq.0, %if.end ]
  %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %vq.110, <16 x i8> %vc, <16 x i8> %vc)
  %inc = add nuw nsw i32 %i.011, 1
  %exitcond.not = icmp eq i32 %inc, %count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}