; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -mtriple=amdgcn -mcpu=gfx906 -amdgpu-late-codegenprepare -S -o - %s | FileCheck --check-prefix=GFX906 %s

; Tests for the AMDGPU late codegen-prepare pass on gfx906. In the expected
; output below, i8 vectors that are defined in one block and used in another
; are carried between blocks as i32 (or vectors of i32): a bitcast is inserted
; after the definition, phis are rewritten on the wider type, and the value is
; converted back to the original i8 vector type next to its use.
; The check lines are autogenerated; regenerate them with
; utils/update_test_checks.py instead of editing them by hand.

; <3 x i8> values loaded in the entry block reach a phi in bb.2.
; Expected: each load is padded to <4 x i8> with a shufflevector and bitcast to
; i32, the phi is on i32, and its result is truncated to i24 and bitcast back
; to <3 x i8> for the store.
define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: define amdgpu_kernel void @v3i8_liveout(
; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
; GFX906-NEXT:  entry:
; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
; GFX906-NEXT:    [[VEC1:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP1]], align 4
; GFX906-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i8> [[VEC1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <4 x i8> [[TMP0]] to i32
; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <3 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
; GFX906-NEXT:    [[VEC2:%.*]] = load <3 x i8>, ptr addrspace(1) [[GEP2]], align 4
; GFX906-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i8> [[VEC2]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <4 x i8> [[TMP1]] to i32
; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
; GFX906:       bb.1:
; GFX906-NEXT:    br label [[BB_2]]
; GFX906:       bb.2:
; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
; GFX906-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP5_TC]] to i24
; GFX906-NEXT:    [[TMP3:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
; GFX906-NEXT:    store <3 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
; GFX906-NEXT:    ret void
;
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.2
bb.1:
  br label %bb.2

bb.2:
  %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
  store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
  ret void
}

; Same control flow with <4 x i8> (exactly 32 bits).
; Expected: no padding; each load is bitcast to i32, the phi is on i32, and a
; single bitcast back to <4 x i8> feeds the store.
define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: define amdgpu_kernel void @v4i8_liveout(
; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0]] {
; GFX906-NEXT:  entry:
; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
; GFX906-NEXT:    [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
; GFX906-NEXT:    [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4
; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
; GFX906:       bb.1:
; GFX906-NEXT:    br label [[BB_2]]
; GFX906:       bb.2:
; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8>
; GFX906-NEXT:    store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
; GFX906-NEXT:    ret void
;
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.2
bb.1:
  br label %bb.2

bb.2:
  %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
  ret void
}

; Same control flow with <5 x i8>.
; Expected: each load is padded to <8 x i8> with a shufflevector and bitcast to
; <2 x i32>, the phi is on <2 x i32>, and the result is bitcast to <8 x i8> and
; narrowed back to <5 x i8> with a shufflevector before the store.
define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: define amdgpu_kernel void @v5i8_liveout(
; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0]] {
; GFX906-NEXT:  entry:
; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
; GFX906-NEXT:    [[VEC1:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP1]], align 8
; GFX906-NEXT:    [[TMP0:%.*]] = shufflevector <5 x i8> [[VEC1]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <5 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
; GFX906-NEXT:    [[VEC2:%.*]] = load <5 x i8>, ptr addrspace(1) [[GEP2]], align 8
; GFX906-NEXT:    [[TMP1:%.*]] = shufflevector <5 x i8> [[VEC2]], <5 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
; GFX906:       bb.1:
; GFX906-NEXT:    br label [[BB_2]]
; GFX906:       bb.2:
; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
; GFX906-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
; GFX906-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
; GFX906-NEXT:    store <5 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
; GFX906-NEXT:    ret void
;
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.2
bb.1:
  br label %bb.2

bb.2:
  %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
  store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
  ret void
}

; Same control flow with <8 x i8> (exactly 64 bits).
; Expected: no padding; values are carried as <2 x i32> using plain bitcasts.
define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: define amdgpu_kernel void @v8i8_liveout(
; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0]] {
; GFX906-NEXT:  entry:
; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
; GFX906-NEXT:    [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
; GFX906-NEXT:    [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
; GFX906:       bb.1:
; GFX906-NEXT:    br label [[BB_2]]
; GFX906:       bb.2:
; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
; GFX906-NEXT:    store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
; GFX906-NEXT:    ret void
;
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.2
bb.1:
  br label %bb.2

bb.2:
  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
  store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4
  ret void
}

; The switch in the entry block reaches %return.sink.split from two different
; cases, so the phi there lists %entry twice as a predecessor.
; Expected: the converted i32 phi keeps both %entry incoming entries.
define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: define amdgpu_kernel void @repeat_successor(
; GFX906-SAME: i32 [[IN:%.*]], ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0]] {
; GFX906-NEXT:  entry:
; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
; GFX906-NEXT:    [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <4 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
; GFX906-NEXT:    [[VEC2:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP2]], align 4
; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <4 x i8> [[VEC2]] to i32
; GFX906-NEXT:    switch i32 [[IN]], label [[RETURN:%.*]] [
; GFX906-NEXT:      i32 1, label [[RETURN_SINK_SPLIT:%.*]]
; GFX906-NEXT:      i32 2, label [[RETURN_SINK_SPLIT]]
; GFX906-NEXT:      i32 3, label [[SW_BB5:%.*]]
; GFX906-NEXT:    ]
; GFX906:       sw.bb5:
; GFX906-NEXT:    br label [[RETURN_SINK_SPLIT]]
; GFX906:       return.sink.split:
; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC1_BC]], [[ENTRY]] ]
; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast i32 [[TMP5_TC]] to <4 x i8>
; GFX906-NEXT:    store <4 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST]], align 4
; GFX906-NEXT:    ret void
; GFX906:       return:
; GFX906-NEXT:    ret void
;
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
  switch i32 %in, label %return [
  i32 1, label %return.sink.split
  i32 2, label %return.sink.split
  i32 3, label %sw.bb5
  ]

sw.bb5:
  br label %return.sink.split

return.sink.split:
  %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
  ret void

return:
  ret void
}

; Chained phis: %tmp7 in bb.3 takes the bb.2 phi %tmp5 as an incoming value.
; Expected: both phis are rewritten on <2 x i32>, the second one consumes the
; converted first phi directly, and each store gets its own bitcast back to
; <8 x i8>.
define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
; GFX906-LABEL: define amdgpu_kernel void @v8i8_phi_chain(
; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST0:%.*]], ptr addrspace(1) captures(none) [[DST1:%.*]]) #[[ATTR0]] {
; GFX906-NEXT:  entry:
; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
; GFX906-NEXT:    [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
; GFX906-NEXT:    [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_2:%.*]]
; GFX906:       bb.1:
; GFX906-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7
; GFX906-NEXT:    br i1 [[CMP2]], label [[BB_2]], label [[BB_3:%.*]]
; GFX906:       bb.2:
; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
; GFX906-NEXT:    store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST0]], align 4
; GFX906-NEXT:    br label [[BB_3]]
; GFX906:       bb.3:
; GFX906-NEXT:    [[TMP7_TC:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[TMP5_TC]], [[BB_2]] ]
; GFX906-NEXT:    [[TMP7_TC_BC:%.*]] = bitcast <2 x i32> [[TMP7_TC]] to <8 x i8>
; GFX906-NEXT:    store <8 x i8> [[TMP7_TC_BC]], ptr addrspace(1) [[DST1]], align 4
; GFX906-NEXT:    ret void
;
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.2
bb.1:
  %cmp2 = icmp ult i32 %idx, 7
  br i1 %cmp2, label %bb.2, label %bb.3

bb.2:
  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
  store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
  br label %bb.3

bb.3:
  %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
  store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
  ret void
}

; %vec1 is stored directly in bb.2 and is also an incoming value of the phi in
; bb.3, which has three predecessors.
; Expected: bb.2 bitcasts the converted value back to <8 x i8> for its store,
; and the bb.3 phi is rewritten on <2 x i32> with all three incoming entries.
define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
; GFX906-LABEL: define amdgpu_kernel void @v8i8_multi_block(
; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST0:%.*]], ptr addrspace(1) captures(none) [[DST1:%.*]]) #[[ATTR0]] {
; GFX906-NEXT:  entry:
; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
; GFX906-NEXT:    [[VEC1:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP1]], align 8
; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <8 x i8> [[VEC1]] to <2 x i32>
; GFX906-NEXT:    [[GEP2:%.*]] = getelementptr <8 x i8>, ptr addrspace(1) [[SRC2]], i32 [[IDX]]
; GFX906-NEXT:    [[VEC2:%.*]] = load <8 x i8>, ptr addrspace(1) [[GEP2]], align 8
; GFX906-NEXT:    [[VEC2_BC:%.*]] = bitcast <8 x i8> [[VEC2]] to <2 x i32>
; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1:%.*]], label [[BB_3:%.*]]
; GFX906:       bb.1:
; GFX906-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[IDX]], 7
; GFX906-NEXT:    br i1 [[CMP2]], label [[BB_2:%.*]], label [[BB_3]]
; GFX906:       bb.2:
; GFX906-NEXT:    [[VEC1_BC_BC:%.*]] = bitcast <2 x i32> [[VEC1_BC]] to <8 x i8>
; GFX906-NEXT:    store <8 x i8> [[VEC1_BC_BC]], ptr addrspace(1) [[DST0]], align 4
; GFX906-NEXT:    br label [[BB_3]]
; GFX906:       bb.3:
; GFX906-NEXT:    [[TMP5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ]
; GFX906-NEXT:    [[TMP5_TC_BC:%.*]] = bitcast <2 x i32> [[TMP5_TC]] to <8 x i8>
; GFX906-NEXT:    store <8 x i8> [[TMP5_TC_BC]], ptr addrspace(1) [[DST1]], align 4
; GFX906-NEXT:    ret void
;
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.3
bb.1:
  %cmp2 = icmp ult i32 %idx, 7
  br i1 %cmp2, label %bb.2, label %bb.3

bb.2:
  store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
  br label %bb.3

bb.3:
  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
  store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
  ret void
}

; Loop-carried value: the phi %temp in bb.1 takes %vec2, which is defined later
; in the same block, on the back edge.
; Expected: the phi is rewritten on i32, its operands are bitcast back to
; <4 x i8> for the shufflevector, and the shufflevector result is bitcast to
; i32 to feed both the back edge and the store in bb.2.
; NOTE(review): despite the function name, the values here are <4 x i8>; only
; the getelementptr uses <32 x i8>.
; NOTE(review): the `br label %bb.2` that follows the conditional branch in
; bb.1 starts an unnamed block with no predecessors (it shows up as block `0`
; in the expected output). This looks unintentional -- confirm before removing
; it, since the check lines would have to be regenerated.
define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: define amdgpu_kernel void @v32i8_loop_carried(
; GFX906-SAME: ptr addrspace(1) [[SRC1:%.*]], ptr addrspace(1) [[SRC2:%.*]], ptr addrspace(1) captures(none) [[DST:%.*]]) #[[ATTR0]] {
; GFX906-NEXT:  entry:
; GFX906-NEXT:    [[IDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; GFX906-NEXT:    [[GEP1:%.*]] = getelementptr <32 x i8>, ptr addrspace(1) [[SRC1]], i32 [[IDX]]
; GFX906-NEXT:    [[VEC1:%.*]] = load <4 x i8>, ptr addrspace(1) [[GEP1]], align 4
; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <4 x i8> [[VEC1]] to i32
; GFX906-NEXT:    br label [[BB_1:%.*]]
; GFX906:       bb.1:
; GFX906-NEXT:    [[TEMP_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC:%.*]], [[BB_1]] ]
; GFX906-NEXT:    [[TEMP_TC_BC:%.*]] = bitcast i32 [[TEMP_TC]] to <4 x i8>
; GFX906-NEXT:    [[VEC1_BC_BC:%.*]] = bitcast i32 [[VEC1_BC]] to <4 x i8>
; GFX906-NEXT:    [[VEC2:%.*]] = shufflevector <4 x i8> [[VEC1_BC_BC]], <4 x i8> [[TEMP_TC_BC]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; GFX906-NEXT:    [[VEC2_BC]] = bitcast <4 x i8> [[VEC2]] to i32
; GFX906-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX]], 15
; GFX906-NEXT:    br i1 [[CMP]], label [[BB_1]], label [[BB_2:%.*]]
; GFX906:       0:
; GFX906-NEXT:    br label [[BB_2]]
; GFX906:       bb.2:
; GFX906-NEXT:    [[VEC2_BC_BC:%.*]] = bitcast i32 [[VEC2_BC]] to <4 x i8>
; GFX906-NEXT:    store <4 x i8> [[VEC2_BC_BC]], ptr addrspace(1) [[DST]], align 4
; GFX906-NEXT:    ret void
;
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
  br label %bb.1

bb.1:
  %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
  %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.2
  br label %bb.2

bb.2:
  store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
  ret void
}

; Should not produce a broken phi.
; A cycle of <4 x i8> phis fed only by constants and by each other; the single
; non-phi user is the smax call in bb5.
; Expected: the output keeps every phi on <4 x i8>, with no bitcasts inserted.

define void @broken_phi() {
; GFX906-LABEL: define void @broken_phi(
; GFX906-SAME: ) #[[ATTR0]] {
; GFX906-NEXT:  bb:
; GFX906-NEXT:    br label [[BB1:%.*]]
; GFX906:       bb1:
; GFX906-NEXT:    [[I:%.*]] = phi <4 x i8> [ splat (i8 1), [[BB:%.*]] ], [ [[I8:%.*]], [[BB7:%.*]] ]
; GFX906-NEXT:    br i1 false, label [[BB3:%.*]], label [[BB2:%.*]]
; GFX906:       bb2:
; GFX906-NEXT:    br label [[BB3]]
; GFX906:       bb3:
; GFX906-NEXT:    [[I4:%.*]] = phi <4 x i8> [ zeroinitializer, [[BB2]] ], [ [[I]], [[BB1]] ]
; GFX906-NEXT:    br i1 false, label [[BB7]], label [[BB5:%.*]]
; GFX906:       bb5:
; GFX906-NEXT:    [[I6:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[I4]], <4 x i8> zeroinitializer)
; GFX906-NEXT:    br label [[BB7]]
; GFX906:       bb7:
; GFX906-NEXT:    [[I8]] = phi <4 x i8> [ zeroinitializer, [[BB5]] ], [ zeroinitializer, [[BB3]] ]
; GFX906-NEXT:    br label [[BB1]]
;
bb:
  br label %bb1
bb1:
  %i = phi <4 x i8> [ <i8 1, i8 1, i8 1, i8 1>, %bb ], [ %i8, %bb7 ]
  br i1 false, label %bb3, label %bb2
bb2:
  br label %bb3
bb3:
  %i4 = phi <4 x i8> [ zeroinitializer, %bb2 ], [ %i, %bb1 ]
  br i1 false, label %bb7, label %bb5
bb5:
  %i6 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> %i4, <4 x i8> zeroinitializer)
  br label %bb7
bb7:
  %i8 = phi <4 x i8> [ zeroinitializer, %bb5 ], [ zeroinitializer, %bb3 ]
  br label %bb1
}

; %sel1 should use %sel0 directly instead of converting the already-converted
; form of %sel0 back to <16 x i8>. In the expected output %sel1 takes [[SEL0]]
; as its operand, while the cross-block use in bb.2 goes through the
; <4 x i32> bitcast pair.

define amdgpu_kernel void @reuseOp() {
; GFX906-LABEL: define amdgpu_kernel void @reuseOp(
; GFX906-SAME: ) #[[ATTR0]] {
; GFX906-NEXT:  entry:
; GFX906-NEXT:    [[VEC1:%.*]] = insertelement <16 x i8> zeroinitializer, i8 0, i64 0
; GFX906-NEXT:    [[VEC1_BC:%.*]] = bitcast <16 x i8> [[VEC1]] to <4 x i32>
; GFX906-NEXT:    br label [[BB_1:%.*]]
; GFX906:       bb.1:
; GFX906-NEXT:    [[VEC1_BC_BC:%.*]] = bitcast <4 x i32> [[VEC1_BC]] to <16 x i8>
; GFX906-NEXT:    [[SEL0:%.*]] = select i1 false, <16 x i8> zeroinitializer, <16 x i8> zeroinitializer
; GFX906-NEXT:    [[SEL0_BC:%.*]] = bitcast <16 x i8> [[SEL0]] to <4 x i32>
; GFX906-NEXT:    [[SEL1:%.*]] = select i1 false, <16 x i8> [[VEC1_BC_BC]], <16 x i8> [[SEL0]]
; GFX906-NEXT:    br label [[BB_2:%.*]]
; GFX906:       bb.2:
; GFX906-NEXT:    [[SEL0_BC_BC:%.*]] = bitcast <4 x i32> [[SEL0_BC]] to <16 x i8>
; GFX906-NEXT:    [[VAL:%.*]] = extractelement <16 x i8> [[SEL0_BC_BC]], i64 0
; GFX906-NEXT:    ret void
;
entry:
  %vec1 = insertelement <16 x i8> zeroinitializer, i8 0, i64 0
  br label %bb.1

bb.1:
  %sel0 = select i1 false, <16 x i8> zeroinitializer, <16 x i8> zeroinitializer
  %sel1 = select i1 false, <16 x i8> %vec1, <16 x i8> %sel0
  br label %bb.2

bb.2:
  %val = extractelement <16 x i8> %sel0, i64 0
  ret void
}


; Infinite loop containing a chain of <10 x i8> phis (%phi1 .. %phi6) fed by
; constants, the %invec0 argument, an insertelement (%vec0) and a
; shufflevector (%vec1) that closes the loop.
; Expected: the output keeps every one of these phis on <10 x i8>.
; NOTE(review): judging by the name, this presumably guards against use of a
; phi that was deleted during conversion -- confirm against the pass history.
define amdgpu_kernel void @deletedPHI(i32 %in0, i1 %cmp, <10 x i8> %invec0) {
; GFX906-LABEL: define amdgpu_kernel void @deletedPHI(
; GFX906-SAME: i32 [[IN0:%.*]], i1 [[CMP:%.*]], <10 x i8> [[INVEC0:%.*]]) #[[ATTR0]] {
; GFX906-NEXT:  entry:
; GFX906-NEXT:    br label [[BB_1:%.*]]
; GFX906:       bb.1:
; GFX906-NEXT:    [[PHI0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 1, [[BB_11:%.*]] ]
; GFX906-NEXT:    [[PHI1:%.*]] = phi <10 x i8> [ splat (i8 1), [[ENTRY]] ], [ [[VEC1:%.*]], [[BB_11]] ]
; GFX906-NEXT:    br i1 [[CMP]], label [[BB_3:%.*]], label [[BB_2:%.*]]
; GFX906:       bb.2:
; GFX906-NEXT:    br label [[BB_3]]
; GFX906:       bb.3:
; GFX906-NEXT:    [[PHI2:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_2]] ], [ [[PHI1]], [[BB_1]] ]
; GFX906-NEXT:    br i1 [[CMP]], label [[BB_5:%.*]], label [[BB_4:%.*]]
; GFX906:       bb.4:
; GFX906-NEXT:    [[VEC0:%.*]] = insertelement <10 x i8> [[PHI2]], i8 0, i64 0
; GFX906-NEXT:    br label [[BB_5]]
; GFX906:       bb.5:
; GFX906-NEXT:    [[PHI3:%.*]] = phi <10 x i8> [ [[VEC0]], [[BB_4]] ], [ [[PHI2]], [[BB_3]] ]
; GFX906-NEXT:    br i1 [[CMP]], label [[BB_7:%.*]], label [[BB_6:%.*]]
; GFX906:       bb.6:
; GFX906-NEXT:    br label [[BB_7]]
; GFX906:       bb.7:
; GFX906-NEXT:    [[PHI4:%.*]] = phi <10 x i8> [ [[INVEC0]], [[BB_6]] ], [ [[PHI3]], [[BB_5]] ]
; GFX906-NEXT:    br i1 [[CMP]], label [[BB_9:%.*]], label [[BB_8:%.*]]
; GFX906:       bb.8:
; GFX906-NEXT:    br label [[BB_9]]
; GFX906:       bb.9:
; GFX906-NEXT:    [[PHI5:%.*]] = phi <10 x i8> [ [[INVEC0]], [[BB_8]] ], [ [[PHI4]], [[BB_7]] ]
; GFX906-NEXT:    br i1 [[CMP]], label [[BB_11]], label [[BB_10:%.*]]
; GFX906:       bb.10:
; GFX906-NEXT:    br label [[BB_11]]
; GFX906:       bb.11:
; GFX906-NEXT:    [[PHI6:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_10]] ], [ [[PHI5]], [[BB_9]] ]
; GFX906-NEXT:    [[VEC1]] = shufflevector <10 x i8> [[PHI6]], <10 x i8> zeroinitializer, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 15, i32 16, i32 17, i32 18, i32 19>
; GFX906-NEXT:    br label [[BB_1]]
;
entry:
  br label %bb.1

bb.1:
  %phi0 = phi i32 [ 0, %entry ], [ 1, %bb.11 ]
  %phi1 = phi <10 x i8> [ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %entry ], [ %vec1, %bb.11 ]
  br i1 %cmp, label %bb.3, label %bb.2

bb.2:
  br label %bb.3

bb.3:
  %phi2 = phi <10 x i8> [ zeroinitializer, %bb.2 ], [ %phi1, %bb.1 ]
  br i1 %cmp, label %bb.5, label %bb.4

bb.4:
  %vec0 = insertelement <10 x i8> %phi2, i8 0, i64 0
  br label %bb.5

bb.5:                                             ; preds = %bb.4, %bb.3
  %phi3 = phi <10 x i8> [ %vec0, %bb.4 ], [ %phi2, %bb.3 ]
  br i1 %cmp, label %bb.7, label %bb.6

bb.6:
  br label %bb.7

bb.7:                                             ; preds = %bb.6, %bb.5
  %phi4 = phi <10 x i8> [ %invec0, %bb.6 ], [ %phi3, %bb.5 ]
  br i1 %cmp, label %bb.9, label %bb.8

bb.8:
  br label %bb.9

bb.9:
  %phi5 = phi <10 x i8> [ %invec0, %bb.8 ], [ %phi4, %bb.7 ]
  br i1 %cmp, label %bb.11, label %bb.10

bb.10:
  br label %bb.11

bb.11:
  %phi6 = phi <10 x i8> [ zeroinitializer, %bb.10 ], [ %phi5, %bb.9 ]
  %vec1 = shufflevector <10 x i8> %phi6, <10 x i8> zeroinitializer, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 15, i32 16, i32 17, i32 18, i32 19>
  br label %bb.1
}

; Infinite loop containing a chain of <10 x i8> phis in which %phi0 is an
; incoming value of both %phi1 and %phi2, and %phi3 feeds %phi0 on the back
; edge. The only non-phi, non-constant input is the %invec argument.
; Expected: the output keeps every phi on <10 x i8>.
define amdgpu_kernel void @multiple_unwind(i1 %cmp, <10 x i8> %invec) {
; GFX906-LABEL: define amdgpu_kernel void @multiple_unwind(
; GFX906-SAME: i1 [[CMP:%.*]], <10 x i8> [[INVEC:%.*]]) #[[ATTR0]] {
; GFX906-NEXT:  entry:
; GFX906-NEXT:    br label [[BB_1:%.*]]
; GFX906:       bb.1:
; GFX906-NEXT:    [[PHI0:%.*]] = phi <10 x i8> [ splat (i8 1), [[ENTRY:%.*]] ], [ [[PHI3:%.*]], [[BB_8:%.*]] ]
; GFX906-NEXT:    br i1 [[CMP]], label [[BB_3:%.*]], label [[BB_2:%.*]]
; GFX906:       bb.2:
; GFX906-NEXT:    br label [[BB_3]]
; GFX906:       bb.3:
; GFX906-NEXT:    [[PHI1:%.*]] = phi <10 x i8> [ zeroinitializer, [[BB_2]] ], [ [[PHI0]], [[BB_1]] ]
; GFX906-NEXT:    br i1 [[CMP]], label [[BB_5:%.*]], label [[BB_4:%.*]]
; GFX906:       bb.4:
; GFX906-NEXT:    br label [[BB_5]]
; GFX906:       bb.5:
; GFX906-NEXT:    [[PHI2:%.*]] = phi <10 x i8> [ [[PHI0]], [[BB_4]] ], [ [[PHI1]], [[BB_3]] ]
; GFX906-NEXT:    br i1 [[CMP]], label [[BB_7:%.*]], label [[BB_6:%.*]]
; GFX906:       bb.6:
; GFX906-NEXT:    br label [[BB_7]]
; GFX906:       bb.7:
; GFX906-NEXT:    [[PHI3]] = phi <10 x i8> [ [[INVEC]], [[BB_6]] ], [ [[PHI2]], [[BB_5]] ]
; GFX906-NEXT:    br label [[BB_8]]
; GFX906:       bb.8:
; GFX906-NEXT:    br label [[BB_1]]
;
entry:
  br label %bb.1

bb.1:
  %phi0 = phi <10 x i8> [ <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %entry ], [ %phi3, %bb.8 ]
  br i1 %cmp, label %bb.3, label %bb.2

bb.2:
  br label %bb.3

bb.3:
  %phi1 = phi <10 x i8> [ zeroinitializer, %bb.2 ], [ %phi0, %bb.1 ]
  br i1 %cmp, label %bb.5, label %bb.4

bb.4:
  br label %bb.5

bb.5:
  %phi2 = phi <10 x i8> [ %phi0, %bb.4 ], [ %phi1, %bb.3 ]
  br i1 %cmp, label %bb.7, label %bb.6

bb.6:                                             ; preds = %bb.5
  br label %bb.7

bb.7:
  %phi3 = phi <10 x i8> [ %invec, %bb.6 ], [ %phi2, %bb.5 ]
  br label %bb.8

bb.8:
  br label %bb.1
}



; Workitem-id intrinsic called by the kernels above to index their loads.
declare i32 @llvm.amdgcn.workitem.id.x()