; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s

; Overview: every kernel in this file loads <N x i8> vectors in its entry
; block and consumes them in a different basic block (through a phi and/or a
; store), so the vector value is live across a block boundary. The llc
; invocation at the top of the file compiles the kernels with GlobalISel for
; gfx906 and the expected assembly is recorded in the autogenerated assertion
; comments inside each function.
;
; Maintenance: the assertion comments are machine-generated from the IR. If
; any IR below is edited, regenerate them with the update script named on the
; first line instead of editing them by hand.

; <3 x i8> case: %vec1 and %vec2 are loaded in entry; the phi in bb.2 takes
; %vec1 on the direct edge from entry and %vec2 on the edge through bb.1
; (taken when %idx is unsigned-less-than 15). The phi result is stored to %dst.
define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: v3i8_liveout:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX906-NEXT: v_mov_b32_e32 v4, 8
; GFX906-NEXT: v_mov_b32_e32 v5, 16
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dword v3, v2, s[0:1]
; GFX906-NEXT: v_mov_b32_e32 v1, 0xff
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v3
; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX906-NEXT: v_or3_b32 v3, v6, v7, v3
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB0_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
; GFX906-NEXT: global_load_dword v0, v2, s[2:3]
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX906-NEXT: v_or3_b32 v3, v2, v3, v0
; GFX906-NEXT: .LBB0_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v3
; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: global_store_short v1, v0, s[6:7]
; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[6:7] offset:2
; GFX906-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.2
bb.1:
  br label %bb.2

bb.2:
  %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
  store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
  ret void
}

; <4 x i8> case: same control flow as v3i8_liveout (entry -> bb.1 -> bb.2 or
; entry -> bb.2), with the phi in bb.2 choosing between %vec1 and %vec2.
define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: v4i8_liveout:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dword v1, v2, s[0:1]
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB1_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
; GFX906-NEXT: global_load_dword v1, v2, s[2:3]
; GFX906-NEXT: .LBB1_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: global_store_dword v0, v1, s[6:7]
; GFX906-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.2
bb.1:
  br label %bb.2

bb.2:
  %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
  ret void
}

; <5 x i8> case: same control flow as v3i8_liveout with a five-element vector.
; The expected output ends in five single-byte stores.
define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: v5i8_liveout:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB2_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3]
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX906-NEXT: .LBB2_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: v_mov_b32_e32 v4, 0
; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v1
; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v1
; GFX906-NEXT: global_store_byte v4, v1, s[6:7]
; GFX906-NEXT: global_store_byte v4, v0, s[6:7] offset:1
; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[6:7] offset:2
; GFX906-NEXT: global_store_byte v4, v3, s[6:7] offset:3
; GFX906-NEXT: global_store_byte v4, v2, s[6:7] offset:4
; GFX906-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.2
bb.1:
  br label %bb.2

bb.2:
  %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
  store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
  ret void
}

; <8 x i8> case: same control flow as v3i8_liveout with an eight-element
; vector; the expected output uses a single dwordx2 load per source and a
; single dwordx2 store.
define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: v8i8_liveout:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[0:1]
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB3_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[2:3]
; GFX906-NEXT: .LBB3_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[6:7]
; GFX906-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.2
bb.1:
  br label %bb.2

bb.2:
  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
  store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4
  ret void
}

; <16 x i8> case: same control flow as v3i8_liveout with a sixteen-element
; vector; the expected output uses dwordx4 loads and a dwordx4 store.
define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: v16i8_liveout:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX906-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[0:1]
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB4_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[2:3]
; GFX906-NEXT: .LBB4_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[6:7]
; GFX906-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <16 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <16 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <16 x i8>, ptr addrspace(1) %gep2
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.2
bb.1:
  br label %bb.2

bb.2:
  %tmp5 = phi <16 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
  store <16 x i8> %tmp5, ptr addrspace(1) %dst, align 4
  ret void
}

; <32 x i8> case: same control flow as v3i8_liveout with a thirty-two-element
; vector; the expected output splits each access into two dwordx4 operations.
define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: v32i8_liveout:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX906-NEXT: v_lshlrev_b32_e32 v9, 5, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[0:1]
; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[0:1] offset:16
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB5_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[2:3]
; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[2:3] offset:16
; GFX906-NEXT: .LBB5_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[6:7]
; GFX906-NEXT: s_waitcnt vmcnt(1)
; GFX906-NEXT: global_store_dwordx4 v0, v[5:8], s[6:7] offset:16
; GFX906-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <32 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <32 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <32 x i8>, ptr addrspace(1) %gep2
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.2
bb.1:
  br label %bb.2

bb.2:
  %tmp5 = phi <32 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
  store <32 x i8> %tmp5, ptr addrspace(1) %dst, align 4
  ret void
}

; <256 x i8> case: same control flow as v3i8_liveout with a 256-element
; vector. The expected output contains folded spills and reloads through a
; scratch buffer resource in addition to sixteen dwordx4 loads per source and
; sixteen dwordx4 stores.
define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: v256i8_liveout:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1]
; GFX906-NEXT: s_mov_b32 s14, -1
; GFX906-NEXT: s_mov_b32 s15, 0xe00000
; GFX906-NEXT: s_add_u32 s12, s12, s11
; GFX906-NEXT: s_addc_u32 s13, s13, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill
; GFX906-NEXT: s_nop 0
; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[0:1] offset:16
; GFX906-NEXT: s_nop 0
; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[0:1] offset:32
; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[0:1] offset:48
; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[0:1] offset:64
; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[0:1] offset:80
; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[0:1] offset:96
; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[0:1] offset:112
; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[0:1] offset:128
; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[0:1] offset:144
; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[0:1] offset:160
; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[0:1] offset:176
; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[0:1] offset:192
; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[0:1] offset:208
; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[0:1] offset:224
; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] offset:240
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB6_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
; GFX906-NEXT: s_nop 0
; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[2:3] offset:16
; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[2:3] offset:32
; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[2:3] offset:48
; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[2:3] offset:64
; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[2:3] offset:80
; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[2:3] offset:96
; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[2:3] offset:112
; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[2:3] offset:128
; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[2:3] offset:144
; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[2:3] offset:160
; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[2:3] offset:176
; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[2:3] offset:192
; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[2:3] offset:208
; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[2:3] offset:224
; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:240
; GFX906-NEXT: .LBB6_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
; GFX906-NEXT: s_nop 0
; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
; GFX906-NEXT: v_mov_b32_e32 v0, v57
; GFX906-NEXT: v_mov_b32_e32 v1, v58
; GFX906-NEXT: v_mov_b32_e32 v2, v59
; GFX906-NEXT: v_mov_b32_e32 v3, v60
; GFX906-NEXT: v_mov_b32_e32 v60, v56
; GFX906-NEXT: v_mov_b32_e32 v59, v55
; GFX906-NEXT: v_mov_b32_e32 v58, v54
; GFX906-NEXT: v_mov_b32_e32 v57, v53
; GFX906-NEXT: v_mov_b32_e32 v56, v52
; GFX906-NEXT: v_mov_b32_e32 v55, v51
; GFX906-NEXT: v_mov_b32_e32 v54, v50
; GFX906-NEXT: v_mov_b32_e32 v53, v49
; GFX906-NEXT: v_mov_b32_e32 v52, v48
; GFX906-NEXT: v_mov_b32_e32 v51, v47
; GFX906-NEXT: v_mov_b32_e32 v50, v46
; GFX906-NEXT: v_mov_b32_e32 v49, v45
; GFX906-NEXT: v_mov_b32_e32 v48, v44
; GFX906-NEXT: v_mov_b32_e32 v47, v43
; GFX906-NEXT: v_mov_b32_e32 v46, v42
; GFX906-NEXT: v_mov_b32_e32 v45, v41
; GFX906-NEXT: v_mov_b32_e32 v44, v40
; GFX906-NEXT: v_mov_b32_e32 v43, v39
; GFX906-NEXT: v_mov_b32_e32 v42, v38
; GFX906-NEXT: v_mov_b32_e32 v41, v37
; GFX906-NEXT: v_mov_b32_e32 v40, v36
; GFX906-NEXT: v_mov_b32_e32 v39, v35
; GFX906-NEXT: v_mov_b32_e32 v38, v34
; GFX906-NEXT: v_mov_b32_e32 v37, v33
; GFX906-NEXT: v_mov_b32_e32 v36, v32
; GFX906-NEXT: v_mov_b32_e32 v35, v31
; GFX906-NEXT: v_mov_b32_e32 v34, v30
; GFX906-NEXT: v_mov_b32_e32 v33, v29
; GFX906-NEXT: v_mov_b32_e32 v32, v28
; GFX906-NEXT: v_mov_b32_e32 v31, v27
; GFX906-NEXT: v_mov_b32_e32 v30, v26
; GFX906-NEXT: v_mov_b32_e32 v29, v25
; GFX906-NEXT: v_mov_b32_e32 v28, v24
; GFX906-NEXT: v_mov_b32_e32 v27, v23
; GFX906-NEXT: v_mov_b32_e32 v26, v22
; GFX906-NEXT: v_mov_b32_e32 v25, v21
; GFX906-NEXT: v_mov_b32_e32 v24, v20
; GFX906-NEXT: v_mov_b32_e32 v23, v19
; GFX906-NEXT: v_mov_b32_e32 v22, v18
; GFX906-NEXT: v_mov_b32_e32 v21, v17
; GFX906-NEXT: v_mov_b32_e32 v20, v16
; GFX906-NEXT: v_mov_b32_e32 v19, v15
; GFX906-NEXT: v_mov_b32_e32 v18, v14
; GFX906-NEXT: v_mov_b32_e32 v17, v13
; GFX906-NEXT: v_mov_b32_e32 v16, v12
; GFX906-NEXT: v_mov_b32_e32 v15, v11
; GFX906-NEXT: v_mov_b32_e32 v14, v10
; GFX906-NEXT: v_mov_b32_e32 v13, v9
; GFX906-NEXT: v_mov_b32_e32 v12, v8
; GFX906-NEXT: v_mov_b32_e32 v11, v7
; GFX906-NEXT: v_mov_b32_e32 v10, v6
; GFX906-NEXT: v_mov_b32_e32 v9, v5
; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 ; 4-byte Folded Reload
; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GFX906-NEXT: v_mov_b32_e32 v4, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[6:7]
; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[6:7] offset:16
; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[6:7] offset:32
; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[6:7] offset:48
; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[6:7] offset:64
; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[6:7] offset:80
; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[6:7] offset:96
; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[6:7] offset:112
; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[6:7] offset:128
; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[6:7] offset:144
; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[6:7] offset:160
; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[6:7] offset:176
; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[6:7] offset:192
; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[6:7] offset:208
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:224
; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
; GFX906-NEXT: s_nop 0
; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:240
; GFX906-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  ; NOTE(review): both GEPs below index with <8 x i8> as the element type
  ; while the loads read <256 x i8>. The expected index shift of 3 in the
  ; assertions matches this 8-byte stride. Possibly a copy/paste leftover -
  ; confirm intent; changing it requires regenerating the assertions.
  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <256 x i8>, ptr addrspace(1) %gep2
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.2
bb.1:
  br label %bb.2

bb.2:
  %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
  store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4
  ret void
}


; Switch case: the switch in entry has two edges (cases 1 and 2) that both
; target %return.sink.split, so the phi there lists the incoming pair
; [ %vec1, %entry ] twice, once per edge. Case 3 goes through %sw.bb5 and
; contributes %vec2; the default edge goes to %return, which stores nothing.
define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: repeat_successor:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dword s6, s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: s_cmp_lt_i32 s6, 3
; GFX906-NEXT: s_cbranch_scc0 .LBB7_3
; GFX906-NEXT: ; %bb.1: ; %LeafBlock
; GFX906-NEXT: s_cmp_ge_i32 s6, 1
; GFX906-NEXT: s_cbranch_scc0 .LBB7_6
; GFX906-NEXT: ; %bb.2:
; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX906-NEXT: global_load_dword v0, v0, s[0:1]
; GFX906-NEXT: s_branch .LBB7_5
; GFX906-NEXT: .LBB7_3: ; %LeafBlock5
; GFX906-NEXT: s_cmp_eq_u32 s6, 3
; GFX906-NEXT: s_cbranch_scc0 .LBB7_6
; GFX906-NEXT: ; %bb.4: ; %sw.bb5
; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX906-NEXT: global_load_dword v0, v0, s[2:3]
; GFX906-NEXT: .LBB7_5: ; %return.sink.split
; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c
; GFX906-NEXT: v_mov_b32_e32 v1, 0
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: global_store_dword v1, v0, s[0:1]
; GFX906-NEXT: .LBB7_6: ; %return
; GFX906-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
  switch i32 %in, label %return [
    i32 1, label %return.sink.split
    i32 2, label %return.sink.split
    i32 3, label %sw.bb5
  ]

sw.bb5:
  br label %return.sink.split

return.sink.split:
  %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
  store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
  ret void

return:
  ret void
}

; Phi-chain case: the phi %tmp5 in bb.2 (stored to %dst0) feeds a second phi
; %tmp7 in bb.3 (stored to %dst1). %tmp7 takes %vec2 on the edge from bb.1 and
; %tmp5 on the edge from bb.2.
define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
; GFX906-LABEL: v8i8_phi_chain:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_xor_b64 s[0:1], vcc, -1
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[8:9]
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX906-NEXT: s_cbranch_execz .LBB8_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[10:11]
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc
; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
; GFX906-NEXT: .LBB8_2: ; %Flow
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GFX906-NEXT: s_cbranch_execz .LBB8_4
; GFX906-NEXT: ; %bb.3: ; %bb.2
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[12:13]
; GFX906-NEXT: .LBB8_4: ; %bb.3
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15]
; GFX906-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.2
bb.1:
  %cmp2 = icmp ult i32 %idx, 7
  br i1 %cmp2, label %bb.2, label %bb.3

bb.2:
  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
  store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
  br label %bb.3

bb.3:
  %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
  store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
  ret void
}

; Multi-block case: %vec1 is used directly by the store in bb.2 (to %dst0) and
; also as a phi input in bb.3, while %vec2 reaches the bb.3 phi from both bb.1
; and bb.2. The phi result is stored to %dst1.
define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
; GFX906-LABEL: v8i8_multi_block:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[8:9]
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_mov_b32_e32 v1, v3
; GFX906-NEXT: v_mov_b32_e32 v2, v4
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB9_4
; GFX906-NEXT: ; %bb.1: ; %bb.1
; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[10:11]
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX906-NEXT: s_cbranch_execz .LBB9_3
; GFX906-NEXT: ; %bb.2: ; %bb.2
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13]
; GFX906-NEXT: .LBB9_3: ; %Flow
; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX906-NEXT: .LBB9_4: ; %bb.3
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15]
; GFX906-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.3
bb.1:
  %cmp2 = icmp ult i32 %idx, 7
  br i1 %cmp2, label %bb.2, label %bb.3

bb.2:
  store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
  br label %bb.3

bb.3:
  %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
  store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
  ret void
}

; Loop-carried case: bb.1 branches back to itself while %idx is
; unsigned-less-than 15. The phi %temp takes %vec1 on entry and %vec2 on the
; back edge; %vec2 is a shuffle of %vec1 and %temp (mask 0, 2, 4, 6) and is
; stored to %dst after the loop. %src2 is not referenced by the body.
; NOTE(review): the function name and the GEP element type say <32 x i8>, but
; the load, phi, shuffle and store all operate on <4 x i8> - confirm intent.
define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
; GFX906-LABEL: v32i8_loop_carried:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX906-NEXT: v_mov_b32_e32 v3, 8
; GFX906-NEXT: v_mov_b32_e32 v2, 0xff
; GFX906-NEXT: v_cmp_le_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dword v1, v1, s[0:1]
; GFX906-NEXT: s_mov_b64 s[0:1], 0
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX906-NEXT: v_and_or_b32 v0, v1, v2, v0
; GFX906-NEXT: v_mov_b32_e32 v2, 24
; GFX906-NEXT: .LBB10_1: ; %bb.1
; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX906-NEXT: s_and_b64 s[2:3], exec, vcc
; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX906-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX906-NEXT: v_or3_b32 v1, v0, v3, v1
; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX906-NEXT: s_cbranch_execnz .LBB10_1
; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX906-NEXT: v_mov_b32_e32 v0, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
; GFX906-NEXT: s_endpgm
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
  br label %bb.1

bb.1:
  %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
  %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %cmp = icmp ult i32 %idx, 15
  br i1 %cmp, label %bb.1, label %bb.2
  ; NOTE(review): the branch below follows a terminator, so it starts a new
  ; unnamed basic block that nothing branches to. It looks like a leftover.
  ; Removing it may alter the generated code (the expected output names a
  ; %bb.2.loopexit block), so regenerate the assertions if it is dropped.
  br label %bb.2

bb.2:
  store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
  ret void
}


; Intrinsic called by every kernel above to obtain %idx, which indexes the
; source pointers and drives the branch conditions.
declare i32 @llvm.amdgcn.workitem.id.x()
