; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -verify-machineinstrs -O0 < %s | FileCheck -check-prefix=GCN_DBG %s

; Exercises amdgcn lowering of loop back-edge branches whose i1 condition is,
; per kernel: derived from an `icmp` on a kernel argument, constant true,
; constant false, undef, and a value loaded from LDS. Each loop body reads a
; float from LDS, adds 1.0, and stores it back. CHECK lines are autogenerated;
; regenerate with update_llc_test_checks.py instead of hand-editing them.

; Loop is entered only when %n != -1; the back-edge itself is unconditional
; (`br label %for.body`), so once entered the loop never terminates.
define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN-LABEL: test_loop:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_dword s0, s[4:5], 0xa
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_cmp_eq_u32 s0, -1
; GCN-NEXT:    s_cbranch_scc1 .LBB0_3
; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
; GCN-NEXT:    s_load_dword s0, s[4:5], 0x9
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_addk_i32 s0, 0x80
; GCN-NEXT:    s_and_b64 vcc, exec, -1
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:  .LBB0_2: ; %for.body
; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_b32 v1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT:    ds_write_b32 v0, v1
; GCN-NEXT:    s_add_i32 s0, s0, 4
; GCN-NEXT:    s_mov_b64 vcc, vcc
; GCN-NEXT:    s_cbranch_vccnz .LBB0_2
; GCN-NEXT:  .LBB0_3: ; %for.exit
; GCN-NEXT:    s_endpgm
;
; GCN_DBG-LABEL: test_loop:
; GCN_DBG:       ; %bb.0: ; %entry
; GCN_DBG-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN_DBG-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN_DBG-NEXT:    s_mov_b32 s14, -1
; GCN_DBG-NEXT:    s_mov_b32 s15, 0xe8f000
; GCN_DBG-NEXT:    s_add_u32 s12, s12, s11
; GCN_DBG-NEXT:    s_addc_u32 s13, s13, 0
; GCN_DBG-NEXT:    s_load_dword s0, s[4:5], 0x9
; GCN_DBG-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 0
; GCN_DBG-NEXT:    s_load_dword s1, s[4:5], 0xa
; GCN_DBG-NEXT:    s_mov_b32 s0, 0
; GCN_DBG-NEXT:    s_mov_b32 s2, -1
; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT:    s_cmp_lg_u32 s1, s2
; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT:    s_mov_b64 s[6:7], exec
; GCN_DBG-NEXT:    s_mov_b64 exec, -1
; GCN_DBG-NEXT:    buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT:    s_cbranch_scc1 .LBB0_2
; GCN_DBG-NEXT:  ; %bb.1: ; %for.exit
; GCN_DBG-NEXT:    s_endpgm
; GCN_DBG-NEXT:  .LBB0_2: ; %for.body
; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
; GCN_DBG-NEXT:    buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT:    s_waitcnt vmcnt(0)
; GCN_DBG-NEXT:    v_readlane_b32 s0, v2, 1
; GCN_DBG-NEXT:    v_readlane_b32 s2, v2, 0
; GCN_DBG-NEXT:    s_mov_b32 s1, 2
; GCN_DBG-NEXT:    s_lshl_b32 s1, s0, s1
; GCN_DBG-NEXT:    s_add_i32 s1, s1, s2
; GCN_DBG-NEXT:    s_mov_b32 s2, 0x80
; GCN_DBG-NEXT:    s_add_i32 s1, s1, s2
; GCN_DBG-NEXT:    s_mov_b32 m0, -1
; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT:    ds_read_b32 v0, v0
; GCN_DBG-NEXT:    s_mov_b32 s2, 1.0
; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT:    v_add_f32_e64 v1, v0, s2
; GCN_DBG-NEXT:    s_mov_b32 m0, -1
; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT:    ds_write_b32 v0, v1
; GCN_DBG-NEXT:    s_mov_b32 s1, 1
; GCN_DBG-NEXT:    s_add_i32 s0, s0, s1
; GCN_DBG-NEXT:    s_mov_b64 s[2:3], -1
; GCN_DBG-NEXT:    s_and_b64 vcc, exec, s[2:3]
; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT:    buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT:    s_cbranch_vccnz .LBB0_2
; GCN_DBG-NEXT:  ; %bb.3: ; %DummyReturnBlock
; GCN_DBG-NEXT:    s_endpgm
entry:
  %cmp = icmp eq i32 %n, -1
  br i1 %cmp, label %for.exit, label %for.body

for.exit:
  ret void

for.body:
  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %tmp = add i32 %indvar, 32
  %arrayidx = getelementptr float, ptr addrspace(3) %ptr, i32 %tmp
  %vecload = load float, ptr addrspace(3) %arrayidx, align 4
  %add = fadd float %vecload, 1.0
  store float %add, ptr addrspace(3) %arrayidx, align 8
  %inc = add i32 %indvar, 1
  br label %for.body
}

; Back-edge condition is constant true, so the loop never exits;
; codegen emits a DummyReturnBlock after the loop.
define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN-LABEL: loop_const_true:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_dword s0, s[4:5], 0x9
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_addk_i32 s0, 0x80
; GCN-NEXT:    s_and_b64 vcc, exec, -1
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:  .LBB1_1: ; %for.body
; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_b32 v1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT:    ds_write_b32 v0, v1
; GCN-NEXT:    s_add_i32 s0, s0, 4
; GCN-NEXT:    s_mov_b64 vcc, vcc
; GCN-NEXT:    s_cbranch_vccnz .LBB1_1
; GCN-NEXT:  ; %bb.2: ; %DummyReturnBlock
; GCN-NEXT:    s_endpgm
;
; GCN_DBG-LABEL: loop_const_true:
; GCN_DBG:       ; %bb.0: ; %entry
; GCN_DBG-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN_DBG-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN_DBG-NEXT:    s_mov_b32 s14, -1
; GCN_DBG-NEXT:    s_mov_b32 s15, 0xe8f000
; GCN_DBG-NEXT:    s_add_u32 s12, s12, s11
; GCN_DBG-NEXT:    s_addc_u32 s13, s13, 0
; GCN_DBG-NEXT:    s_load_dword s0, s[4:5], 0x9
; GCN_DBG-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 0
; GCN_DBG-NEXT:    s_mov_b32 s0, 0
; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT:    buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT:    s_branch .LBB1_2
; GCN_DBG-NEXT:  .LBB1_1: ; %for.exit
; GCN_DBG-NEXT:    s_endpgm
; GCN_DBG-NEXT:  .LBB1_2: ; %for.body
; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
; GCN_DBG-NEXT:    buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT:    s_waitcnt vmcnt(0)
; GCN_DBG-NEXT:    v_readlane_b32 s0, v2, 1
; GCN_DBG-NEXT:    v_readlane_b32 s2, v2, 0
; GCN_DBG-NEXT:    s_mov_b32 s1, 2
; GCN_DBG-NEXT:    s_lshl_b32 s1, s0, s1
; GCN_DBG-NEXT:    s_add_i32 s1, s1, s2
; GCN_DBG-NEXT:    s_mov_b32 s2, 0x80
; GCN_DBG-NEXT:    s_add_i32 s1, s1, s2
; GCN_DBG-NEXT:    s_mov_b32 m0, -1
; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT:    ds_read_b32 v0, v0
; GCN_DBG-NEXT:    s_mov_b32 s2, 1.0
; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT:    v_add_f32_e64 v1, v0, s2
; GCN_DBG-NEXT:    s_mov_b32 m0, -1
; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT:    ds_write_b32 v0, v1
; GCN_DBG-NEXT:    s_mov_b32 s1, 1
; GCN_DBG-NEXT:    s_add_i32 s0, s0, s1
; GCN_DBG-NEXT:    s_mov_b64 s[2:3], 0
; GCN_DBG-NEXT:    s_and_b64 vcc, exec, s[2:3]
; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT:    buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT:    s_cbranch_vccnz .LBB1_1
; GCN_DBG-NEXT:    s_branch .LBB1_2
entry:
  br label %for.body

for.exit:
  ret void

for.body:
  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %tmp = add i32 %indvar, 32
  %arrayidx = getelementptr float, ptr addrspace(3) %ptr, i32 %tmp
  %vecload = load float, ptr addrspace(3) %arrayidx, align 4
  %add = fadd float %vecload, 1.0
  store float %add, ptr addrspace(3) %arrayidx, align 8
  %inc = add i32 %indvar, 1
  br i1 true, label %for.body, label %for.exit
}

; Back-edge condition is constant false: the body runs exactly once, so at
; -O1+ the loop folds to straight-line code (ds ops with offset:128).
define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN-LABEL: loop_const_false:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_dword s0, s[4:5], 0x9
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:    ds_read_b32 v1, v0 offset:128
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT:    ds_write_b32 v0, v1 offset:128
; GCN-NEXT:    s_endpgm
;
; GCN_DBG-LABEL: loop_const_false:
; GCN_DBG:       ; %bb.0: ; %entry
; GCN_DBG-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN_DBG-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN_DBG-NEXT:    s_mov_b32 s14, -1
; GCN_DBG-NEXT:    s_mov_b32 s15, 0xe8f000
; GCN_DBG-NEXT:    s_add_u32 s12, s12, s11
; GCN_DBG-NEXT:    s_addc_u32 s13, s13, 0
; GCN_DBG-NEXT:    s_load_dword s0, s[4:5], 0x9
; GCN_DBG-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 0
; GCN_DBG-NEXT:    s_mov_b32 s0, 0
; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT:    buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT:    s_branch .LBB2_2
; GCN_DBG-NEXT:  .LBB2_1: ; %for.exit
; GCN_DBG-NEXT:    s_endpgm
; GCN_DBG-NEXT:  .LBB2_2: ; %for.body
; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
; GCN_DBG-NEXT:    buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT:    s_waitcnt vmcnt(0)
; GCN_DBG-NEXT:    v_readlane_b32 s0, v2, 1
; GCN_DBG-NEXT:    v_readlane_b32 s2, v2, 0
; GCN_DBG-NEXT:    s_mov_b32 s1, 2
; GCN_DBG-NEXT:    s_lshl_b32 s1, s0, s1
; GCN_DBG-NEXT:    s_add_i32 s1, s1, s2
; GCN_DBG-NEXT:    s_mov_b32 s2, 0x80
; GCN_DBG-NEXT:    s_add_i32 s1, s1, s2
; GCN_DBG-NEXT:    s_mov_b32 m0, -1
; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT:    ds_read_b32 v0, v0
; GCN_DBG-NEXT:    s_mov_b32 s2, 1.0
; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT:    v_add_f32_e64 v1, v0, s2
; GCN_DBG-NEXT:    s_mov_b32 m0, -1
; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT:    ds_write_b32 v0, v1
; GCN_DBG-NEXT:    s_mov_b32 s1, 1
; GCN_DBG-NEXT:    s_add_i32 s0, s0, s1
; GCN_DBG-NEXT:    s_mov_b64 s[2:3], -1
; GCN_DBG-NEXT:    s_and_b64 vcc, exec, s[2:3]
; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT:    buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT:    s_cbranch_vccnz .LBB2_1
; GCN_DBG-NEXT:    s_branch .LBB2_2
entry:
  br label %for.body

for.exit:
  ret void

; XXX - Should there be an S_ENDPGM?
for.body:
  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %tmp = add i32 %indvar, 32
  %arrayidx = getelementptr float, ptr addrspace(3) %ptr, i32 %tmp
  %vecload = load float, ptr addrspace(3) %arrayidx, align 4
  %add = fadd float %vecload, 1.0
  store float %add, ptr addrspace(3) %arrayidx, align 8
  %inc = add i32 %indvar, 1
  br i1 false, label %for.body, label %for.exit
}

; Back-edge condition is undef; at -O1+ this folds to a single iteration,
; same straight-line output as @loop_const_false.
define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN-LABEL: loop_const_undef:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    s_load_dword s0, s[4:5], 0x9
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:    ds_read_b32 v1, v0 offset:128
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT:    ds_write_b32 v0, v1 offset:128
; GCN-NEXT:    s_endpgm
;
; GCN_DBG-LABEL: loop_const_undef:
; GCN_DBG:       ; %bb.0: ; %entry
; GCN_DBG-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN_DBG-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN_DBG-NEXT:    s_mov_b32 s14, -1
; GCN_DBG-NEXT:    s_mov_b32 s15, 0xe8f000
; GCN_DBG-NEXT:    s_add_u32 s12, s12, s11
; GCN_DBG-NEXT:    s_addc_u32 s13, s13, 0
; GCN_DBG-NEXT:    s_load_dword s0, s[4:5], 0x9
; GCN_DBG-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 0
; GCN_DBG-NEXT:    s_mov_b32 s0, 0
; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT:    buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT:    s_branch .LBB3_2
; GCN_DBG-NEXT:  .LBB3_1: ; %for.exit
; GCN_DBG-NEXT:    s_endpgm
; GCN_DBG-NEXT:  .LBB3_2: ; %for.body
; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
; GCN_DBG-NEXT:    buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT:    s_waitcnt vmcnt(0)
; GCN_DBG-NEXT:    v_readlane_b32 s0, v2, 1
; GCN_DBG-NEXT:    v_readlane_b32 s2, v2, 0
; GCN_DBG-NEXT:    s_mov_b32 s1, 2
; GCN_DBG-NEXT:    s_lshl_b32 s1, s0, s1
; GCN_DBG-NEXT:    s_add_i32 s1, s1, s2
; GCN_DBG-NEXT:    s_mov_b32 s2, 0x80
; GCN_DBG-NEXT:    s_add_i32 s1, s1, s2
; GCN_DBG-NEXT:    s_mov_b32 m0, -1
; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT:    ds_read_b32 v0, v0
; GCN_DBG-NEXT:    s_mov_b32 s2, 1.0
; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT:    v_add_f32_e64 v1, v0, s2
; GCN_DBG-NEXT:    s_mov_b32 m0, -1
; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT:    ds_write_b32 v0, v1
; GCN_DBG-NEXT:    s_mov_b32 s1, 1
; GCN_DBG-NEXT:    s_add_i32 s0, s0, s1
; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT:    buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT:    s_cbranch_scc1 .LBB3_1
; GCN_DBG-NEXT:    s_branch .LBB3_2
entry:
  br label %for.body

for.exit:
  ret void

; XXX - Should there be an s_endpgm?
for.body:
  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %tmp = add i32 %indvar, 32
  %arrayidx = getelementptr float, ptr addrspace(3) %ptr, i32 %tmp
  %vecload = load float, ptr addrspace(3) %arrayidx, align 4
  %add = fadd float %vecload, 1.0
  store float %add, ptr addrspace(3) %arrayidx, align 8
  %inc = add i32 %indvar, 1
  br i1 undef, label %for.body, label %for.exit
}

; Back-edge condition is a runtime i1 loaded (volatile) from LDS address
; null in the entry block; the same value is reused every iteration.
define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
; GCN-LABEL: loop_arg_0:
; GCN:       ; %bb.0: ; %entry
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_mov_b32 m0, -1
; GCN-NEXT:    ds_read_u8 v0, v0
; GCN-NEXT:    s_load_dword s4, s[4:5], 0x9
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_readfirstlane_b32 s0, v0
; GCN-NEXT:    s_bitcmp1_b32 s0, 0
; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
; GCN-NEXT:    s_add_i32 s0, s4, 0x80
; GCN-NEXT:    s_and_b64 vcc, exec, s[2:3]
; GCN-NEXT:  .LBB4_1: ; %for.body
; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_b32 v1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT:    ds_write_b32 v0, v1
; GCN-NEXT:    s_add_i32 s0, s0, 4
; GCN-NEXT:    s_mov_b64 vcc, vcc
; GCN-NEXT:    s_cbranch_vccz .LBB4_1
; GCN-NEXT:  ; %bb.2: ; %for.exit
; GCN-NEXT:    s_endpgm
;
; GCN_DBG-LABEL: loop_arg_0:
; GCN_DBG:       ; %bb.0: ; %entry
; GCN_DBG-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN_DBG-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN_DBG-NEXT:    s_mov_b32 s14, -1
; GCN_DBG-NEXT:    s_mov_b32 s15, 0xe8f000
; GCN_DBG-NEXT:    s_add_u32 s12, s12, s11
; GCN_DBG-NEXT:    s_addc_u32 s13, s13, 0
; GCN_DBG-NEXT:    s_load_dword s0, s[4:5], 0x9
; GCN_DBG-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 0
; GCN_DBG-NEXT:    v_mov_b32_e32 v0, 0
; GCN_DBG-NEXT:    s_mov_b32 m0, -1
; GCN_DBG-NEXT:    ds_read_u8 v0, v0
; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT:    v_readfirstlane_b32 s0, v0
; GCN_DBG-NEXT:    s_and_b32 s0, 1, s0
; GCN_DBG-NEXT:    s_cmp_eq_u32 s0, 1
; GCN_DBG-NEXT:    s_cselect_b64 s[0:1], -1, 0
; GCN_DBG-NEXT:    s_mov_b64 s[2:3], -1
; GCN_DBG-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT:    v_writelane_b32 v2, s1, 2
; GCN_DBG-NEXT:    s_mov_b32 s0, 0
; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 3
; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT:    buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT:    s_branch .LBB4_2
; GCN_DBG-NEXT:  .LBB4_1: ; %for.exit
; GCN_DBG-NEXT:    s_endpgm
; GCN_DBG-NEXT:  .LBB4_2: ; %for.body
; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT:    s_waitcnt expcnt(0)
; GCN_DBG-NEXT:    buffer_load_dword v2, off, s[12:15], 0 ; 4-byte Folded Reload
; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT:    s_waitcnt vmcnt(0)
; GCN_DBG-NEXT:    v_readlane_b32 s0, v2, 3
; GCN_DBG-NEXT:    v_readlane_b32 s2, v2, 1
; GCN_DBG-NEXT:    v_readlane_b32 s3, v2, 2
; GCN_DBG-NEXT:    v_readlane_b32 s4, v2, 0
; GCN_DBG-NEXT:    s_mov_b32 s1, 2
; GCN_DBG-NEXT:    s_lshl_b32 s1, s0, s1
; GCN_DBG-NEXT:    s_add_i32 s1, s1, s4
; GCN_DBG-NEXT:    s_mov_b32 s4, 0x80
; GCN_DBG-NEXT:    s_add_i32 s1, s1, s4
; GCN_DBG-NEXT:    s_mov_b32 m0, -1
; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT:    ds_read_b32 v0, v0
; GCN_DBG-NEXT:    s_mov_b32 s4, 1.0
; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT:    v_add_f32_e64 v1, v0, s4
; GCN_DBG-NEXT:    s_mov_b32 m0, -1
; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
; GCN_DBG-NEXT:    ds_write_b32 v0, v1
; GCN_DBG-NEXT:    s_mov_b32 s1, 1
; GCN_DBG-NEXT:    s_add_i32 s0, s0, s1
; GCN_DBG-NEXT:    s_and_b64 vcc, exec, s[2:3]
; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 3
; GCN_DBG-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GCN_DBG-NEXT:    buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill
; GCN_DBG-NEXT:    s_mov_b64 exec, s[6:7]
; GCN_DBG-NEXT:    s_cbranch_vccnz .LBB4_1
; GCN_DBG-NEXT:    s_branch .LBB4_2
entry:
  %cond = load volatile i1, ptr addrspace(3) null
  br label %for.body

for.exit:
  ret void

for.body:
  %indvar = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %tmp = add i32 %indvar, 32
  %arrayidx = getelementptr float, ptr addrspace(3) %ptr, i32 %tmp
  %vecload = load float, ptr addrspace(3) %arrayidx, align 4
  %add = fadd float %vecload, 1.0
  store float %add, ptr addrspace(3) %arrayidx, align 8
  %inc = add i32 %indvar, 1
  br i1 %cond, label %for.body, label %for.exit
}

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}