1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s 3; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=GFX10-W32 %s 4; The functions below carry two sets of checks, one per llc invocation above: gfx900 (GFX9-W64 prefix) and gfx1030 with +wavefrontsize32 (GFX10-W32 prefix). 5; Check that WQM isn't triggered by image load/store intrinsics. 6define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) { 7; GFX9-W64-LABEL: test1: 8; GFX9-W64: ; %bb.0: ; %main_body 9; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0 10; GFX9-W64-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm 11; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 12; GFX9-W64-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm 13; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 14; GFX9-W64-NEXT: ; return to shader part epilog 15; 16; GFX10-W32-LABEL: test1: 17; GFX10-W32: ; %bb.0: ; %main_body 18; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 19; GFX10-W32-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm 20; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 21; GFX10-W32-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm 22; GFX10-W32-NEXT: ; return to shader part epilog 23main_body: 24 %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0) 25 call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0) 26 ret <4 x float> %tex 27} 28 29; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible. 30define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 { 31; GFX9-W64-LABEL: test2: 32; GFX9-W64: ; %bb.0: ; %main_body 33; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec 34; GFX9-W64-NEXT: s_wqm_b64 exec, exec 35; GFX9-W64-NEXT: s_mov_b32 m0, s3 36; GFX9-W64-NEXT: s_nop 0 37; GFX9-W64-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x 38; 
GFX9-W64-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y 39; GFX9-W64-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x 40; GFX9-W64-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y 41; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] 42; GFX9-W64-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf 43; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 44; GFX9-W64-NEXT: ; return to shader part epilog 45; 46; GFX10-W32-LABEL: test2: 47; GFX10-W32: ; %bb.0: ; %main_body 48; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo 49; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 50; GFX10-W32-NEXT: s_mov_b32 m0, s3 51; GFX10-W32-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x 52; GFX10-W32-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y 53; GFX10-W32-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x 54; GFX10-W32-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y 55; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 56; GFX10-W32-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D 57; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 58; GFX10-W32-NEXT: ; return to shader part epilog 59main_body: 60 %inst23 = extractelement <2 x float> %pos, i32 0 61 %inst24 = extractelement <2 x float> %pos, i32 1 62 %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0) 63 %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0) 64 %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0) 65 %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0) 66 %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 67 ret <4 x float> %tex 68} 69 70; ... but disabled for stores (and, in this simple case, not re-enabled) ... 
71define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) { 72; GFX9-W64-LABEL: test3: 73; GFX9-W64: ; %bb.0: ; %main_body 74; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 75; GFX9-W64-NEXT: s_wqm_b64 exec, exec 76; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 77; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 78; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 79; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen 80; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 81; GFX9-W64-NEXT: ; return to shader part epilog 82; 83; GFX10-W32-LABEL: test3: 84; GFX10-W32: ; %bb.0: ; %main_body 85; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 86; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 87; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 88; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 89; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 90; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen 91; GFX10-W32-NEXT: ; return to shader part epilog 92main_body: 93 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 94 %tex.1 = bitcast <4 x float> %tex to <4 x i32> 95 %tex.2 = extractelement <4 x i32> %tex.1, i32 0 96 97 call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0) 98 99 ret <4 x float> %tex 100} 101; Same as test3, but the store goes through llvm.amdgcn.struct.ptr.buffer.store. 102define amdgpu_ps <4 x float> @test3_ptr_buf(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) { 103; GFX9-W64-LABEL: test3_ptr_buf: 104; GFX9-W64: ; %bb.0: ; %main_body 105; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 106; GFX9-W64-NEXT: s_wqm_b64 exec, exec 107; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 108; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 109; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 110; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen 111; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 112; GFX9-W64-NEXT: ; 
return to shader part epilog 113; 114; GFX10-W32-LABEL: test3_ptr_buf: 115; GFX10-W32: ; %bb.0: ; %main_body 116; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 117; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 118; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 119; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 120; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 121; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen 122; GFX10-W32-NEXT: ; return to shader part epilog 123main_body: 124 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 125 %tex.1 = bitcast <4 x float> %tex to <4 x i32> 126 %tex.2 = extractelement <4 x i32> %tex.1, i32 0 127 128 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %tex, ptr addrspace(8) undef, i32 %tex.2, i32 0, i32 0, i32 0) 129 130 ret <4 x float> %tex 131} 132 133; ... and disabled for export. 134define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 { 135; GFX9-W64-LABEL: test3x: 136; GFX9-W64: ; %bb.0: ; %main_body 137; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec 138; GFX9-W64-NEXT: s_wqm_b64 exec, exec 139; GFX9-W64-NEXT: s_mov_b32 m0, s3 140; GFX9-W64-NEXT: s_nop 0 141; GFX9-W64-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x 142; GFX9-W64-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y 143; GFX9-W64-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x 144; GFX9-W64-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y 145; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] 146; GFX9-W64-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf 147; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 148; GFX9-W64-NEXT: exp mrt0 v0, v1, v2, v3 done vm 149; GFX9-W64-NEXT: s_endpgm 150; 151; GFX10-W32-LABEL: test3x: 152; GFX10-W32: ; %bb.0: ; %main_body 153; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo 154; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 155; 
GFX10-W32-NEXT: s_mov_b32 m0, s3 156; GFX10-W32-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x 157; GFX10-W32-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y 158; GFX10-W32-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x 159; GFX10-W32-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y 160; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 161; GFX10-W32-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D 162; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 163; GFX10-W32-NEXT: exp mrt0 v0, v1, v2, v3 done vm 164; GFX10-W32-NEXT: s_endpgm 165main_body: 166 %inst23 = extractelement <2 x float> %pos, i32 0 167 %inst24 = extractelement <2 x float> %pos, i32 1 168 %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0) 169 %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0) 170 %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0) 171 %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0) 172 %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 173 %tex.0 = extractelement <4 x float> %tex, i32 0 174 %tex.1 = extractelement <4 x float> %tex, i32 1 175 %tex.2 = extractelement <4 x float> %tex, i32 2 176 %tex.3 = extractelement <4 x float> %tex, i32 3 177 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true) 178 ret void 179} 180 181; Check that WQM is re-enabled when required. 
182define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) { 183; GFX9-W64-LABEL: test4: 184; GFX9-W64: ; %bb.0: ; %main_body 185; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 186; GFX9-W64-NEXT: s_wqm_b64 exec, exec 187; GFX9-W64-NEXT: v_mul_lo_u32 v4, v0, v1 188; GFX9-W64-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 189; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 190; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 191; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 192; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 193; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen 194; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 195; GFX9-W64-NEXT: ; return to shader part epilog 196; 197; GFX10-W32-LABEL: test4: 198; GFX10-W32: ; %bb.0: ; %main_body 199; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 200; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 201; GFX10-W32-NEXT: v_mul_lo_u32 v4, v0, v1 202; GFX10-W32-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 203; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 204; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 205; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 206; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 207; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen 208; GFX10-W32-NEXT: ; return to shader part epilog 209main_body: 210 %c.1 = mul i32 %c, %d 211 212 call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0) 213 %c.1.bc = bitcast i32 %c.1 to float 214 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 215 %tex0 = extractelement <4 x float> %tex, i32 0 216 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 217 ret <4 x float> %dtex 
218} 219; Same as test4, but the store goes through llvm.amdgcn.struct.ptr.buffer.store. 220define amdgpu_ps <4 x float> @test4_ptr_buf(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) { 221; GFX9-W64-LABEL: test4_ptr_buf: 222; GFX9-W64: ; %bb.0: ; %main_body 223; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 224; GFX9-W64-NEXT: s_wqm_b64 exec, exec 225; GFX9-W64-NEXT: v_mul_lo_u32 v4, v0, v1 226; GFX9-W64-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 227; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 228; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 229; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 230; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 231; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen 232; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 233; GFX9-W64-NEXT: ; return to shader part epilog 234; 235; GFX10-W32-LABEL: test4_ptr_buf: 236; GFX10-W32: ; %bb.0: ; %main_body 237; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 238; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 239; GFX10-W32-NEXT: v_mul_lo_u32 v4, v0, v1 240; GFX10-W32-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 241; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 242; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 243; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 244; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 245; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen 246; GFX10-W32-NEXT: ; return to shader part epilog 247main_body: 248 %c.1 = mul i32 %c, %d 249 250 call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> undef, ptr addrspace(8) undef, i32 %c.1, i32 0, i32 0, i32 0) 251 %c.1.bc = bitcast i32 %c.1 to float 252 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 253 %tex0 = extractelement <4 x float> %tex, i32 0 254 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, 
i32 0, i32 0) #0 255 ret <4 x float> %dtex 256} 257 258; Check that WQM is triggered by the wqm intrinsic. 259; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this 260; does not happen - the v_add should write the return reg directly. 261define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) { 262; GFX9-W64-LABEL: test5: 263; GFX9-W64: ; %bb.0: ; %main_body 264; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 265; GFX9-W64-NEXT: s_wqm_b64 exec, exec 266; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0 267; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 268; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen 269; GFX9-W64-NEXT: s_nop 0 270; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 271; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 272; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1 273; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 274; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] 275; GFX9-W64-NEXT: ; return to shader part epilog 276; 277; GFX10-W32-LABEL: test5: 278; GFX10-W32: ; %bb.0: ; %main_body 279; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo 280; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 281; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0 282; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1 283; GFX10-W32-NEXT: s_clause 0x1 284; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen 285; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 286; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 287; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1 288; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 289; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 290; GFX10-W32-NEXT: ; return to shader part epilog 291main_body: 292 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) 293 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) 294 %out = fadd float %src0, %src1 295 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out) 296 ret float %out.0 297} 298; Same as test5, but the loads go through llvm.amdgcn.struct.ptr.buffer.load. 299define amdgpu_ps 
float @test5_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) { 300; GFX9-W64-LABEL: test5_ptr_buf: 301; GFX9-W64: ; %bb.0: ; %main_body 302; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 303; GFX9-W64-NEXT: s_wqm_b64 exec, exec 304; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0 305; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 306; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen 307; GFX9-W64-NEXT: s_nop 0 308; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 309; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 310; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1 311; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 312; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] 313; GFX9-W64-NEXT: ; return to shader part epilog 314; 315; GFX10-W32-LABEL: test5_ptr_buf: 316; GFX10-W32: ; %bb.0: ; %main_body 317; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo 318; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 319; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0 320; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1 321; GFX10-W32-NEXT: s_clause 0x1 322; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen 323; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 324; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 325; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1 326; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 327; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 328; GFX10-W32-NEXT: ; return to shader part epilog 329main_body: 330 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) 331 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) 332 %out = fadd float %src0, %src1 333 %out.0 = call float @llvm.amdgcn.wqm.f32(float %out) 334 ret float %out.0 335} 336 337; Check that the wqm intrinsic works correctly for integers. 
338define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) { 339; GFX9-W64-LABEL: test6: 340; GFX9-W64: ; %bb.0: ; %main_body 341; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 342; GFX9-W64-NEXT: s_wqm_b64 exec, exec 343; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0 344; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 345; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen 346; GFX9-W64-NEXT: s_nop 0 347; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 348; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 349; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1 350; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 351; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] 352; GFX9-W64-NEXT: ; return to shader part epilog 353; 354; GFX10-W32-LABEL: test6: 355; GFX10-W32: ; %bb.0: ; %main_body 356; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo 357; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 358; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0 359; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1 360; GFX10-W32-NEXT: s_clause 0x1 361; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen 362; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 363; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 364; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1 365; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 366; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 367; GFX10-W32-NEXT: ; return to shader part epilog 368main_body: 369 %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) 370 %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) 371 %out = fadd float %src0, %src1 372 %out.0 = bitcast float %out to i32 373 %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0) 374 %out.2 = bitcast i32 %out.1 to float 375 ret float %out.2 376} 377; Same as test6, but the loads go through llvm.amdgcn.struct.ptr.buffer.load. 378define amdgpu_ps float @test6_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) { 379; GFX9-W64-LABEL: test6_ptr_buf: 380; GFX9-W64: ; %bb.0: ; %main_body 381; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 382; 
GFX9-W64-NEXT: s_wqm_b64 exec, exec 383; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0 384; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1 385; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen 386; GFX9-W64-NEXT: s_nop 0 387; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 388; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 389; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1 390; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 391; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] 392; GFX9-W64-NEXT: ; return to shader part epilog 393; 394; GFX10-W32-LABEL: test6_ptr_buf: 395; GFX10-W32: ; %bb.0: ; %main_body 396; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo 397; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 398; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0 399; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1 400; GFX10-W32-NEXT: s_clause 0x1 401; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen 402; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 403; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 404; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1 405; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 406; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 407; GFX10-W32-NEXT: ; return to shader part epilog 408main_body: 409 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) 410 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) 411 %out = fadd float %src0, %src1 412 %out.0 = bitcast float %out to i32 413 %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0) 414 %out.2 = bitcast i32 %out.1 to float 415 ret float %out.2 416} 417 418; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead. 419 420; Check that WWM is triggered by the wwm intrinsic. 
421define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) { 422; GFX9-W64-LABEL: test_wwm1: 423; GFX9-W64: ; %bb.0: ; %main_body 424; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 425; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 426; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 427; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 428; GFX9-W64-NEXT: s_nop 0 429; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 430; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 431; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2 432; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 433; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 434; GFX9-W64-NEXT: ; return to shader part epilog 435; 436; GFX10-W32-LABEL: test_wwm1: 437; GFX10-W32: ; %bb.0: ; %main_body 438; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 439; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 440; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1 441; GFX10-W32-NEXT: s_clause 0x1 442; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 443; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 444; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 445; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2 446; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 447; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 448; GFX10-W32-NEXT: ; return to shader part epilog 449main_body: 450 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) 451 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) 452 %out = fadd float %src0, %src1 453 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) 454 ret float %out.0 455} 456 457; Same as test_wwm1, but with an integer type (llvm.amdgcn.wwm.i32). 
458define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) { 459; GFX9-W64-LABEL: test_wwm2: 460; GFX9-W64: ; %bb.0: ; %main_body 461; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1 462; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 463; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1 464; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 465; GFX9-W64-NEXT: s_nop 0 466; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 467; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 468; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2 469; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3] 470; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 471; GFX9-W64-NEXT: ; return to shader part epilog 472; 473; GFX10-W32-LABEL: test_wwm2: 474; GFX10-W32: ; %bb.0: ; %main_body 475; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 476; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 477; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1 478; GFX10-W32-NEXT: s_clause 0x1 479; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 480; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 481; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 482; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2 483; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 484; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 485; GFX10-W32-NEXT: ; return to shader part epilog 486main_body: 487 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) 488 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) 489 %src0.0 = bitcast float %src0 to i32 490 %src1.0 = bitcast float %src1 to i32 491 %out = add i32 %src0.0, %src1.0 492 %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out) 493 %out.1 = bitcast i32 %out.0 to float 494 ret float %out.1 495} 496 497; Check that we don't leave WWM on for computations that don't require WWM, 498; since that will lead to clobbering things that aren't supposed to be clobbered 499; in cases like this. 
500; We enforce this by checking that v_add gets emitted in the same block as 501; WWM computations. 502define amdgpu_ps float @test_wwm3(i32 inreg %idx) { 503; GFX9-W64-LABEL: test_wwm3: 504; GFX9-W64: ; %bb.0: ; %main_body 505; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 506; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 507; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 508; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 509; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc 510; GFX9-W64-NEXT: s_cbranch_execz .LBB13_2 511; GFX9-W64-NEXT: ; %bb.1: ; %if 512; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 513; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 514; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 515; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 516; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1 517; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] 518; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 519; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0 520; GFX9-W64-NEXT: .LBB13_2: ; %endif 521; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] 522; GFX9-W64-NEXT: ; return to shader part epilog 523; 524; GFX10-W32-LABEL: test_wwm3: 525; GFX10-W32: ; %bb.0: ; %main_body 526; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 527; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 528; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 529; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 530; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 531; GFX10-W32-NEXT: s_cbranch_execz .LBB13_2 532; GFX10-W32-NEXT: ; %bb.1: ; %if 533; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 534; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 535; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 536; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 537; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1 538; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 539; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 540; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0 541; GFX10-W32-NEXT: .LBB13_2: ; %endif 542; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 543; GFX10-W32-NEXT: ; return to shader part epilog 544main_body: 545 ; use mbcnt to make 
sure the branch is divergent 546 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 547 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 548 %cc = icmp uge i32 %hi, 16 549 br i1 %cc, label %endif, label %if 550; The if block is reached only when %hi is below 16; otherwise control skips straight to endif. 551if: 552 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) 553 %out = fadd float %src, %src 554 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) 555 %out.1 = fadd float %src, %out.0 556 br label %endif 557 558endif: 559 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ] 560 ret float %out.2 561} 562 563; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM 564; write could clobber disabled channels in the non-WWM one. 565; We enforce this by checking that v_mov gets emitted in the same block as 566; WWM computations. 567define amdgpu_ps float @test_wwm4(i32 inreg %idx) { 568; GFX9-W64-LABEL: test_wwm4: 569; GFX9-W64: ; %bb.0: ; %main_body 570; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 571; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 572; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 573; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 574; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc 575; GFX9-W64-NEXT: s_cbranch_execz .LBB14_2 576; GFX9-W64-NEXT: ; %bb.1: ; %if 577; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1 578; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0 579; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 580; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 581; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1 582; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5] 583; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1 584; GFX9-W64-NEXT: .LBB14_2: ; %endif 585; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3] 586; GFX9-W64-NEXT: ; return to shader part epilog 587; 588; GFX10-W32-LABEL: test_wwm4: 589; GFX10-W32: ; %bb.0: ; %main_body 590; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 591; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 592; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 593; GFX10-W32-NEXT: 
v_mov_b32_e32 v0, 0 594; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo 595; GFX10-W32-NEXT: s_cbranch_execz .LBB14_2 596; GFX10-W32-NEXT: ; %bb.1: ; %if 597; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1 598; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 599; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 600; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 601; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1 602; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2 603; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1 604; GFX10-W32-NEXT: .LBB14_2: ; %endif 605; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1 606; GFX10-W32-NEXT: ; return to shader part epilog 607main_body: 608 ; use mbcnt to make sure the branch is divergent 609 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 610 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 611 %cc = icmp uge i32 %hi, 16 612 br i1 %cc, label %endif, label %if 613; The if block is reached only when %hi is below 16; otherwise control skips straight to endif. 614if: 615 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) 616 %out = fadd float %src, %src 617 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) 618 br label %endif 619 620endif: 621 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] 622 ret float %out.1 623} 624 625; Make sure the transition from Exact to WWM then WQM works properly. 
define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_wwm5:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm5:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; Exact part: a plain load/store pair with no wwm or wqm user.
  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; WWM part: the second load and the fadd on it feed @llvm.amdgcn.wwm.
  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src1, %src1
  %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
  ; WQM part: the fadd on the wwm result feeds @llvm.amdgcn.wqm, which is
  ; the returned value.
  %out = fadd float %temp.0, %temp.0
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}

; Check that WWM is turned on correctly across basic block boundaries.
; if..then..endif version
; NOTE: neither RUN line of this file passes the SI-CHECK or VI-CHECK prefix
; to FileCheck, so the four directives below are never evaluated.
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
define amdgpu_ps float @test_wwm6_then() {
; GFX9-W64-LABEL: test_wwm6_then:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB16_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: .LBB16_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm6_then:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB16_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: .LBB16_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; %src0 is defined here but only used by the fadd in %if that feeds wwm.
  %src0 = load volatile float, ptr addrspace(1) undef
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 16
  br i1 %cc, label %endif, label %if

if:
  ; The wwm input combines the value loaded in %main_body with one loaded here.
  %src1 = load volatile float, ptr addrspace(1) undef
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}

; Check that WWM is turned on correctly across basic block boundaries.
; loop version
; NOTE: neither RUN line of this file passes the SI-CHECK or VI-CHECK prefix
; to FileCheck, so the four directives below are never evaluated.
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
define amdgpu_ps float @test_wwm6_loop() {
; GFX9-W64-LABEL: test_wwm6_loop:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0
; GFX9-W64-NEXT: .LBB17_1: ; %loop
; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_cbranch_execnz .LBB17_1
; GFX9-W64-NEXT: ; %bb.2: ; %endloop
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm6_loop:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: s_mov_b32 s0, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
; GFX10-W32-NEXT: .LBB17_1: ; %loop
; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3
; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_cbranch_execnz .LBB17_1
; GFX10-W32-NEXT: ; %bb.2: ; %endloop
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; %src0 is loaded once before the loop and only used by the fadd inside it.
  %src0 = load volatile float, ptr addrspace(1) undef
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  br label %loop

loop:
  ; The trip count starts at the mbcnt result and counts down to zero.
  %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
  ; %src1 is reloaded every iteration; the fadd with %src0 feeds wwm.
  %src1 = load volatile float, ptr addrspace(1) undef
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  %counter.1 = sub i32 %counter, 1
  %cc = icmp ne i32 %counter.1, 0
  br i1 %cc, label %loop, label %endloop

endloop:
  ; The wwm result of the last iteration is the return value.
  ret float %out.0
}

; Check that @llvm.amdgcn.set.inactive disables WWM.
define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
; GFX9-W64-LABEL: test_wwm_set_inactive1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1]
; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
; GFX9-W64-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_endpgm
;
; GFX10-W32-LABEL: test_wwm_set_inactive1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0
; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_endpgm
main_body:
  ; The loaded value is the first operand of set.inactive; the second
  ; operand (0) is the constant paired with it.
  %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
  %src.0 = bitcast float %src to i32
  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
  ; Only the add on the set.inactive result feeds @llvm.amdgcn.wwm.
  %out = add i32 %src.1, %src.1
  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
  ret void
}

; Check that Strict WQM is triggered by the strict_wqm intrinsic.
define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_strict_wqm1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; Both loads and the fadd combining them feed @llvm.amdgcn.strict.wqm,
  ; whose result is returned directly.
  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
  ret float %out.0
}

; Same as above, but with an integer type.
define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_strict_wqm2:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm2:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
  ; Integer variant: the loaded floats are bitcast to i32 so that an integer
  ; add feeds the i32 overload of strict.wqm.
  %src0.0 = bitcast float %src0 to i32
  %src1.0 = bitcast float %src1 to i32
  %out = add i32 %src0.0, %src1.0
  %out.0 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  ret float %out.1
}

; Check that we don't leave Strict WQM on for computations that don't require it,
; since that will lead to clobbering things that aren't supposed to be clobbered
; in cases like this.
; We enforce this by checking that v_add gets emitted in the same block as
; Strict WQM computations.
define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) {
; GFX9-W64-LABEL: test_strict_wqm3:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB21_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0
; GFX9-W64-NEXT: .LBB21_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm3:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB21_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0
; GFX10-W32-NEXT: .LBB21_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 16
  br i1 %cc, label %endif, label %if

if:
  ; %src is used twice: by the fadd that feeds strict.wqm (%out) and by the
  ; fadd after the intrinsic (%out.1), which has no strict.wqm user.
  %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
  %out.1 = fadd float %src, %out.0
  br label %endif

endif:
  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
  ret float %out.2
}

; Check that Strict WQM writes aren't coalesced with non-strict writes, since
; the Strict WQM write could clobber disabled channels in the non-strict one.
; We enforce this by checking that v_mov gets emitted in the same block as
; Strict WQM computations.
define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) {
; GFX9-W64-LABEL: test_strict_wqm4:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB22_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: .LBB22_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm4:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB22_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: .LBB22_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 16
  br i1 %cc, label %endif, label %if

if:
  ; The strict.wqm result flows straight into the phi in %endif, where it is
  ; merged with the constant 0.0 coming from %main_body.
  %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
  %out = fadd float %src, %src
  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}

; Make sure the transition from Exact to Strict WQM then WQM works properly.
define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_strict_wqm5:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm5:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; Exact part: a plain load/store pair with no strict.wqm or wqm user.
  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; Strict WQM part: the second load and the fadd on it feed strict.wqm.
  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src1, %src1
  %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
  ; WQM part: the fadd on the strict.wqm result feeds @llvm.amdgcn.wqm, which
  ; is the returned value.
  %out = fadd float %temp.0, %temp.0
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}

; Check that Strict WQM is turned on correctly across basic block boundaries.
; if..then..endif version
; NOTE: neither RUN line of this file passes the SI-CHECK or VI-CHECK prefix
; to FileCheck, so the four directives below are never evaluated.
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
define amdgpu_ps float @test_strict_wqm6_then() {
; GFX9-W64-LABEL: test_strict_wqm6_then:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB24_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: .LBB24_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm6_then:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: .LBB24_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; %src0 is defined here but only used by the fadd in %if that feeds
  ; strict.wqm.
  %src0 = load volatile float, ptr addrspace(1) undef
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 16
  br i1 %cc, label %endif, label %if

if:
  ; The strict.wqm input combines the value loaded in %main_body with one
  ; loaded here.
  %src1 = load volatile float, ptr addrspace(1) undef
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}

; Check that Strict WQM is turned on correctly across basic block boundaries.
; loop version
; NOTE: neither RUN line of this file passes the SI-CHECK or VI-CHECK prefix
; to FileCheck, so the four directives below are never evaluated.
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
define amdgpu_ps float @test_strict_wqm6_loop() {
; GFX9-W64-LABEL: test_strict_wqm6_loop:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0
; GFX9-W64-NEXT: .LBB25_1: ; %loop
; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_cbranch_execnz .LBB25_1
; GFX9-W64-NEXT: ; %bb.2: ; %endloop
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm6_loop:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: s_mov_b32 s0, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
; GFX10-W32-NEXT: .LBB25_1: ; %loop
; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3
; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_cbranch_execnz .LBB25_1
; GFX10-W32-NEXT: ; %bb.2: ; %endloop
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; %src0 is loaded once before the loop and only used by the fadd inside it.
  %src0 = load volatile float, ptr addrspace(1) undef
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  br label %loop

loop:
  ; The trip count starts at the mbcnt result and counts down to zero.
  %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
  ; %src1 is reloaded every iteration; the fadd with %src0 feeds strict.wqm.
  %src1 = load volatile float, ptr addrspace(1) undef
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
  %counter.1 = sub i32 %counter, 1
  %cc = icmp ne i32 %counter.1, 0
  br i1 %cc, label %loop, label %endloop

endloop:
  ; The strict.wqm result of the last iteration is the return value.
  ret float %out.0
}

; Check that enabling WQM anywhere enables WQM for the set.inactive source.
1304define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) { 1305; GFX9-W64-LABEL: test_set_inactive2: 1306; GFX9-W64: ; %bb.0: ; %main_body 1307; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 1308; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1309; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1 1310; GFX9-W64-NEXT: v_mov_b32_e32 v2, s0 1311; GFX9-W64-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen 1312; GFX9-W64-NEXT: s_nop 0 1313; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen 1314; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec 1315; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec 1316; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] 1317; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1318; GFX9-W64-NEXT: v_add_u32_e32 v1, v2, v1 1319; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen 1320; GFX9-W64-NEXT: s_endpgm 1321; 1322; GFX10-W32-LABEL: test_set_inactive2: 1323; GFX10-W32: ; %bb.0: ; %main_body 1324; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo 1325; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1326; GFX10-W32-NEXT: v_mov_b32_e32 v0, s1 1327; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0 1328; GFX10-W32-NEXT: s_clause 0x1 1329; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen 1330; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 1331; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec 1332; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec 1333; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2 1334; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1335; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2 1336; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen 1337; GFX10-W32-NEXT: s_endpgm 1338main_body: 1339 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) 1340 %src1.0 = bitcast float %src1 to i32 1341 %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef) 1342 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, 
i32 %idx0, i32 0, i32 0, i32 0) 1343 %src0.0 = bitcast float %src0 to i32 1344 %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0) 1345 %out = add i32 %src0.1, %src1.1 1346 %out.0 = bitcast i32 %out to float 1347 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.0, ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) 1348 ret void 1349} 1350 1351; Check a case of one branch of an if-else requiring WQM, the other requiring 1352; exact. 1353; Note: In this particular case, the save-and-restore could be avoided if the 1354; analysis understood that the two branches of the if-else are mutually 1355; exclusive. 1356define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { 1357; GFX9-W64-LABEL: test_control_flow_0: 1358; GFX9-W64: ; %bb.0: ; %main_body 1359; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 1360; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1361; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 1362; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc 1363; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 1364; GFX9-W64-NEXT: s_cbranch_execz .LBB27_2 1365; GFX9-W64-NEXT: ; %bb.1: ; %ELSE 1366; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13] 1367; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen 1368; GFX9-W64-NEXT: ; implicit-def: $vgpr0 1369; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17] 1370; GFX9-W64-NEXT: .LBB27_2: ; %Flow 1371; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15] 1372; GFX9-W64-NEXT: s_cbranch_execz .LBB27_4 1373; GFX9-W64-NEXT: ; %bb.3: ; %IF 1374; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 1375; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1376; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 1377; GFX9-W64-NEXT: .LBB27_4: ; %END 1378; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] 1379; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 1380; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1381; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 1382; GFX9-W64-NEXT: ; return to 
shader part epilog 1383; 1384; GFX10-W32-LABEL: test_control_flow_0: 1385; GFX10-W32: ; %bb.0: ; %main_body 1386; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 1387; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1388; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo 1389; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1 1390; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13 1391; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2 1392; GFX10-W32-NEXT: ; %bb.1: ; %ELSE 1393; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12 1394; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen 1395; GFX10-W32-NEXT: ; implicit-def: $vgpr0 1396; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14 1397; GFX10-W32-NEXT: .LBB27_2: ; %Flow 1398; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13 1399; GFX10-W32-NEXT: s_cbranch_execz .LBB27_4 1400; GFX10-W32-NEXT: ; %bb.3: ; %IF 1401; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 1402; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1403; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 1404; GFX10-W32-NEXT: .LBB27_4: ; %END 1405; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 1406; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 1407; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1408; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 1409; GFX10-W32-NEXT: ; return to shader part epilog 1410main_body: 1411 %cmp = icmp eq i32 %z, 0 1412 br i1 %cmp, label %IF, label %ELSE 1413 1414IF: 1415 %c.bc = bitcast i32 %c to float 1416 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 1417 %tex0 = extractelement <4 x float> %tex, i32 0 1418 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 1419 %data.if = extractelement <4 x float> %dtex, i32 0 1420 br label %END 1421 1422ELSE: 1423 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 
0, i32 0, i32 0) 1424 br label %END 1425 1426END: 1427 %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] 1428 ret float %r 1429} 1430 1431; Reverse branch order compared to the previous test. The icmp-true edge below targets %ELSE (the buffer store) and the false edge targets %IF (the two image samples). 1432define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) { 1433; GFX9-W64-LABEL: test_control_flow_1: 1434; GFX9-W64: ; %bb.0: ; %main_body 1435; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 1436; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1437; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 1438; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc 1439; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 1440; GFX9-W64-NEXT: s_cbranch_execz .LBB28_2 1441; GFX9-W64-NEXT: ; %bb.1: ; %IF 1442; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 1443; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1444; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 1445; GFX9-W64-NEXT: ; implicit-def: $vgpr0 1446; GFX9-W64-NEXT: .LBB28_2: ; %Flow 1447; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], s[14:15] 1448; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 1449; GFX9-W64-NEXT: s_and_b64 s[0:1], exec, s[0:1] 1450; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[0:1] 1451; GFX9-W64-NEXT: s_cbranch_execz .LBB28_4 1452; GFX9-W64-NEXT: ; %bb.3: ; %ELSE 1453; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1454; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen 1455; GFX9-W64-NEXT: .LBB28_4: ; %END 1456; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] 1457; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1458; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2 1459; GFX9-W64-NEXT: ; return to shader part epilog 1460; 1461; GFX10-W32-LABEL: test_control_flow_1: 1462; GFX10-W32: ; %bb.0: ; %main_body 1463; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 1464; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1465; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo 1466; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1 1467; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13 1468; GFX10-W32-NEXT: s_cbranch_execz .LBB28_2 1469;
GFX10-W32-NEXT: ; %bb.1: ; %IF 1470; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 1471; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1472; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 1473; GFX10-W32-NEXT: ; implicit-def: $vgpr0 1474; GFX10-W32-NEXT: .LBB28_2: ; %Flow 1475; GFX10-W32-NEXT: s_or_saveexec_b32 s0, s13 1476; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 1477; GFX10-W32-NEXT: s_and_b32 s0, exec_lo, s0 1478; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 1479; GFX10-W32-NEXT: s_cbranch_execz .LBB28_4 1480; GFX10-W32-NEXT: ; %bb.3: ; %ELSE 1481; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1482; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen 1483; GFX10-W32-NEXT: .LBB28_4: ; %END 1484; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 1485; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1486; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2 1487; GFX10-W32-NEXT: ; return to shader part epilog 1488main_body: 1489 %cmp = icmp eq i32 %z, 0 1490 br i1 %cmp, label %ELSE, label %IF ; z == 0 selects the store-only ELSE block, anything else the sampling IF block 1491 1492IF: 1493 %c.bc = bitcast i32 %c to float 1494 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 1495 %tex0 = extractelement <4 x float> %tex, i32 0 1496 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ; second sample uses element 0 of the first sample as its coordinate 1497 %data.if = extractelement <4 x float> %dtex, i32 0 1498 br label %END 1499 1500ELSE: 1501 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0) ; the only store in this function 1502 br label %END 1503 1504END: 1505 %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] ; sampled value from IF, or %data passed through unchanged from ELSE 1506 ret float %r 1507} 1508 1509; Check that branch conditions are properly marked as needing WQM...
1510define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) { 1511; GFX9-W64-LABEL: test_control_flow_2: 1512; GFX9-W64: ; %bb.0: ; %main_body 1513; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 1514; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1515; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 1516; GFX9-W64-NEXT: buffer_store_dword v3, v0, s[0:3], 0 idxen 1517; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1518; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen 1519; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 1520; GFX9-W64-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen 1521; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1522; GFX9-W64-NEXT: s_waitcnt vmcnt(1) 1523; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 1524; GFX9-W64-NEXT: ; implicit-def: $vgpr0 1525; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc 1526; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 1527; GFX9-W64-NEXT: ; %bb.1: ; %ELSE 1528; GFX9-W64-NEXT: v_lshlrev_b32_e32 v0, 2, v5 1529; GFX9-W64-NEXT: ; implicit-def: $vgpr5 1530; GFX9-W64-NEXT: ; %bb.2: ; %Flow 1531; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15] 1532; GFX9-W64-NEXT: ; %bb.3: ; %IF 1533; GFX9-W64-NEXT: v_lshl_add_u32 v0, v5, 1, v5 1534; GFX9-W64-NEXT: ; %bb.4: ; %END 1535; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] 1536; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 1537; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 1538; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1539; GFX9-W64-NEXT: ; return to shader part epilog 1540; 1541; GFX10-W32-LABEL: test_control_flow_2: 1542; GFX10-W32: ; %bb.0: ; %main_body 1543; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 1544; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1545; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 1546; GFX10-W32-NEXT: buffer_store_dword v3, v0, s[0:3], 0 idxen 1547; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1548; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen 1549;
GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1550; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v0 1551; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 1552; GFX10-W32-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen 1553; GFX10-W32-NEXT: ; implicit-def: $vgpr0 1554; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1555; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo 1556; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13 1557; GFX10-W32-NEXT: ; %bb.1: ; %ELSE 1558; GFX10-W32-NEXT: v_lshlrev_b32_e32 v0, 2, v5 1559; GFX10-W32-NEXT: ; implicit-def: $vgpr5 1560; GFX10-W32-NEXT: ; %bb.2: ; %Flow 1561; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13 1562; GFX10-W32-NEXT: ; %bb.3: ; %IF 1563; GFX10-W32-NEXT: v_lshl_add_u32 v0, v5, 1, v5 1564; GFX10-W32-NEXT: ; %bb.4: ; %END 1565; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 1566; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 1567; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 1568; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1569; GFX10-W32-NEXT: ; return to shader part epilog 1570main_body: 1571 %idx.1 = extractelement <3 x i32> %idx, i32 0 1572 %data.1 = extractelement <2 x float> %data, i32 0 1573 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.1, ptr addrspace(8) undef, i32 %idx.1, i32 0, i32 0, i32 0) ; first store, ahead of the load 1574 1575 ; The load that determines the branch (and should therefore be WQM) is 1576 ; surrounded by stores that require disabled WQM.
1577 %idx.2 = extractelement <3 x i32> %idx, i32 1 1578 %z = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx.2, i32 0, i32 0, i32 0) ; %z is consumed only by the fcmp that feeds the branch 1579 1580 %idx.3 = extractelement <3 x i32> %idx, i32 2 1581 %data.3 = extractelement <2 x float> %data, i32 1 1582 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.3, ptr addrspace(8) undef, i32 %idx.3, i32 0, i32 0, i32 0) ; second store, after the load 1583 1584 %cc = fcmp ogt float %z, 0.0 1585 br i1 %cc, label %IF, label %ELSE ; branch on the loaded value 1586 1587IF: 1588 %coord.IF = mul i32 %coord, 3 1589 br label %END 1590 1591ELSE: 1592 %coord.ELSE = mul i32 %coord, 4 1593 br label %END 1594 1595END: 1596 %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ] ; both arms only scale %coord (by 3 or by 4) 1597 %coord.END.bc = bitcast i32 %coord.END to float 1598 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ; the branch-selected coordinate is the sample input 1599 ret <4 x float> %tex 1600} 1601 1602; ... but only if they really do need it.
1603define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) { 1604; GFX9-W64-LABEL: test_control_flow_3: 1605; GFX9-W64: ; %bb.0: ; %main_body 1606; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 1607; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1608; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 1609; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 1610; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1611; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 1612; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1613; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1 1614; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen 1615; GFX9-W64-NEXT: ; implicit-def: $vgpr0 1616; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 1617; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 1618; GFX9-W64-NEXT: s_cbranch_execnz .LBB30_3 1619; GFX9-W64-NEXT: ; %bb.1: ; %Flow 1620; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 1621; GFX9-W64-NEXT: s_cbranch_execnz .LBB30_4 1622; GFX9-W64-NEXT: .LBB30_2: ; %END 1623; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] 1624; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1625; GFX9-W64-NEXT: s_branch .LBB30_5 1626; GFX9-W64-NEXT: .LBB30_3: ; %ELSE 1627; GFX9-W64-NEXT: v_mul_f32_e32 v0, 4.0, v1 1628; GFX9-W64-NEXT: ; implicit-def: $vgpr1 1629; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 1630; GFX9-W64-NEXT: s_cbranch_execz .LBB30_2 1631; GFX9-W64-NEXT: .LBB30_4: ; %IF 1632; GFX9-W64-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 1633; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1] 1634; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1635; GFX9-W64-NEXT: s_branch .LBB30_5 1636; GFX9-W64-NEXT: .LBB30_5: 1637; 1638; GFX10-W32-LABEL: test_control_flow_3: 1639; GFX10-W32: ; %bb.0: ; %main_body 1640; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 1641; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1642; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 1643; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 1644;
GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1645; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 1646; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1647; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen 1648; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo 1649; GFX10-W32-NEXT: ; implicit-def: $vgpr0 1650; GFX10-W32-NEXT: v_cmpx_nlt_f32_e32 0, v1 1651; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0 1652; GFX10-W32-NEXT: s_cbranch_execnz .LBB30_3 1653; GFX10-W32-NEXT: ; %bb.1: ; %Flow 1654; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0 1655; GFX10-W32-NEXT: s_cbranch_execnz .LBB30_4 1656; GFX10-W32-NEXT: .LBB30_2: ; %END 1657; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 1658; GFX10-W32-NEXT: s_branch .LBB30_5 1659; GFX10-W32-NEXT: .LBB30_3: ; %ELSE 1660; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1 1661; GFX10-W32-NEXT: ; implicit-def: $vgpr1 1662; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0 1663; GFX10-W32-NEXT: s_cbranch_execz .LBB30_2 1664; GFX10-W32-NEXT: .LBB30_4: ; %IF 1665; GFX10-W32-NEXT: v_mul_f32_e32 v0, 0x40400000, v1 1666; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0 1667; GFX10-W32-NEXT: s_branch .LBB30_5 1668; GFX10-W32-NEXT: .LBB30_5: 1669main_body: 1670 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 1671 %tex0 = extractelement <4 x float> %tex, i32 0 1672 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ; chained sample: its coordinate is element 0 of the previous sample 1673 %dtex.1 = extractelement <4 x float> %dtex, i32 0 1674 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %dtex.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) ; stores the final sample result 1675 1676 %cc = fcmp ogt float %dtex.1, 0.0 1677 br i1 %cc, label %IF, label %ELSE ; the condition derives from %dtex.1, which no image sample in this function consumes 1678 1679IF: 1680 %tex.IF = fmul float %dtex.1, 3.0 1681 br label %END 1682 1683ELSE: 1684 %tex.ELSE = fmul float %dtex.1, 4.0 1685 br label %END 1686 1687END: 1688 %tex.END = phi
float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ] ; %dtex.1 scaled by 3.0 (IF) or by 4.0 (ELSE) 1689 ret float %tex.END 1690} 1691 1692; Another test that failed at some point because of terminator handling. The %IF block holds only a buffer load/store pair; both image samples live in %END. 1693define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) { 1694; GFX9-W64-LABEL: test_control_flow_4: 1695; GFX9-W64: ; %bb.0: ; %main_body 1696; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 1697; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1698; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 1699; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc 1700; GFX9-W64-NEXT: s_cbranch_execz .LBB31_2 1701; GFX9-W64-NEXT: ; %bb.1: ; %IF 1702; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13] 1703; GFX9-W64-NEXT: buffer_load_dword v1, off, s[0:3], 0 1704; GFX9-W64-NEXT: v_mov_b32_e32 v2, 1 1705; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1706; GFX9-W64-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen 1707; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17] 1708; GFX9-W64-NEXT: .LBB31_2: ; %END 1709; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15] 1710; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 1711; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 1712; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1713; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 1714; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1715; GFX9-W64-NEXT: ; return to shader part epilog 1716; 1717; GFX10-W32-LABEL: test_control_flow_4: 1718; GFX10-W32: ; %bb.0: ; %main_body 1719; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 1720; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1721; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo 1722; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1 1723; GFX10-W32-NEXT: s_cbranch_execz .LBB31_2 1724; GFX10-W32-NEXT: ; %bb.1: ; %IF 1725; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12 1726; GFX10-W32-NEXT: buffer_load_dword v1, off, s[0:3], 0 1727; GFX10-W32-NEXT: v_mov_b32_e32 v2, 1 1728; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1729; GFX10-W32-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen 1730;
GFX10-W32-NEXT: s_mov_b32 exec_lo, s14 1731; GFX10-W32-NEXT: .LBB31_2: ; %END 1732; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13 1733; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 1734; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 1735; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1736; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 1737; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1738; GFX10-W32-NEXT: ; return to shader part epilog 1739main_body: ; note: the %z argument is never referenced in this body 1740 %cond = icmp eq i32 %y, 0 1741 br i1 %cond, label %IF, label %END ; y != 0 skips IF and goes straight to the samples in END 1742 1743IF: 1744 %data = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 0, i32 0, i32 0) ; raw buffer load whose result is only used by the store below 1745 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 1, i32 0, i32 0, i32 0) ; stores the loaded value at constant index 1 1746 br label %END 1747 1748END: 1749 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 1750 %tex0 = extractelement <4 x float> %tex, i32 0 1751 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ; dependent sample: its coordinate is %tex0 1752 ret <4 x float> %dtex 1753} 1754 1755; Kill is performed in WQM mode so that uniform kill behaves correctly ...
1756define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) { 1757; GFX9-W64-LABEL: test_kill_0: 1758; GFX9-W64: ; %bb.0: ; %main_body 1759; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 1760; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1761; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 1762; GFX9-W64-NEXT: image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf 1763; GFX9-W64-NEXT: s_nop 0 1764; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen 1765; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1766; GFX9-W64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v6 1767; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], vcc 1768; GFX9-W64-NEXT: s_cbranch_scc0 .LBB32_2 1769; GFX9-W64-NEXT: ; %bb.1: ; %main_body 1770; GFX9-W64-NEXT: s_andn2_b64 exec, exec, vcc 1771; GFX9-W64-NEXT: image_sample v0, v5, s[0:7], s[8:11] dmask:0x1 1772; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 1773; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1774; GFX9-W64-NEXT: image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf 1775; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1776; GFX9-W64-NEXT: v_add_f32_e32 v0, v7, v11 1777; GFX9-W64-NEXT: buffer_store_dword v3, v1, s[0:3], 0 idxen 1778; GFX9-W64-NEXT: v_add_f32_e32 v1, v8, v12 1779; GFX9-W64-NEXT: v_add_f32_e32 v2, v9, v13 1780; GFX9-W64-NEXT: v_add_f32_e32 v3, v10, v14 1781; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1782; GFX9-W64-NEXT: s_branch .LBB32_3 1783; GFX9-W64-NEXT: .LBB32_2: 1784; GFX9-W64-NEXT: s_mov_b64 exec, 0 1785; GFX9-W64-NEXT: exp null off, off, off, off done vm 1786; GFX9-W64-NEXT: s_endpgm 1787; GFX9-W64-NEXT: .LBB32_3: 1788; 1789; GFX10-W32-LABEL: test_kill_0: 1790; GFX10-W32: ; %bb.0: ; %main_body 1791; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 1792; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1793; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 1794; GFX10-W32-NEXT: image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 1795;
GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen 1796; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1797; GFX10-W32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v6 1798; GFX10-W32-NEXT: s_andn2_b32 s12, s12, vcc_lo 1799; GFX10-W32-NEXT: s_cbranch_scc0 .LBB32_2 1800; GFX10-W32-NEXT: ; %bb.1: ; %main_body 1801; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo 1802; GFX10-W32-NEXT: image_sample v0, v5, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 1803; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 1804; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1805; GFX10-W32-NEXT: image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 1806; GFX10-W32-NEXT: buffer_store_dword v3, v1, s[0:3], 0 idxen 1807; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1808; GFX10-W32-NEXT: v_add_f32_e32 v4, v8, v12 1809; GFX10-W32-NEXT: v_add_f32_e32 v5, v10, v14 1810; GFX10-W32-NEXT: v_add_f32_e32 v0, v7, v11 1811; GFX10-W32-NEXT: v_add_f32_e32 v2, v9, v13 1812; GFX10-W32-NEXT: v_mov_b32_e32 v1, v4 1813; GFX10-W32-NEXT: v_mov_b32_e32 v3, v5 1814; GFX10-W32-NEXT: s_branch .LBB32_3 1815; GFX10-W32-NEXT: .LBB32_2: 1816; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0 1817; GFX10-W32-NEXT: exp null off, off, off, off done vm 1818; GFX10-W32-NEXT: s_endpgm 1819; GFX10-W32-NEXT: .LBB32_3: 1820main_body: ; note: the inreg %ptr argument is never referenced in this body 1821 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 1822 %idx.0 = extractelement <2 x i32> %idx, i32 0 1823 %data.0 = extractelement <2 x float> %data, i32 0 1824 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.0, ptr addrspace(8) undef, i32 %idx.0, i32 0, i32 0, i32 0) ; store issued before the kill 1825 1826 %z.cmp = fcmp olt float %z, 0.0 1827 call void @llvm.amdgcn.kill(i1 %z.cmp) ; the kill sits between the two stores, with image samples on both sides of it 1828 1829 %idx.1 = extractelement <2 x i32> %idx, i32 1 1830 %data.1 = extractelement <2 x float> %data, i32 1 1831 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.1, ptr addrspace(8) undef, i32 %idx.1, i32 0, i32 0, i32 0) ; store issued after the kill 1832 %tex2
= call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 1833 %tex2.0 = extractelement <4 x float> %tex2, i32 0 1834 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ; dependent sample issued after the kill 1835 %out = fadd <4 x float> %tex, %dtex ; combines the pre-kill sample with the post-kill dependent sample 1836 1837 ret <4 x float> %out 1838} 1839 1840; ... but only if WQM is necessary. In this variant both samples precede the kill and nothing after the kill samples. 1841define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { 1842; GFX9-W64-LABEL: test_kill_1: 1843; GFX9-W64: ; %bb.0: ; %main_body 1844; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec 1845; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1846; GFX9-W64-NEXT: v_mov_b32_e32 v5, v0 1847; GFX9-W64-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1 1848; GFX9-W64-NEXT: v_mov_b32_e32 v4, v2 1849; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13] 1850; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1851; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 1852; GFX9-W64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v4 1853; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], vcc 1854; GFX9-W64-NEXT: buffer_store_dword v5, off, s[0:3], 0 1855; GFX9-W64-NEXT: s_cbranch_scc0 .LBB33_2 1856; GFX9-W64-NEXT: ; %bb.1: ; %main_body 1857; GFX9-W64-NEXT: s_andn2_b64 exec, exec, vcc 1858; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1859; GFX9-W64-NEXT: s_branch .LBB33_3 1860; GFX9-W64-NEXT: .LBB33_2: 1861; GFX9-W64-NEXT: s_mov_b64 exec, 0 1862; GFX9-W64-NEXT: exp null off, off, off, off done vm 1863; GFX9-W64-NEXT: s_endpgm 1864; GFX9-W64-NEXT: .LBB33_3: 1865; 1866; GFX10-W32-LABEL: test_kill_1: 1867; GFX10-W32: ; %bb.0: ; %main_body 1868; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo 1869; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1870; GFX10-W32-NEXT: v_mov_b32_e32 v5, v0 1871; GFX10-W32-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 1872;
GFX10-W32-NEXT: v_mov_b32_e32 v4, v2 1873; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12 1874; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1875; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 1876; GFX10-W32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v4 1877; GFX10-W32-NEXT: buffer_store_dword v5, off, s[0:3], 0 1878; GFX10-W32-NEXT: s_andn2_b32 s12, s12, vcc_lo 1879; GFX10-W32-NEXT: s_cbranch_scc0 .LBB33_2 1880; GFX10-W32-NEXT: ; %bb.1: ; %main_body 1881; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo 1882; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1883; GFX10-W32-NEXT: s_branch .LBB33_3 1884; GFX10-W32-NEXT: .LBB33_2: 1885; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0 1886; GFX10-W32-NEXT: exp null off, off, off, off done vm 1887; GFX10-W32-NEXT: s_endpgm 1888; GFX10-W32-NEXT: .LBB33_3: 1889main_body: ; note: the %idx and %coord2 arguments are never referenced in this body 1890 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 1891 %tex0 = extractelement <4 x float> %tex, i32 0 1892 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ; dependent sample; both samples come before the store and the kill 1893 1894 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 0, i32 0, i32 0) ; raw buffer store placed ahead of the kill 1895 1896 %z.cmp = fcmp olt float %z, 0.0 1897 call void @llvm.amdgcn.kill(i1 %z.cmp) ; last call before returning %dtex; no sample follows it 1898 1899 ret <4 x float> %dtex 1900} 1901 1902; Check prolog shaders.
1903define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 { 1904; GFX9-W64-LABEL: test_prolog_1: 1905; GFX9-W64: ; %bb.0: ; %main_body 1906; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec 1907; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1908; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1 1909; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] 1910; GFX9-W64-NEXT: ; return to shader part epilog 1911; 1912; GFX10-W32-LABEL: test_prolog_1: 1913; GFX10-W32: ; %bb.0: ; %main_body 1914; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo 1915; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1916; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1 1917; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 1918; GFX10-W32-NEXT: ; return to shader part epilog 1919main_body: 1920 %s = fadd float %a, %b ; the body is a lone fadd with no sample or store, yet the expected output brackets it with s_wqm / s_and on exec. NOTE(review): attribute group #5 is defined outside this excerpt -- presumably it is what requests that; confirm 1921 ret float %s 1922} 1923 1924define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind { 1925; GFX9-W64-LABEL: test_loop_vcc: 1926; GFX9-W64: ; %bb.0: ; %entry 1927; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec 1928; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1929; GFX9-W64-NEXT: v_mov_b32_e32 v7, v3 1930; GFX9-W64-NEXT: v_mov_b32_e32 v6, v2 1931; GFX9-W64-NEXT: v_mov_b32_e32 v5, v1 1932; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0 1933; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] 1934; GFX9-W64-NEXT: image_store v[4:7], v0, s[0:7] dmask:0xf unorm 1935; GFX9-W64-NEXT: s_wqm_b64 exec, exec 1936; GFX9-W64-NEXT: v_mov_b32_e32 v8, 0 1937; GFX9-W64-NEXT: s_mov_b32 s4, 0x40e00000 1938; GFX9-W64-NEXT: s_branch .LBB35_2 1939; GFX9-W64-NEXT: .LBB35_1: ; %body 1940; GFX9-W64-NEXT: ; in Loop: Header=BB35_2 Depth=1 1941; GFX9-W64-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf 1942; GFX9-W64-NEXT: v_add_f32_e32 v8, 2.0, v8 1943; GFX9-W64-NEXT: s_cbranch_execz .LBB35_4 1944; GFX9-W64-NEXT: .LBB35_2: ; %loop 1945; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 1946; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1947; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4 1948; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8 1949; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5 1950;
GFX9-W64-NEXT: v_mov_b32_e32 v2, v6 1951; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7 1952; GFX9-W64-NEXT: s_cbranch_vccz .LBB35_1 1953; GFX9-W64-NEXT: ; %bb.3: 1954; GFX9-W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 1955; GFX9-W64-NEXT: ; implicit-def: $vgpr8 1956; GFX9-W64-NEXT: .LBB35_4: ; %break 1957; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] 1958; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 1959; GFX9-W64-NEXT: ; return to shader part epilog 1960; 1961; GFX10-W32-LABEL: test_loop_vcc: 1962; GFX10-W32: ; %bb.0: ; %entry 1963; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo 1964; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1965; GFX10-W32-NEXT: v_mov_b32_e32 v8, 0 1966; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 1967; GFX10-W32-NEXT: image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm 1968; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 1969; GFX10-W32-NEXT: s_branch .LBB35_2 1970; GFX10-W32-NEXT: .p2align 6 1971; GFX10-W32-NEXT: .LBB35_1: ; %body 1972; GFX10-W32-NEXT: ; in Loop: Header=BB35_2 Depth=1 1973; GFX10-W32-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D 1974; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8 1975; GFX10-W32-NEXT: s_cbranch_execz .LBB35_4 1976; GFX10-W32-NEXT: .LBB35_2: ; %loop 1977; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 1978; GFX10-W32-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8 1979; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1980; GFX10-W32-NEXT: v_mov_b32_e32 v7, v3 1981; GFX10-W32-NEXT: v_mov_b32_e32 v6, v2 1982; GFX10-W32-NEXT: v_mov_b32_e32 v5, v1 1983; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 1984; GFX10-W32-NEXT: s_cbranch_vccz .LBB35_1 1985; GFX10-W32-NEXT: ; %bb.3: 1986; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 1987; GFX10-W32-NEXT: ; implicit-def: $vgpr8 1988; GFX10-W32-NEXT: .LBB35_4: ; %break 1989; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 1990; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 1991; GFX10-W32-NEXT: v_mov_b32_e32 v0, v4 1992; GFX10-W32-NEXT: v_mov_b32_e32 v1, v5 1993; GFX10-W32-NEXT:
v_mov_b32_e32 v2, v6 1994; GFX10-W32-NEXT: v_mov_b32_e32 v3, v7 1995; GFX10-W32-NEXT: ; return to shader part epilog 1996entry: 1997 call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0) ; image store of %in ahead of the loop 1998 br label %loop 1999 2000loop: 2001 %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ] 2002 %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ] ; loop-carried vector: %in on entry, the previous iteration's sample afterwards 2003 %cc = fcmp ogt float %ctr.iv, 7.0 2004 br i1 %cc, label %break, label %body ; leave the loop once the counter exceeds 7.0 2005 2006body: 2007 %c.iv0 = extractelement <4 x float> %c.iv, i32 0 2008 %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 ; samples at element 0 of the carried vector 2009 %ctr.next = fadd float %ctr.iv, 2.0 ; counter advances by 2.0 per iteration 2010 br label %loop 2011 2012break: 2013 ret <4 x float> %c.iv 2014} 2015 2016; Only intrinsic stores need exact execution -- other stores do not have 2017; externally visible effects and may require WQM for correctness. 2018define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { 2019; GFX9-W64-LABEL: test_alloca: 2020; GFX9-W64: ; %bb.0: ; %entry 2021; GFX9-W64-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 2022; GFX9-W64-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 2023; GFX9-W64-NEXT: s_mov_b32 s10, -1 2024; GFX9-W64-NEXT: s_mov_b32 s11, 0xe00000 2025; GFX9-W64-NEXT: s_add_u32 s8, s8, s0 2026; GFX9-W64-NEXT: s_addc_u32 s9, s9, 0 2027; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec 2028; GFX9-W64-NEXT: s_wqm_b64 exec, exec 2029; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] 2030; GFX9-W64-NEXT: buffer_store_dword v0, off, s[0:3], 0 2031; GFX9-W64-NEXT: s_wqm_b64 exec, exec 2032; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0 2033; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2034; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 2035; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, v1 2036; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen 2037; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] 2038; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2039;
GFX9-W64-NEXT: image_sample v[1:4], v1, s[0:7], s[0:3] dmask:0xf 2040; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1 2041; GFX9-W64-NEXT: buffer_store_dword v0, v5, s[0:3], 0 idxen 2042; GFX9-W64-NEXT: s_waitcnt vmcnt(1) 2043; GFX9-W64-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 2044; GFX9-W64-NEXT: s_endpgm 2045; 2046; GFX10-W32-LABEL: test_alloca: 2047; GFX10-W32: ; %bb.0: ; %entry 2048; GFX10-W32-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 2049; GFX10-W32-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 2050; GFX10-W32-NEXT: s_mov_b32 s10, -1 2051; GFX10-W32-NEXT: s_mov_b32 s11, 0x31c16000 2052; GFX10-W32-NEXT: s_add_u32 s8, s8, s0 2053; GFX10-W32-NEXT: s_addc_u32 s9, s9, 0 2054; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo 2055; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 2056; GFX10-W32-NEXT: v_lshl_add_u32 v2, v2, 2, 0 2057; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 2058; GFX10-W32-NEXT: buffer_store_dword v0, off, s[0:3], 0 2059; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 2060; GFX10-W32-NEXT: buffer_store_dword v1, off, s[8:11], 0 2061; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 2062; GFX10-W32-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen 2063; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 2064; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2065; GFX10-W32-NEXT: image_sample v[1:4], v1, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D 2066; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1 2067; GFX10-W32-NEXT: buffer_store_dword v0, v5, s[0:3], 0 idxen 2068; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2069; GFX10-W32-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 2070; GFX10-W32-NEXT: s_endpgm 2071entry: 2072 %array = alloca [32 x i32], align 4, addrspace(5) 2073 2074 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 0, i32 0, i32 0) ; first intrinsic buffer store 2075 2076 store volatile i32 %a, ptr addrspace(5) %array, align 4 ; plain IR store into the alloca, placed between two intrinsic buffer stores 2077 2078 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 1, i32 0, i32 0, i32 0) ; second intrinsic buffer store 2079 2080 %c.gep = getelementptr [32 x i32], ptr
addrspace(5) %array, i32 0, i32 %idx 2081 %c = load i32, ptr addrspace(5) %c.gep, align 4 ; reads the alloca back at the dynamic index %idx 2082 %c.bc = bitcast i32 %c to float 2083 %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 ; the value reloaded from the alloca is the sample coordinate 2084 call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %t, ptr addrspace(8) undef, i32 0, i32 0, i32 0) ; intrinsic store of the sample result 2085 2086 ret void 2087} 2088 2089; Must return to exact at the end of a non-void returning shader, 2090; otherwise the EXEC mask exported by the epilog will be wrong. This is true 2091; even if the shader has no kills, because a kill could have happened in a 2092; previous shader fragment. 2093define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { 2094; GFX9-W64-LABEL: test_nonvoid_return: 2095; GFX9-W64: ; %bb.0: 2096; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec 2097; GFX9-W64-NEXT: s_wqm_b64 exec, exec 2098; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 2099; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] 2100; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2101; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf 2102; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2103; GFX9-W64-NEXT: ; return to shader part epilog 2104; 2105; GFX10-W32-LABEL: test_nonvoid_return: 2106; GFX10-W32: ; %bb.0: 2107; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo 2108; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 2109; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D 2110; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 2111; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2112; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D 2113; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2114; GFX10-W32-NEXT: ; return to shader part epilog 2115 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 ; coordinate, resource and sampler operands are all undef 2116 %tex0 = extractelement <4 x float> %tex, i32 0 2117 %dtex = call <4 x float>
@llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 2118 ret <4 x float> %dtex 2119} 2120 2121define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind { 2122; GFX9-W64-LABEL: test_nonvoid_return_unreachable: 2123; GFX9-W64: ; %bb.0: ; %entry 2124; GFX9-W64-NEXT: s_wqm_b64 exec, exec 2125; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 2126; GFX9-W64-NEXT: s_and_b64 exec, exec, exec 2127; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2128; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf 2129; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1 2130; GFX9-W64-NEXT: s_cbranch_scc0 .LBB38_2 2131; GFX9-W64-NEXT: ; %bb.1: ; %else 2132; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2133; GFX9-W64-NEXT: s_branch .LBB38_3 2134; GFX9-W64-NEXT: .LBB38_2: ; %if 2135; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2136; GFX9-W64-NEXT: global_store_dwordx4 v[0:1], v[0:3], off 2137; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2138; GFX9-W64-NEXT: .LBB38_3: 2139; 2140; GFX10-W32-LABEL: test_nonvoid_return_unreachable: 2141; GFX10-W32: ; %bb.0: ; %entry 2142; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 2143; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D 2144; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, exec_lo 2145; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2146; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D 2147; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1 2148; GFX10-W32-NEXT: s_cbranch_scc0 .LBB38_2 2149; GFX10-W32-NEXT: ; %bb.1: ; %else 2150; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2151; GFX10-W32-NEXT: s_branch .LBB38_3 2152; GFX10-W32-NEXT: .LBB38_2: ; %if 2153; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2154; GFX10-W32-NEXT: global_store_dwordx4 v[0:1], v[0:3], off 2155; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0 2156; GFX10-W32-NEXT: .LBB38_3: 2157entry: 2158 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> 
undef, i1 false, i32 0, i32 0) #0 2159 %tex0 = extractelement <4 x float> %tex, i32 0 2160 %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 2161 %cc = icmp sgt i32 %c, 0 2162 br i1 %cc, label %if, label %else 2163 2164if: 2165 store volatile <4 x float> %dtex, ptr addrspace(1) undef 2166 unreachable 2167 2168else: 2169 ret <4 x float> %dtex 2170} 2171 2172; Test awareness that s_wqm_b64 clobbers SCC. 2173define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 { 2174; GFX9-W64-LABEL: test_scc: 2175; GFX9-W64: ; %bb.0: ; %main_body 2176; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec 2177; GFX9-W64-NEXT: s_wqm_b64 exec, exec 2178; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0 2179; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1 2180; GFX9-W64-NEXT: s_cbranch_scc0 .LBB39_2 2181; GFX9-W64-NEXT: ; %bb.1: ; %else 2182; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 2183; GFX9-W64-NEXT: v_mov_b32_e32 v1, 1 2184; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf 2185; GFX9-W64-NEXT: s_cbranch_execz .LBB39_3 2186; GFX9-W64-NEXT: s_branch .LBB39_4 2187; GFX9-W64-NEXT: .LBB39_2: 2188; GFX9-W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 2189; GFX9-W64-NEXT: .LBB39_3: ; %if 2190; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2191; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 2192; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf 2193; GFX9-W64-NEXT: .LBB39_4: ; %end 2194; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] 2195; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1.0 2196; GFX9-W64-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen 2197; GFX9-W64-NEXT: s_waitcnt vmcnt(0) 2198; GFX9-W64-NEXT: ; return to shader part epilog 2199; 2200; GFX10-W32-LABEL: test_scc: 2201; GFX10-W32: ; %bb.0: ; %main_body 2202; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo 2203; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo 2204; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 2205; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1 2206; GFX10-W32-NEXT: s_cbranch_scc0 
.LBB39_2 2207; GFX10-W32-NEXT: ; %bb.1: ; %else 2208; GFX10-W32-NEXT: v_mov_b32_e32 v1, 1 2209; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 2210; GFX10-W32-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D 2211; GFX10-W32-NEXT: s_cbranch_execz .LBB39_3 2212; GFX10-W32-NEXT: s_branch .LBB39_4 2213; GFX10-W32-NEXT: .LBB39_2: 2214; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 2215; GFX10-W32-NEXT: .LBB39_3: ; %if 2216; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2217; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 2218; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D 2219; GFX10-W32-NEXT: .LBB39_4: ; %end 2220; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1 2221; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1.0 2222; GFX10-W32-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen 2223; GFX10-W32-NEXT: s_waitcnt vmcnt(0) 2224; GFX10-W32-NEXT: ; return to shader part epilog 2225main_body: 2226 %cc = icmp sgt i32 %sel, 0 2227 br i1 %cc, label %if, label %else 2228 2229if: 2230 %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 2231 br label %end 2232 2233else: 2234 %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0 2235 br label %end 2236 2237end: 2238 %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ] 2239 call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float 1.0, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) 2240 ret <4 x float> %r 2241} 2242 2243; Check a case of a block being entirely WQM except for a bit of WWM. 2244; There was a bug where it forgot to enter and leave WWM. 
define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
; GFX9-W64-LABEL: test_wwm_within_wqm:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB40_2
; GFX9-W64-NEXT: ; %bb.1: ; %IF
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0
; GFX9-W64-NEXT: .LBB40_2: ; %ENDIF
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm_within_wqm:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB40_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0
; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0
; GFX10-W32-NEXT: .LBB40_2: ; %ENDIF
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; %z is a plain (non-inreg) argument, so the condition is evaluated per lane.
  %z.is.zero = icmp eq i32 %z, 0
  br i1 %z.is.zero, label %IF, label %ENDIF

IF:
  ; Two chained samples: the first result is the coordinate of the second.
  %coord = bitcast i32 %c to float
  %sample0 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %sample0.x = extractelement <4 x float> %sample0, i32 0
  %sample1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %sample0.x, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %sample1.x = extractelement <4 x float> %sample1, i32 0
  %sample1.i = fptosi float %sample1.x to i32
  ; The set.inactive -> ds.swizzle -> wwm chain is the part that must be
  ; bracketed by the s_or_saveexec / s_mov exec pair in the expected output.
  %inactive.zeroed = call i32 @llvm.amdgcn.set.inactive.i32(i32 %sample1.i, i32 0)
  %swizzled = call i32 @llvm.amdgcn.ds.swizzle(i32 %inactive.zeroed, i32 2079)
  %wwm.val = call i32 @llvm.amdgcn.wwm.i32(i32 %swizzled)
  %wwm.val.f = sitofp i32 %wwm.val to float
  br label %ENDIF

ENDIF:
  %r = phi float [ %wwm.val.f, %IF ], [ 0.0, %main_body ]
  ret float %r
}

; Check that WWM is triggered by the strict_wwm intrinsic.
define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_strict_wwm1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; Both loads and the fadd only feed strict.wwm; the expected output keeps all
  ; of them between the s_or_saveexec and the exec restore.
  %lhs = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
  %rhs = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
  %sum = fadd float %lhs, %rhs
  %sum.wwm = call float @llvm.amdgcn.strict.wwm.f32(float %sum)
  ret float %sum.wwm
}

; Same as above, but with an integer type.
define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_strict_wwm2:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm2:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; Integer variant: the loaded floats are reinterpreted as i32, added, passed
  ; through strict.wwm.i32 and reinterpreted back for the return value.
  %lhs.f = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
  %rhs.f = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
  %lhs = bitcast float %lhs.f to i32
  %rhs = bitcast float %rhs.f to i32
  %sum = add i32 %lhs, %rhs
  %sum.wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %sum)
  %result = bitcast i32 %sum.wwm to float
  ret float %result
}

; Check that we don't leave WWM on for computations that don't require WWM,
; since that will lead to clobbering things that aren't supposed to be clobbered
; in cases like this.
; We enforce this by checking that v_add gets emitted in the same block as
; WWM computations.
define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) {
; GFX9-W64-LABEL: test_strict_wwm3:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB43_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0
; GFX9-W64-NEXT: .LBB43_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm3:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB43_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0
; GFX10-W32-NEXT: .LBB43_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; mbcnt yields a per-lane value, which keeps the branch below divergent
  %cnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %cnt = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %cnt.lo)
  %skip = icmp uge i32 %cnt, 16
  br i1 %skip, label %endif, label %if

if:
  %val = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
  %dbl = fadd float %val, %val
  %dbl.wwm = call float @llvm.amdgcn.strict.wwm.f32(float %dbl)
  ; This second fadd consumes the WWM result; it is the v_add expected after
  ; the exec restore.
  %acc = fadd float %val, %dbl.wwm
  br label %endif

endif:
  %res = phi float [ %acc, %if ], [ 0.0, %main_body ]
  ret float %res
}

; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
; write could clobber disabled channels in the non-WWM one.
; We enforce this by checking that v_mov gets emitted in the same block as
; WWM computations.
define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) {
; GFX9-W64-LABEL: test_strict_wwm4:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB44_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: .LBB44_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm4:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB44_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: .LBB44_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; mbcnt yields a per-lane value, which keeps the branch below divergent
  %cnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %cnt = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %cnt.lo)
  %skip = icmp uge i32 %cnt, 16
  br i1 %skip, label %endif, label %if

if:
  %val = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
  %dbl = fadd float %val, %val
  ; The WWM result goes straight into the phi; the expected v_mov copy into v0
  ; sits after the exec restore.
  %dbl.wwm = call float @llvm.amdgcn.strict.wwm.f32(float %dbl)
  br label %endif

endif:
  %res = phi float [ %dbl.wwm, %if ], [ 0.0, %main_body ]
  ret float %res
}

; Make sure the transition from Exact to WWM then WQM works properly.
define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_strict_wwm5:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm5:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; Ordinary load/store pair on %idx0; nothing here feeds a WWM or WQM marker.
  %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; The second load and the fadd feed strict.wwm.
  %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %src1, %src1
  %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
  ; The WWM result is doubled and routed through llvm.amdgcn.wqm before return.
  %out = fadd float %temp.0, %temp.0
  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
  ret float %out.0
}

; Check that WWM is turned on correctly across basic block boundaries.
; if..then..endif version
define amdgpu_ps float @test_strict_wwm6_then() {
; GFX9-W64-LABEL: test_strict_wwm6_then:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB46_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: .LBB46_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm6_then:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: .LBB46_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; %src0 is loaded in the entry block but only used by the fadd in %if, so the
  ; WWM requirement has to be carried across the block boundary.
  %src0 = load volatile float, ptr addrspace(1) undef
  ; use mbcnt to make sure the branch is divergent
  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  %cc = icmp uge i32 %hi, 16
  br i1 %cc, label %endif, label %if

if:
  %src1 = load volatile float, ptr addrspace(1) undef
  %out = fadd float %src0, %src1
  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
  br label %endif

endif:
  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
  ret float %out.1
}

; Check that WWM is turned on correctly across basic block boundaries.
; loop version
define amdgpu_ps float @test_strict_wwm6_loop() {
; GFX9-W64-LABEL: test_strict_wwm6_loop:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0
; GFX9-W64-NEXT: .LBB47_1: ; %loop
; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_cbranch_execnz .LBB47_1
; GFX9-W64-NEXT: ; %bb.2: ; %endloop
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm6_loop:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: s_mov_b32 s0, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
; GFX10-W32-NEXT: .LBB47_1: ; %loop
; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3
; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_cbranch_execnz .LBB47_1
; GFX10-W32-NEXT: ; %bb.2: ; %endloop
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; Loaded once outside the loop, consumed by the WWM fadd inside it.
  %invariant = load volatile float, ptr addrspace(1) undef
  ; mbcnt yields a per-lane value, so each lane gets its own trip count
  %cnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %cnt = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %cnt.lo)
  br label %loop

loop:
  %remaining = phi i32 [ %cnt, %main_body ], [ %remaining.next, %loop ]
  %fresh = load volatile float, ptr addrspace(1) undef
  %sum = fadd float %invariant, %fresh
  %sum.wwm = call float @llvm.amdgcn.strict.wwm.f32(float %sum)
  ; The counter update and compare do not feed strict.wwm.
  %remaining.next = sub i32 %remaining, 1
  %again = icmp ne i32 %remaining.next, 0
  br i1 %again, label %loop, label %endloop

endloop:
  ret float %sum.wwm
}

; Check that @llvm.amdgcn.set.inactive disables WWM.
define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
; GFX9-W64-LABEL: test_strict_wwm_set_inactive1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1]
; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
; GFX9-W64-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_endpgm
;
; GFX10-W32-LABEL: test_strict_wwm_set_inactive1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0
; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_endpgm
main_body:
  ; The load feeds set.inactive; the expected output issues it before the
  ; s_or_saveexec, while the add that feeds strict.wwm sits after it.
  %loaded.f = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
  %loaded = bitcast float %loaded.f to i32
  %inactive.zeroed = call i32 @llvm.amdgcn.set.inactive.i32(i32 %loaded, i32 0)
  %dbl = add i32 %inactive.zeroed, %inactive.zeroed
  %dbl.wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %dbl)
  %result = bitcast i32 %dbl.wwm to float
  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %result, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
  ret void
}

; Check a case of a block being entirely WQM except for a bit of WWM.
; There was a bug where it forgot to enter and leave WWM.
define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
; GFX9-W64-LABEL: test_strict_wwm_within_wqm:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB49_2
; GFX9-W64-NEXT: ; %bb.1: ; %IF
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0
; GFX9-W64-NEXT: .LBB49_2: ; %ENDIF
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm_within_wqm:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB49_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0
; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0
; GFX10-W32-NEXT: .LBB49_2: ; %ENDIF
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  ; %z is a plain (non-inreg) argument, so the condition is evaluated per lane.
  %z.is.zero = icmp eq i32 %z, 0
  br i1 %z.is.zero, label %IF, label %ENDIF

IF:
  ; Two chained samples: the first result is the coordinate of the second.
  %coord = bitcast i32 %c to float
  %sample0 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %sample0.x = extractelement <4 x float> %sample0, i32 0
  %sample1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %sample0.x, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %sample1.x = extractelement <4 x float> %sample1, i32 0
  %sample1.i = fptosi float %sample1.x to i32
  ; The set.inactive -> ds.swizzle -> strict.wwm chain is the part that must be
  ; bracketed by the s_or_saveexec / s_mov exec pair in the expected output.
  %inactive.zeroed = call i32 @llvm.amdgcn.set.inactive.i32(i32 %sample1.i, i32 0)
  %swizzled = call i32 @llvm.amdgcn.ds.swizzle(i32 %inactive.zeroed, i32 2079)
  %wwm.val = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %swizzled)
  %wwm.val.f = sitofp i32 %wwm.val to float
  br label %ENDIF

ENDIF:
  %r = phi float [ %wwm.val.f, %IF ], [ 0.0, %main_body ]
  ret float %r
}

; Check a case of a block being entirely WQM except for a bit of STRICT WQM.
define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
; GFX9-W64-LABEL: test_strict_wqm_within_wqm:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_mov_b64 s[14:15], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-W64-NEXT:    s_mov_b64 exec, s[14:15]
; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT:    s_cbranch_execz .LBB50_2
; GFX9-W64-NEXT:  ; %bb.1: ; %IF
; GFX9-W64-NEXT:    s_mov_b64 s[16:17], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v0, v0
; GFX9-W64-NEXT:  .LBB50_2: ; %ENDIF
; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm_within_wqm:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s13
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT:    v_cmpx_eq_u32_e32 0, v1
; GFX10-W32-NEXT:    s_cbranch_execz .LBB50_2
; GFX10-W32-NEXT:  ; %bb.1: ; %IF
; GFX10-W32-NEXT:    s_mov_b32 s14, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v0, v0
; GFX10-W32-NEXT:  .LBB50_2: ; %ENDIF
; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  ; Only the %z == 0 path does any work; the other path returns 0.0 via the phi.
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ENDIF

IF:
  ; Two dependent samples: element 0 of the first result is the coordinate of
  ; the second sample.
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %dataf = extractelement <4 x float> %dtex, i32 0
  %data1 = fptosi float %dataf to i32
  ; ds.swizzle result wrapped in strict.wqm; pattern 2079 is printed as
  ; swizzle(SWAP,2) in the expected output. Unlike the strict.wwm variant of
  ; this test, there is no set.inactive call here.
  %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079)
  %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
  %data3f = sitofp i32 %data3 to float
  br label %ENDIF

ENDIF:
  %r = phi float [ %data3f, %IF ], [ 0.0, %main_body ]
  ret float %r
}

; WQM -> StrictWQM transition must be preserved because kill breaks WQM mask
define amdgpu_ps float @test_strict_wqm_within_wqm_with_kill(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data, i32 %wqm_data) {
; GFX9-W64-LABEL: test_strict_wqm_within_wqm_with_kill:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT:    s_mov_b64 s[14:15], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v2
; GFX9-W64-NEXT:    s_mov_b64 exec, s[14:15]
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT:    s_andn2_b64 s[0:1], exec, vcc
; GFX9-W64-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[0:1]
; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB51_2
; GFX9-W64-NEXT:  ; %bb.1: ; %main_body
; GFX9-W64-NEXT:    s_and_b64 exec, exec, vcc
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    ds_swizzle_b32 v3, v3 offset:swizzle(SWAP,2)
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, v3
; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v1, v1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT:    s_branch .LBB51_3
; GFX9-W64-NEXT:  .LBB51_2:
; GFX9-W64-NEXT:    s_mov_b64 exec, 0
; GFX9-W64-NEXT:    exp null off, off, off, off done vm
; GFX9-W64-NEXT:    s_endpgm
; GFX9-W64-NEXT:  .LBB51_3:
;
; GFX10-W32-LABEL: test_strict_wqm_within_wqm_with_kill:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v2
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s13
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_andn2_b32 s0, exec_lo, vcc_lo
; GFX10-W32-NEXT:    s_andn2_b32 s12, s12, s0
; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB51_2
; GFX10-W32-NEXT:  ; %bb.1: ; %main_body
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, vcc_lo
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    ds_swizzle_b32 v3, v3 offset:swizzle(SWAP,2)
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v3
; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v1, v1
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT:    s_branch .LBB51_3
; GFX10-W32-NEXT:  .LBB51_2:
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-W32-NEXT:    exp null off, off, off, off done vm
; GFX10-W32-NEXT:    s_endpgm
; GFX10-W32-NEXT:  .LBB51_3:
main_body:
  ; Two dependent samples issued before the kill.
  %c.bc = bitcast i32 %c to float
  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %tex0 = extractelement <4 x float> %tex, i32 0
  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ; The kill sits between the samples and the strict.wqm swizzle below. In the
  ; expected output, exec is saved and s_wqm is re-issued around the
  ; ds_swizzle after the kill.
  %cmp = icmp eq i32 %z, 0
  call void @llvm.amdgcn.kill(i1 %cmp)
  %dataf = extractelement <4 x float> %dtex, i32 0
  %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %wqm_data, i32 2079)
  %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
  %data3f = sitofp i32 %data3 to float
  ; The sum of the sampled value and the swizzled value is passed through
  ; llvm.amdgcn.wqm before being returned.
  %result.f = fadd float %dataf, %data3f
  %result.i = bitcast float %result.f to i32
  %result.wqm = call i32 @llvm.amdgcn.wqm.i32(i32 %result.i)
  %result = bitcast i32 %result.wqm to float
  ret float %result
}

;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again.
define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, ptr addrspace(8) inreg %res2, float %inp, <8 x i32> inreg %res3) {
; GFX9-W64-LABEL: test_strict_wqm_strict_wwm_wqm:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[28:29], exec
; GFX9-W64-NEXT:    s_mov_b32 s19, s17
; GFX9-W64-NEXT:    s_mov_b64 s[30:31], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_mov_b32 s23, s5
; GFX9-W64-NEXT:    s_mov_b32 s22, s4
; GFX9-W64-NEXT:    s_mov_b32 s21, s3
; GFX9-W64-NEXT:    s_mov_b32 s20, s2
; GFX9-W64-NEXT:    s_mov_b32 s27, s9
; GFX9-W64-NEXT:    s_mov_b32 s26, s8
; GFX9-W64-NEXT:    s_mov_b32 s25, s7
; GFX9-W64-NEXT:    s_mov_b32 s24, s6
; GFX9-W64-NEXT:    s_mov_b32 s18, s16
; GFX9-W64-NEXT:    s_mov_b32 s17, s15
; GFX9-W64-NEXT:    s_mov_b32 s16, s14
; GFX9-W64-NEXT:    s_mov_b32 s15, s13
; GFX9-W64-NEXT:    s_mov_b32 s14, s12
; GFX9-W64-NEXT:    s_mov_b32 s13, s11
; GFX9-W64-NEXT:    s_mov_b32 s12, s10
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT:    s_mov_b64 exec, s[30:31]
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[20:23], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT:    v_mov_b32_e32 v3, s0
; GFX9-W64-NEXT:    buffer_load_dword v3, v3, s[24:27], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
; GFX9-W64-NEXT:    v_add_f32_e32 v2, v2, v2
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v3
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v4
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[28:29]
; GFX9-W64-NEXT:    image_sample v0, v0, s[12:19], s[20:23] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[20:23], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm_strict_wwm_wqm:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s28, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s19, s17
; GFX10-W32-NEXT:    s_mov_b32 s29, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s23, s5
; GFX10-W32-NEXT:    s_mov_b32 s22, s4
; GFX10-W32-NEXT:    s_mov_b32 s21, s3
; GFX10-W32-NEXT:    s_mov_b32 s20, s2
; GFX10-W32-NEXT:    s_mov_b32 s27, s9
; GFX10-W32-NEXT:    s_mov_b32 s26, s8
; GFX10-W32-NEXT:    s_mov_b32 s25, s7
; GFX10-W32-NEXT:    s_mov_b32 s24, s6
; GFX10-W32-NEXT:    s_mov_b32 s18, s16
; GFX10-W32-NEXT:    s_mov_b32 s17, s15
; GFX10-W32-NEXT:    s_mov_b32 s16, s14
; GFX10-W32-NEXT:    s_mov_b32 s15, s13
; GFX10-W32-NEXT:    s_mov_b32 s14, s12
; GFX10-W32-NEXT:    s_mov_b32 s13, s11
; GFX10-W32-NEXT:    s_mov_b32 s12, s10
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s29
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[20:23], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT:    v_mov_b32_e32 v3, s0
; GFX10-W32-NEXT:    buffer_load_dword v3, v3, s[24:27], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
; GFX10-W32-NEXT:    v_add_f32_e32 v2, v2, v2
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v3
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v4
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
; GFX10-W32-NEXT:    image_sample v0, v0, s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[20:23], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  ; Store %inp, then reload it; the doubled reload is the strict.wqm operand.
  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
  %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %reload, %reload
  %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
  %temp3 = fadd float %temp2, %temp2
  ; A second load, from %res2, is the strict.wwm operand.
  %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res2, i32 %idx0, i32 0, i32 0, i32 0)
  %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm)
  %temp5 = fadd float %temp3, %temp4
  ; The buffer resource pointer is reinterpreted as the <4 x i32> sampler
  ; operand of the sample whose coordinate is the combined value.
  %res.int = ptrtoint ptr addrspace(8) %res to i128
  %res.vec = bitcast i128 %res.int to <4 x i32>
  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
  %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
  ret float %out
}

define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) {
; GFX9-W64-LABEL: test_strict_wwm_strict_wqm_wqm:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[20:21], exec
; GFX9-W64-NEXT:    s_mov_b32 s15, s13
; GFX9-W64-NEXT:    s_mov_b64 s[22:23], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_mov_b32 s19, s5
; GFX9-W64-NEXT:    s_mov_b32 s18, s4
; GFX9-W64-NEXT:    s_mov_b32 s17, s3
; GFX9-W64-NEXT:    s_mov_b32 s16, s2
; GFX9-W64-NEXT:    s_mov_b32 s14, s12
; GFX9-W64-NEXT:    s_mov_b32 s13, s11
; GFX9-W64-NEXT:    s_mov_b32 s12, s10
; GFX9-W64-NEXT:    s_mov_b32 s11, s9
; GFX9-W64-NEXT:    s_mov_b32 s10, s8
; GFX9-W64-NEXT:    s_mov_b32 s9, s7
; GFX9-W64-NEXT:    s_mov_b32 s8, s6
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    s_mov_b64 exec, s[22:23]
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
; GFX9-W64-NEXT:    v_add_f32_e32 v2, v2, v2
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v3
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v4
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[20:21]
; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm_strict_wqm_wqm:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s20, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s15, s13
; GFX10-W32-NEXT:    s_mov_b32 s21, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s19, s5
; GFX10-W32-NEXT:    s_mov_b32 s18, s4
; GFX10-W32-NEXT:    s_mov_b32 s17, s3
; GFX10-W32-NEXT:    s_mov_b32 s16, s2
; GFX10-W32-NEXT:    s_mov_b32 s14, s12
; GFX10-W32-NEXT:    s_mov_b32 s13, s11
; GFX10-W32-NEXT:    s_mov_b32 s12, s10
; GFX10-W32-NEXT:    s_mov_b32 s11, s9
; GFX10-W32-NEXT:    s_mov_b32 s10, s8
; GFX10-W32-NEXT:    s_mov_b32 s9, s7
; GFX10-W32-NEXT:    s_mov_b32 s8, s6
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s21
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
; GFX10-W32-NEXT:    v_add_f32_e32 v2, v2, v2
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v3
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v4
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  ; Store %inp at %idx0, then load from %idx1; the doubled load is the
  ; strict.wwm operand.
  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
  %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %reload, %reload
  %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
  %temp3 = fadd float %temp2, %temp2
  ; NOTE(review): despite its name, %reload_wwm is the strict.wqm operand in
  ; this test; the name looks carried over from a sibling test - confirm
  ; before renaming.
  %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
  %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
  %temp5 = fadd float %temp3, %temp4
  ; The buffer resource pointer is reinterpreted as the <4 x i32> sampler
  ; operand of the sample whose coordinate is the combined value.
  %res.int = ptrtoint ptr addrspace(8) %res to i128
  %res.vec = bitcast i128 %res.int to <4 x i32>
  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
  %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
  ret float %out
}

;TODO: WQM -> StrictWQM transition could be improved. StrictWQM could use the exec from the previous state instead of calling s_wqm again.
define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) {
; GFX9-W64-LABEL: test_wqm_strict_wqm_wqm:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b64 s[20:21], exec
; GFX9-W64-NEXT:    s_mov_b32 s15, s13
; GFX9-W64-NEXT:    s_mov_b64 s[22:23], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    s_mov_b32 s19, s5
; GFX9-W64-NEXT:    s_mov_b32 s18, s4
; GFX9-W64-NEXT:    s_mov_b32 s17, s3
; GFX9-W64-NEXT:    s_mov_b32 s16, s2
; GFX9-W64-NEXT:    s_mov_b32 s14, s12
; GFX9-W64-NEXT:    s_mov_b32 s13, s11
; GFX9-W64-NEXT:    s_mov_b32 s12, s10
; GFX9-W64-NEXT:    s_mov_b32 s11, s9
; GFX9-W64-NEXT:    s_mov_b32 s10, s8
; GFX9-W64-NEXT:    s_mov_b32 s9, s7
; GFX9-W64-NEXT:    s_mov_b32 s8, s6
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    s_mov_b64 exec, s[22:23]
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v2
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v3
; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[20:21]
; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
; GFX9-W64-NEXT:    ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wqm_strict_wqm_wqm:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s20, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s15, s13
; GFX10-W32-NEXT:    s_mov_b32 s21, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s19, s5
; GFX10-W32-NEXT:    s_mov_b32 s18, s4
; GFX10-W32-NEXT:    s_mov_b32 s17, s3
; GFX10-W32-NEXT:    s_mov_b32 s16, s2
; GFX10-W32-NEXT:    s_mov_b32 s14, s12
; GFX10-W32-NEXT:    s_mov_b32 s13, s11
; GFX10-W32-NEXT:    s_mov_b32 s12, s10
; GFX10-W32-NEXT:    s_mov_b32 s11, s9
; GFX10-W32-NEXT:    s_mov_b32 s10, s8
; GFX10-W32-NEXT:    s_mov_b32 s9, s7
; GFX10-W32-NEXT:    s_mov_b32 s8, s6
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s21
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v3, s1
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v0, v3, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v2
; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v3
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
; GFX10-W32-NEXT:    ; return to shader part epilog
main_body:
  ; Store %inp at %idx0, then load from %idx1; the doubled load is the
  ; coordinate of the first sample.
  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
  %reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
  %temp = fadd float %reload, %reload
  ; The buffer resource pointer is reinterpreted as the <4 x i32> sampler
  ; operand of both samples.
  %res.int = ptrtoint ptr addrspace(8) %res to i128
  %res.vec = bitcast i128 %res.int to <4 x i32>
  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
  %temp2 = fadd float %tex, %tex
  ; NOTE(review): despite its name, %reload_wwm is the strict.wqm operand in
  ; this test; the name looks carried over from a sibling test - confirm
  ; before renaming.
  %reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
  %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
  %temp4 = fadd float %temp2, %temp3
  ; The second sample takes the combined value as its coordinate.
  %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
  call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex2, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
  %out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
  ret float %out
}

; Check if the correct VCC register is selected. WQM pass incorrectly uses VCC for
; vector comparisons in Wave32 mode.
define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) inreg %0) {
; GFX9-W64-LABEL: test_for_deactivating_lanes_in_wave32:
; GFX9-W64:       ; %bb.0: ; %main_body
; GFX9-W64-NEXT:    s_mov_b32 s3, 0x31016fac
; GFX9-W64-NEXT:    s_mov_b32 s2, 32
; GFX9-W64-NEXT:    s_mov_b32 s1, 0x8000
; GFX9-W64-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT:    v_cmp_le_f32_e64 vcc, s0, 0
; GFX9-W64-NEXT:    s_andn2_b64 s[4:5], exec, vcc
; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB55_1
; GFX9-W64-NEXT:    s_endpgm
; GFX9-W64-NEXT:  .LBB55_1:
; GFX9-W64-NEXT:    s_mov_b64 exec, 0
; GFX9-W64-NEXT:    exp null off, off, off, off done vm
; GFX9-W64-NEXT:    s_endpgm
;
; GFX10-W32-LABEL: test_for_deactivating_lanes_in_wave32:
; GFX10-W32:       ; %bb.0: ; %main_body
; GFX10-W32-NEXT:    s_mov_b32 s3, 0x31016fac
; GFX10-W32-NEXT:    s_mov_b32 s2, 32
; GFX10-W32-NEXT:    s_mov_b32 s1, 0x8000
; GFX10-W32-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT:    v_cmp_le_f32_e64 vcc_lo, s0, 0
; GFX10-W32-NEXT:    s_andn2_b32 s4, exec_lo, vcc_lo
; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB55_1
; GFX10-W32-NEXT:    s_endpgm
; GFX10-W32-NEXT:  .LBB55_1:
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
; GFX10-W32-NEXT:    exp null off, off, off, off done vm
; GFX10-W32-NEXT:    s_endpgm
main_body:
  ; Build a 4-dword descriptor: element 0 is the incoming pointer, elements 1-3
  ; are the constants 32768 (0x8000), 32 and 822177708 (0x31016fac), which
  ; appear as the s_mov_b32 immediates in the expected output.
  %1 = ptrtoint ptr addrspace(6) %0 to i32
  %2 = insertelement <4 x i32> <i32 poison, i32 32768, i32 32, i32 822177708>, i32 %1, i32 0
  %3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %2, i32 0, i32 0) #3
  ; The kill condition is the "ugt 0.0" compare; the expected output tests the
  ; inverse compare (v_cmp_le_f32) into vcc on wave64 and vcc_lo on wave32.
  %4 = fcmp nsz arcp ugt float %3, 0.000000e+00
  call void @llvm.amdgcn.kill(i1 %4) #1
  ret void
}

; Test the interaction between wqm and llvm.amdgcn.init.exec.
define amdgpu_gs void @wqm_init_exec() {
; GFX9-W64-LABEL: wqm_init_exec:
; GFX9-W64:       ; %bb.0: ; %bb
; GFX9-W64-NEXT:    s_mov_b64 exec, -1
; GFX9-W64-NEXT:    s_mov_b32 s0, 0
; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT:    s_mov_b32 s1, s0
; GFX9-W64-NEXT:    s_mov_b32 s2, s0
; GFX9-W64-NEXT:    s_mov_b32 s3, s0
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v0
; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
; GFX9-W64-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $exec
; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT:    ds_write_b32 v0, v1
; GFX9-W64-NEXT:    s_endpgm
;
; GFX10-W32-LABEL: wqm_init_exec:
; GFX10-W32:       ; %bb.0: ; %bb
; GFX10-W32-NEXT:    s_mov_b32 exec_lo, -1
; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT:    s_mov_b32 s0, 0
; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT:    s_mov_b32 s2, s0
; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v0
; GFX10-W32-NEXT:    v_mov_b32_e32 v4, s0
; GFX10-W32-NEXT:    s_mov_b32 s1, s0
; GFX10-W32-NEXT:    s_mov_b32 s3, s0
; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX10-W32-NEXT:    ds_write_b32 v0, v4
; GFX10-W32-NEXT:    s_endpgm
bb:
  ; init.exec(-1) comes first; it shows up as the leading "s_mov exec, -1" in
  ; the expected output.
  call void @llvm.amdgcn.init.exec(i64 -1)
  ; An all-zero buffer store, followed by a value passed through
  ; llvm.amdgcn.wqm.
  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> zeroinitializer, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
  %i = call i32 @llvm.amdgcn.wqm.i32(i32 0)
  ; Store the wqm result through a null addrspace(3) pointer (ds_write_b32 in
  ; the expected output). Written with opaque-pointer syntax, like the other
  ; pointer operands in this file.
  store i32 %i, ptr addrspace(3) null, align 4
  ret void
}

; Test a case that failed machine verification.
3443define amdgpu_gs void @wqm_init_exec_switch(i32 %arg) { 3444; GFX9-W64-LABEL: wqm_init_exec_switch: 3445; GFX9-W64: ; %bb.0: 3446; GFX9-W64-NEXT: s_mov_b64 exec, 0 3447; GFX9-W64-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0 3448; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc 3449; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 3450; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 3451; GFX9-W64-NEXT: s_endpgm 3452; 3453; GFX10-W32-LABEL: wqm_init_exec_switch: 3454; GFX10-W32: ; %bb.0: 3455; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0 3456; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo 3457; GFX10-W32-NEXT: v_cmpx_lt_i32_e32 0, v0 3458; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0 3459; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0 3460; GFX10-W32-NEXT: s_endpgm 3461 call void @llvm.amdgcn.init.exec(i64 0) 3462 switch i32 %arg, label %bb1 [ 3463 i32 0, label %bb3 3464 i32 1, label %bb2 3465 ] 3466bb1: 3467 ret void 3468bb2: 3469 ret void 3470bb3: 3471 ret void 3472} 3473 3474define amdgpu_gs void @wqm_init_exec_wwm() { 3475; GFX9-W64-LABEL: wqm_init_exec_wwm: 3476; GFX9-W64: ; %bb.0: 3477; GFX9-W64-NEXT: s_mov_b64 exec, 0 3478; GFX9-W64-NEXT: s_mov_b32 s1, 0 3479; GFX9-W64-NEXT: s_mov_b32 s0, s1 3480; GFX9-W64-NEXT: s_cmp_lg_u64 exec, 0 3481; GFX9-W64-NEXT: s_cselect_b64 s[2:3], -1, 0 3482; GFX9-W64-NEXT: s_cmp_lg_u64 s[0:1], 0 3483; GFX9-W64-NEXT: s_cselect_b64 s[0:1], -1, 0 3484; GFX9-W64-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] 3485; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] 3486; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 3487; GFX9-W64-NEXT: exp mrt0 off, off, off, off 3488; GFX9-W64-NEXT: s_endpgm 3489; 3490; GFX10-W32-LABEL: wqm_init_exec_wwm: 3491; GFX10-W32: ; %bb.0: 3492; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0 3493; GFX10-W32-NEXT: s_mov_b32 s1, 0 3494; GFX10-W32-NEXT: s_cmp_lg_u64 exec, 0 3495; GFX10-W32-NEXT: s_mov_b32 s0, s1 3496; GFX10-W32-NEXT: s_cselect_b32 s2, -1, 0 3497; GFX10-W32-NEXT: s_cmp_lg_u64 s[0:1], 0 3498; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 3499; GFX10-W32-NEXT: 
s_cselect_b32 s0, -1, 0
; GFX10-W32-NEXT: s_xor_b32 s0, s2, s0
; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
; GFX10-W32-NEXT: exp mrt0 off, off, off, off
; GFX10-W32-NEXT: s_endpgm
  call void @llvm.amdgcn.init.exec(i64 0)
  %i = call i64 @llvm.amdgcn.ballot.i64(i1 true)
  %i1 = call i32 @llvm.amdgcn.wwm.i32(i32 0)
  %i2 = insertelement <2 x i32> zeroinitializer, i32 %i1, i64 0
  %i3 = bitcast <2 x i32> %i2 to i64
  %i4 = icmp ne i64 %i, 0
  %i5 = icmp ne i64 %i3, 0
  %i6 = xor i1 %i4, %i5
  %i7 = uitofp i1 %i6 to float
  call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float %i7, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
  ret void
}

; Check that exact regions containing execz-affected instructions are kept as
; short as possible.
; Shape of the test: an image sample, then a branch on the mbcnt result that
; guards a readfirstlane + s.buffer.load feeding a struct.buffer.store, then a
; second image sample whose result goes through llvm.amdgcn.wqm.f32.
define amdgpu_ps float @short_exact_regions(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c, ptr addrspace(4) %p) {
; GFX9-W64-LABEL: short_exact_regions:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB59_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_readfirstlane_b32 s16, v0
; GFX9-W64-NEXT: s_buffer_load_dword s16, s[8:11], s16 offset:0x0
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s16
; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
; GFX9-W64-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-W64-NEXT: .LBB59_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x4
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v4, v0
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: short_exact_regions:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmpx_gt_u32_e32 16, v0
; GFX10-W32-NEXT: s_cbranch_execz .LBB59_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: global_load_dword v0, v[1:2], off
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_readfirstlane_b32 s14, v0
; GFX10-W32-NEXT: s_buffer_load_dword s14, s[8:11], s14 offset:0x0
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, s14
; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
; GFX10-W32-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
; GFX10-W32-NEXT: .LBB59_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v0, v4, v0
; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  %sample0 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %vec = load <4 x i32>, ptr addrspace(4) %p, align 4
  %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
  ; The branch skips %if whenever the mbcnt.hi result is >= 16.
  %ge16 = icmp uge i32 %mbcnt.hi, 16
  br i1 %ge16, label %endif, label %if

if:
  ; Element 0 of the loaded vector goes through readfirstlane and is then used
  ; as the s.buffer.load offset; the loaded i32 indexes the buffer store.
  %elt0 = extractelement <4 x i32> %vec, i64 0
  %first = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %elt0)
  %store.idx = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %sampler, i32 %first, i32 0)

  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %sample0, <4 x i32> undef, i32 %store.idx, i32 0, i32 0, i32 0)
  br label %endif

endif:
  ; Second sample uses component 0 of the first sample as its coordinate.
  %coord = extractelement <4 x float> %sample0, i64 0
  %sample1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %lhs = extractelement <4 x float> %sample0, i64 1
  %rhs = extractelement <4 x float> %sample1, i64 2
  %sum = fadd float %lhs, %rhs
  %result = call float @llvm.amdgcn.wqm.f32(float %sum)

  ret float %result
}

; Check that shortening exact regions does not prevent an early exit from WQM.
; Straight-line variant of the test above: no branch, no store and no
; llvm.amdgcn.wqm call; the s.buffer.load result is added to the returned sum.
define amdgpu_ps float @short_exact_regions_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c, ptr addrspace(4) %p) {
; GFX9-W64-LABEL: short_exact_regions_2:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: image_sample v[3:4], v0, s[0:7], s[8:11] dmask:0x3
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
; GFX9-W64-NEXT: image_sample v5, v3, s[0:7], s[8:11] dmask:0x4
; GFX9-W64-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6 killed $sgpr7
; GFX9-W64-NEXT: ; kill: killed $vgpr3
; GFX9-W64-NEXT: ; kill: killed $vgpr1 killed $vgpr2
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
; GFX9-W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-W64-NEXT: s_buffer_load_dword s0, s[8:11], s0 offset:0x0
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v4, v5
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v0, s0, v0
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: short_exact_regions_2:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: image_sample v[3:4], v0, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: global_load_dword v0, v[1:2], off
; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
; GFX10-W32-NEXT: image_sample v1, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
; GFX10-W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v0, v4, v1
; GFX10-W32-NEXT: s_buffer_load_dword s0, s[8:11], s0 offset:0x0
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v0, s0, v0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
  %sample0 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  %vec = load <4 x i32>, ptr addrspace(4) %p, align 4
  ; The mbcnt.hi result has no user in this function.
  %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
  %elt0 = extractelement <4 x i32> %vec, i64 0
  %coord = extractelement <4 x float> %sample0, i64 0

  %sample1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0

  %first = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %elt0)
  %loaded = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %sampler, i32 %first, i32 0)

  %lhs = extractelement <4 x float> %sample0, i64 1
  %rhs = extractelement <4 x float> %sample1, i64 2
  %sum = fadd float %lhs, %rhs
  %result = fadd float %sum, %loaded

  ret float %result
}

; Intrinsic declarations.
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1

declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2
declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3
declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3

declare void @llvm.amdgcn.struct.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) #2
declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #3
declare float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32) #3

declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare void @llvm.amdgcn.kill(i1) #1
declare float @llvm.amdgcn.wqm.f32(float) #3
declare i32 @llvm.amdgcn.wqm.i32(i32) #3
declare float @llvm.amdgcn.strict.wwm.f32(float) #3
declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
declare float @llvm.amdgcn.wwm.f32(float) #3
declare i32 @llvm.amdgcn.wwm.i32(i32) #3
declare float @llvm.amdgcn.strict.wqm.f32(float) #3
declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7
declare i32 @llvm.amdgcn.readfirstlane.i32(i32)

; Attribute groups.
; NOTE(review): the image.sample call sites above carry #0, but this list
; starts at #1 -- confirm that leaving group #0 undefined is intentional.
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
attributes #4 = { nounwind readnone convergent }
attributes #5 = { "amdgpu-ps-wqm-outputs" }
attributes #6 = { nounwind "InitialPSInputAddr"="2" }
attributes #7 = { nounwind readnone willreturn }