1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s 3; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s 4; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-32 %s 5; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-64 %s 6 7define amdgpu_ps void @static_exact(float %arg0, float %arg1) { 8; SI-LABEL: static_exact: 9; SI: ; %bb.0: ; %.entry 10; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 11; SI-NEXT: s_andn2_b64 exec, exec, exec 12; SI-NEXT: s_cbranch_scc0 .LBB0_2 13; SI-NEXT: ; %bb.1: ; %.entry 14; SI-NEXT: s_mov_b64 exec, 0 15; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 16; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm 17; SI-NEXT: s_endpgm 18; SI-NEXT: .LBB0_2: 19; SI-NEXT: s_mov_b64 exec, 0 20; SI-NEXT: exp null off, off, off, off done vm 21; SI-NEXT: s_endpgm 22; 23; GFX9-LABEL: static_exact: 24; GFX9: ; %bb.0: ; %.entry 25; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 26; GFX9-NEXT: s_andn2_b64 exec, exec, exec 27; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 28; GFX9-NEXT: ; %bb.1: ; %.entry 29; GFX9-NEXT: s_mov_b64 exec, 0 30; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 31; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm 32; GFX9-NEXT: s_endpgm 33; GFX9-NEXT: .LBB0_2: 34; GFX9-NEXT: s_mov_b64 exec, 0 35; GFX9-NEXT: exp null off, off, off, off done vm 36; GFX9-NEXT: s_endpgm 37; 38; GFX10-32-LABEL: static_exact: 39; GFX10-32: ; %bb.0: ; %.entry 40; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 41; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, exec_lo 42; GFX10-32-NEXT: s_cbranch_scc0 .LBB0_2 43; GFX10-32-NEXT: ; %bb.1: ; %.entry 44; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 45; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo 46; GFX10-32-NEXT: exp mrt1 
v0, v0, v0, v0 done vm 47; GFX10-32-NEXT: s_endpgm 48; GFX10-32-NEXT: .LBB0_2: 49; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 50; GFX10-32-NEXT: exp null off, off, off, off done vm 51; GFX10-32-NEXT: s_endpgm 52; 53; GFX10-64-LABEL: static_exact: 54; GFX10-64: ; %bb.0: ; %.entry 55; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 56; GFX10-64-NEXT: s_andn2_b64 exec, exec, exec 57; GFX10-64-NEXT: s_cbranch_scc0 .LBB0_2 58; GFX10-64-NEXT: ; %bb.1: ; %.entry 59; GFX10-64-NEXT: s_mov_b64 exec, 0 60; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 61; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm 62; GFX10-64-NEXT: s_endpgm 63; GFX10-64-NEXT: .LBB0_2: 64; GFX10-64-NEXT: s_mov_b64 exec, 0 65; GFX10-64-NEXT: exp null off, off, off, off done vm 66; GFX10-64-NEXT: s_endpgm 67.entry: 68 %c0 = fcmp olt float %arg0, 0.000000e+00 69 %c1 = fcmp oge float %arg1, 0.0 70 call void @llvm.amdgcn.wqm.demote(i1 false) 71 %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 72 call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 73 ret void 74} 75 76define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { 77; SI-LABEL: dynamic_exact: 78; SI: ; %bb.0: ; %.entry 79; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 80; SI-NEXT: s_mov_b64 s[2:3], exec 81; SI-NEXT: s_andn2_b64 s[0:1], exec, s[0:1] 82; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] 83; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 84; SI-NEXT: s_cbranch_scc0 .LBB1_2 85; SI-NEXT: ; %bb.1: ; %.entry 86; SI-NEXT: s_and_b64 exec, exec, s[2:3] 87; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 88; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm 89; SI-NEXT: s_endpgm 90; SI-NEXT: .LBB1_2: 91; SI-NEXT: s_mov_b64 exec, 0 92; SI-NEXT: exp null off, off, off, off done vm 93; SI-NEXT: s_endpgm 94; 95; GFX9-LABEL: dynamic_exact: 96; GFX9: ; %bb.0: ; %.entry 97; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 98; GFX9-NEXT: s_mov_b64 s[2:3], exec 99; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[0:1] 100; 
GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] 101; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 102; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 103; GFX9-NEXT: ; %bb.1: ; %.entry 104; GFX9-NEXT: s_and_b64 exec, exec, s[2:3] 105; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 106; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm 107; GFX9-NEXT: s_endpgm 108; GFX9-NEXT: .LBB1_2: 109; GFX9-NEXT: s_mov_b64 exec, 0 110; GFX9-NEXT: exp null off, off, off, off done vm 111; GFX9-NEXT: s_endpgm 112; 113; GFX10-32-LABEL: dynamic_exact: 114; GFX10-32: ; %bb.0: ; %.entry 115; GFX10-32-NEXT: v_cmp_le_f32_e64 s0, 0, v1 116; GFX10-32-NEXT: s_mov_b32 s1, exec_lo 117; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 118; GFX10-32-NEXT: s_andn2_b32 s0, exec_lo, s0 119; GFX10-32-NEXT: s_andn2_b32 s1, s1, s0 120; GFX10-32-NEXT: s_cbranch_scc0 .LBB1_2 121; GFX10-32-NEXT: ; %bb.1: ; %.entry 122; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s1 123; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo 124; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm 125; GFX10-32-NEXT: s_endpgm 126; GFX10-32-NEXT: .LBB1_2: 127; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 128; GFX10-32-NEXT: exp null off, off, off, off done vm 129; GFX10-32-NEXT: s_endpgm 130; 131; GFX10-64-LABEL: dynamic_exact: 132; GFX10-64: ; %bb.0: ; %.entry 133; GFX10-64-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 134; GFX10-64-NEXT: s_mov_b64 s[2:3], exec 135; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 136; GFX10-64-NEXT: s_andn2_b64 s[0:1], exec, s[0:1] 137; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] 138; GFX10-64-NEXT: s_cbranch_scc0 .LBB1_2 139; GFX10-64-NEXT: ; %bb.1: ; %.entry 140; GFX10-64-NEXT: s_and_b64 exec, exec, s[2:3] 141; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 142; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm 143; GFX10-64-NEXT: s_endpgm 144; GFX10-64-NEXT: .LBB1_2: 145; GFX10-64-NEXT: s_mov_b64 exec, 0 146; GFX10-64-NEXT: exp null off, off, off, off done vm 147; GFX10-64-NEXT: s_endpgm 148.entry: 149 %c0 = fcmp olt float %arg0, 0.000000e+00 
150 %c1 = fcmp oge float %arg1, 0.0 151 call void @llvm.amdgcn.wqm.demote(i1 %c1) 152 %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 153 call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 154 ret void 155} 156 157define amdgpu_ps void @branch(float %arg0, float %arg1) { 158; SI-LABEL: branch: 159; SI: ; %bb.0: ; %.entry 160; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 161; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 162; SI-NEXT: s_mov_b64 s[0:1], exec 163; SI-NEXT: v_or_b32_e32 v0, v0, v1 164; SI-NEXT: v_and_b32_e32 v0, 1, v0 165; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 166; SI-NEXT: s_xor_b64 s[2:3], vcc, -1 167; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] 168; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] 169; SI-NEXT: s_cbranch_execz .LBB2_3 170; SI-NEXT: ; %bb.1: ; %.demote 171; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 172; SI-NEXT: s_cbranch_scc0 .LBB2_4 173; SI-NEXT: ; %bb.2: ; %.demote 174; SI-NEXT: s_mov_b64 exec, 0 175; SI-NEXT: .LBB2_3: ; %.continue 176; SI-NEXT: s_or_b64 exec, exec, s[2:3] 177; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 178; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm 179; SI-NEXT: s_endpgm 180; SI-NEXT: .LBB2_4: 181; SI-NEXT: s_mov_b64 exec, 0 182; SI-NEXT: exp null off, off, off, off done vm 183; SI-NEXT: s_endpgm 184; 185; GFX9-LABEL: branch: 186; GFX9: ; %bb.0: ; %.entry 187; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 188; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 189; GFX9-NEXT: s_mov_b64 s[0:1], exec 190; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 191; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 192; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 193; GFX9-NEXT: s_xor_b64 s[2:3], vcc, -1 194; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] 195; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] 196; GFX9-NEXT: s_cbranch_execz .LBB2_3 197; GFX9-NEXT: ; %bb.1: ; %.demote 198; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 199; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 200; GFX9-NEXT: ; %bb.2: ; %.demote 201; GFX9-NEXT: s_mov_b64 exec, 0 202; 
GFX9-NEXT: .LBB2_3: ; %.continue 203; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 204; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 205; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm 206; GFX9-NEXT: s_endpgm 207; GFX9-NEXT: .LBB2_4: 208; GFX9-NEXT: s_mov_b64 exec, 0 209; GFX9-NEXT: exp null off, off, off, off done vm 210; GFX9-NEXT: s_endpgm 211; 212; GFX10-32-LABEL: branch: 213; GFX10-32: ; %bb.0: ; %.entry 214; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 215; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1 216; GFX10-32-NEXT: s_mov_b32 s0, exec_lo 217; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 218; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 219; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 220; GFX10-32-NEXT: s_xor_b32 s1, vcc_lo, -1 221; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 222; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 223; GFX10-32-NEXT: s_cbranch_execz .LBB2_3 224; GFX10-32-NEXT: ; %bb.1: ; %.demote 225; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo 226; GFX10-32-NEXT: s_cbranch_scc0 .LBB2_4 227; GFX10-32-NEXT: ; %bb.2: ; %.demote 228; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 229; GFX10-32-NEXT: .LBB2_3: ; %.continue 230; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 231; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo 232; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm 233; GFX10-32-NEXT: s_endpgm 234; GFX10-32-NEXT: .LBB2_4: 235; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 236; GFX10-32-NEXT: exp null off, off, off, off done vm 237; GFX10-32-NEXT: s_endpgm 238; 239; GFX10-64-LABEL: branch: 240; GFX10-64: ; %bb.0: ; %.entry 241; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 242; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 243; GFX10-64-NEXT: s_mov_b64 s[0:1], exec 244; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 245; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 246; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 247; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, -1 248; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] 249; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] 250; GFX10-64-NEXT: s_cbranch_execz .LBB2_3 251; 
GFX10-64-NEXT: ; %bb.1: ; %.demote 252; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 253; GFX10-64-NEXT: s_cbranch_scc0 .LBB2_4 254; GFX10-64-NEXT: ; %bb.2: ; %.demote 255; GFX10-64-NEXT: s_mov_b64 exec, 0 256; GFX10-64-NEXT: .LBB2_3: ; %.continue 257; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] 258; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 259; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm 260; GFX10-64-NEXT: s_endpgm 261; GFX10-64-NEXT: .LBB2_4: 262; GFX10-64-NEXT: s_mov_b64 exec, 0 263; GFX10-64-NEXT: exp null off, off, off, off done vm 264; GFX10-64-NEXT: s_endpgm 265.entry: 266 %i0 = fptosi float %arg0 to i32 267 %i1 = fptosi float %arg1 to i32 268 %c0 = or i32 %i0, %i1 269 %c1 = and i32 %c0, 1 270 %c2 = icmp eq i32 %c1, 0 271 br i1 %c2, label %.continue, label %.demote 272 273.demote: 274 call void @llvm.amdgcn.wqm.demote(i1 false) 275 br label %.continue 276 277.continue: 278 %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 279 call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 280 ret void 281} 282 283define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { 284; SI-LABEL: wqm_demote_1: 285; SI: ; %bb.0: ; %.entry 286; SI-NEXT: s_mov_b64 s[12:13], exec 287; SI-NEXT: s_wqm_b64 exec, exec 288; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 289; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc 290; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 291; SI-NEXT: s_cbranch_execz .LBB3_3 292; SI-NEXT: ; %bb.1: ; %.demote 293; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 294; SI-NEXT: s_cbranch_scc0 .LBB3_4 295; SI-NEXT: ; %bb.2: ; %.demote 296; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] 297; SI-NEXT: s_and_b64 exec, exec, s[16:17] 298; SI-NEXT: .LBB3_3: ; %.continue 299; SI-NEXT: s_or_b64 exec, exec, s[14:15] 300; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 301; SI-NEXT: s_waitcnt 
vmcnt(0) 302; SI-NEXT: v_add_f32_e32 v0, v0, v0 303; SI-NEXT: s_and_b64 exec, exec, s[12:13] 304; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 305; SI-NEXT: s_waitcnt vmcnt(0) 306; SI-NEXT: s_branch .LBB3_5 307; SI-NEXT: .LBB3_4: 308; SI-NEXT: s_mov_b64 exec, 0 309; SI-NEXT: exp null off, off, off, off done vm 310; SI-NEXT: s_endpgm 311; SI-NEXT: .LBB3_5: 312; 313; GFX9-LABEL: wqm_demote_1: 314; GFX9: ; %bb.0: ; %.entry 315; GFX9-NEXT: s_mov_b64 s[12:13], exec 316; GFX9-NEXT: s_wqm_b64 exec, exec 317; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 318; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc 319; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 320; GFX9-NEXT: s_cbranch_execz .LBB3_3 321; GFX9-NEXT: ; %bb.1: ; %.demote 322; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 323; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 324; GFX9-NEXT: ; %bb.2: ; %.demote 325; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] 326; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] 327; GFX9-NEXT: .LBB3_3: ; %.continue 328; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] 329; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 330; GFX9-NEXT: s_waitcnt vmcnt(0) 331; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 332; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] 333; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 334; GFX9-NEXT: s_waitcnt vmcnt(0) 335; GFX9-NEXT: s_branch .LBB3_5 336; GFX9-NEXT: .LBB3_4: 337; GFX9-NEXT: s_mov_b64 exec, 0 338; GFX9-NEXT: exp null off, off, off, off done vm 339; GFX9-NEXT: s_endpgm 340; GFX9-NEXT: .LBB3_5: 341; 342; GFX10-32-LABEL: wqm_demote_1: 343; GFX10-32: ; %bb.0: ; %.entry 344; GFX10-32-NEXT: s_mov_b32 s12, exec_lo 345; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo 346; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 347; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo 348; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 349; GFX10-32-NEXT: s_cbranch_execz .LBB3_3 350; GFX10-32-NEXT: ; %bb.1: ; %.demote 351; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo 352; GFX10-32-NEXT: 
s_cbranch_scc0 .LBB3_4 353; GFX10-32-NEXT: ; %bb.2: ; %.demote 354; GFX10-32-NEXT: s_wqm_b32 s14, s12 355; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 356; GFX10-32-NEXT: .LBB3_3: ; %.continue 357; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 358; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 359; GFX10-32-NEXT: s_waitcnt vmcnt(0) 360; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 361; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 362; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 363; GFX10-32-NEXT: s_waitcnt vmcnt(0) 364; GFX10-32-NEXT: s_branch .LBB3_5 365; GFX10-32-NEXT: .LBB3_4: 366; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 367; GFX10-32-NEXT: exp null off, off, off, off done vm 368; GFX10-32-NEXT: s_endpgm 369; GFX10-32-NEXT: .LBB3_5: 370; 371; GFX10-64-LABEL: wqm_demote_1: 372; GFX10-64: ; %bb.0: ; %.entry 373; GFX10-64-NEXT: s_mov_b64 s[12:13], exec 374; GFX10-64-NEXT: s_wqm_b64 exec, exec 375; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 376; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc 377; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 378; GFX10-64-NEXT: s_cbranch_execz .LBB3_3 379; GFX10-64-NEXT: ; %bb.1: ; %.demote 380; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 381; GFX10-64-NEXT: s_cbranch_scc0 .LBB3_4 382; GFX10-64-NEXT: ; %bb.2: ; %.demote 383; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] 384; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] 385; GFX10-64-NEXT: .LBB3_3: ; %.continue 386; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] 387; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 388; GFX10-64-NEXT: s_waitcnt vmcnt(0) 389; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 390; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] 391; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 392; GFX10-64-NEXT: s_waitcnt vmcnt(0) 393; GFX10-64-NEXT: s_branch .LBB3_5 394; GFX10-64-NEXT: .LBB3_4: 395; GFX10-64-NEXT: 
s_mov_b64 exec, 0 396; GFX10-64-NEXT: exp null off, off, off, off done vm 397; GFX10-64-NEXT: s_endpgm 398; GFX10-64-NEXT: .LBB3_5: 399.entry: 400 %z.cmp = fcmp olt float %z, 0.0 401 br i1 %z.cmp, label %.continue, label %.demote 402 403.demote: 404 call void @llvm.amdgcn.wqm.demote(i1 false) 405 br label %.continue 406 407.continue: 408 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 409 %tex0 = extractelement <4 x float> %tex, i32 0 410 %tex1 = extractelement <4 x float> %tex, i32 0 411 %coord1 = fadd float %tex0, %tex1 412 %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 413 414 ret <4 x float> %rtex 415} 416 417define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { 418; SI-LABEL: wqm_demote_2: 419; SI: ; %bb.0: ; %.entry 420; SI-NEXT: s_mov_b64 s[12:13], exec 421; SI-NEXT: s_wqm_b64 exec, exec 422; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 423; SI-NEXT: s_waitcnt vmcnt(0) 424; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 425; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc 426; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 427; SI-NEXT: s_cbranch_execz .LBB4_3 428; SI-NEXT: ; %bb.1: ; %.demote 429; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 430; SI-NEXT: s_cbranch_scc0 .LBB4_4 431; SI-NEXT: ; %bb.2: ; %.demote 432; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] 433; SI-NEXT: s_and_b64 exec, exec, s[16:17] 434; SI-NEXT: .LBB4_3: ; %.continue 435; SI-NEXT: s_or_b64 exec, exec, s[14:15] 436; SI-NEXT: v_add_f32_e32 v0, v0, v0 437; SI-NEXT: s_and_b64 exec, exec, s[12:13] 438; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 439; SI-NEXT: s_waitcnt vmcnt(0) 440; SI-NEXT: s_branch .LBB4_5 441; SI-NEXT: .LBB4_4: 442; SI-NEXT: s_mov_b64 exec, 0 443; SI-NEXT: exp null off, 
off, off, off done vm 444; SI-NEXT: s_endpgm 445; SI-NEXT: .LBB4_5: 446; 447; GFX9-LABEL: wqm_demote_2: 448; GFX9: ; %bb.0: ; %.entry 449; GFX9-NEXT: s_mov_b64 s[12:13], exec 450; GFX9-NEXT: s_wqm_b64 exec, exec 451; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 452; GFX9-NEXT: s_waitcnt vmcnt(0) 453; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 454; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc 455; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 456; GFX9-NEXT: s_cbranch_execz .LBB4_3 457; GFX9-NEXT: ; %bb.1: ; %.demote 458; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 459; GFX9-NEXT: s_cbranch_scc0 .LBB4_4 460; GFX9-NEXT: ; %bb.2: ; %.demote 461; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] 462; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] 463; GFX9-NEXT: .LBB4_3: ; %.continue 464; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] 465; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 466; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] 467; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 468; GFX9-NEXT: s_waitcnt vmcnt(0) 469; GFX9-NEXT: s_branch .LBB4_5 470; GFX9-NEXT: .LBB4_4: 471; GFX9-NEXT: s_mov_b64 exec, 0 472; GFX9-NEXT: exp null off, off, off, off done vm 473; GFX9-NEXT: s_endpgm 474; GFX9-NEXT: .LBB4_5: 475; 476; GFX10-32-LABEL: wqm_demote_2: 477; GFX10-32: ; %bb.0: ; %.entry 478; GFX10-32-NEXT: s_mov_b32 s12, exec_lo 479; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo 480; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 481; GFX10-32-NEXT: s_waitcnt vmcnt(0) 482; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 483; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo 484; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 485; GFX10-32-NEXT: s_cbranch_execz .LBB4_3 486; GFX10-32-NEXT: ; %bb.1: ; %.demote 487; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo 488; GFX10-32-NEXT: s_cbranch_scc0 .LBB4_4 489; GFX10-32-NEXT: ; %bb.2: ; %.demote 490; GFX10-32-NEXT: s_wqm_b32 s14, s12 491; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 492; GFX10-32-NEXT: 
.LBB4_3: ; %.continue 493; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 494; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 495; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 496; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 497; GFX10-32-NEXT: s_waitcnt vmcnt(0) 498; GFX10-32-NEXT: s_branch .LBB4_5 499; GFX10-32-NEXT: .LBB4_4: 500; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 501; GFX10-32-NEXT: exp null off, off, off, off done vm 502; GFX10-32-NEXT: s_endpgm 503; GFX10-32-NEXT: .LBB4_5: 504; 505; GFX10-64-LABEL: wqm_demote_2: 506; GFX10-64: ; %bb.0: ; %.entry 507; GFX10-64-NEXT: s_mov_b64 s[12:13], exec 508; GFX10-64-NEXT: s_wqm_b64 exec, exec 509; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 510; GFX10-64-NEXT: s_waitcnt vmcnt(0) 511; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 512; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc 513; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 514; GFX10-64-NEXT: s_cbranch_execz .LBB4_3 515; GFX10-64-NEXT: ; %bb.1: ; %.demote 516; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 517; GFX10-64-NEXT: s_cbranch_scc0 .LBB4_4 518; GFX10-64-NEXT: ; %bb.2: ; %.demote 519; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] 520; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] 521; GFX10-64-NEXT: .LBB4_3: ; %.continue 522; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] 523; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 524; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] 525; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 526; GFX10-64-NEXT: s_waitcnt vmcnt(0) 527; GFX10-64-NEXT: s_branch .LBB4_5 528; GFX10-64-NEXT: .LBB4_4: 529; GFX10-64-NEXT: s_mov_b64 exec, 0 530; GFX10-64-NEXT: exp null off, off, off, off done vm 531; GFX10-64-NEXT: s_endpgm 532; GFX10-64-NEXT: .LBB4_5: 533.entry: 534 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 535 %tex0 = 
extractelement <4 x float> %tex, i32 0 536 %tex1 = extractelement <4 x float> %tex, i32 0 537 %z.cmp = fcmp olt float %tex0, 0.0 538 br i1 %z.cmp, label %.continue, label %.demote 539 540.demote: 541 call void @llvm.amdgcn.wqm.demote(i1 false) 542 br label %.continue 543 544.continue: 545 %coord1 = fadd float %tex0, %tex1 546 %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 547 548 ret <4 x float> %rtex 549} 550 551define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { 552; SI-LABEL: wqm_demote_dynamic: 553; SI: ; %bb.0: ; %.entry 554; SI-NEXT: s_mov_b64 s[12:13], exec 555; SI-NEXT: s_wqm_b64 exec, exec 556; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 557; SI-NEXT: s_waitcnt vmcnt(0) 558; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 559; SI-NEXT: s_andn2_b64 s[14:15], exec, vcc 560; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] 561; SI-NEXT: s_cbranch_scc0 .LBB5_2 562; SI-NEXT: ; %bb.1: ; %.entry 563; SI-NEXT: s_wqm_b64 s[14:15], s[12:13] 564; SI-NEXT: s_and_b64 exec, exec, s[14:15] 565; SI-NEXT: v_add_f32_e32 v0, v0, v0 566; SI-NEXT: s_and_b64 exec, exec, s[12:13] 567; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 568; SI-NEXT: s_waitcnt vmcnt(0) 569; SI-NEXT: s_branch .LBB5_3 570; SI-NEXT: .LBB5_2: 571; SI-NEXT: s_mov_b64 exec, 0 572; SI-NEXT: exp null off, off, off, off done vm 573; SI-NEXT: s_endpgm 574; SI-NEXT: .LBB5_3: 575; 576; GFX9-LABEL: wqm_demote_dynamic: 577; GFX9: ; %bb.0: ; %.entry 578; GFX9-NEXT: s_mov_b64 s[12:13], exec 579; GFX9-NEXT: s_wqm_b64 exec, exec 580; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 581; GFX9-NEXT: s_waitcnt vmcnt(0) 582; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 583; GFX9-NEXT: s_andn2_b64 s[14:15], exec, vcc 584; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] 585; GFX9-NEXT: 
s_cbranch_scc0 .LBB5_2 586; GFX9-NEXT: ; %bb.1: ; %.entry 587; GFX9-NEXT: s_wqm_b64 s[14:15], s[12:13] 588; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] 589; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 590; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] 591; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 592; GFX9-NEXT: s_waitcnt vmcnt(0) 593; GFX9-NEXT: s_branch .LBB5_3 594; GFX9-NEXT: .LBB5_2: 595; GFX9-NEXT: s_mov_b64 exec, 0 596; GFX9-NEXT: exp null off, off, off, off done vm 597; GFX9-NEXT: s_endpgm 598; GFX9-NEXT: .LBB5_3: 599; 600; GFX10-32-LABEL: wqm_demote_dynamic: 601; GFX10-32: ; %bb.0: ; %.entry 602; GFX10-32-NEXT: s_mov_b32 s12, exec_lo 603; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo 604; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 605; GFX10-32-NEXT: s_waitcnt vmcnt(0) 606; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 607; GFX10-32-NEXT: s_andn2_b32 s13, exec_lo, vcc_lo 608; GFX10-32-NEXT: s_andn2_b32 s12, s12, s13 609; GFX10-32-NEXT: s_cbranch_scc0 .LBB5_2 610; GFX10-32-NEXT: ; %bb.1: ; %.entry 611; GFX10-32-NEXT: s_wqm_b32 s13, s12 612; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s13 613; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 614; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 615; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 616; GFX10-32-NEXT: s_waitcnt vmcnt(0) 617; GFX10-32-NEXT: s_branch .LBB5_3 618; GFX10-32-NEXT: .LBB5_2: 619; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 620; GFX10-32-NEXT: exp null off, off, off, off done vm 621; GFX10-32-NEXT: s_endpgm 622; GFX10-32-NEXT: .LBB5_3: 623; 624; GFX10-64-LABEL: wqm_demote_dynamic: 625; GFX10-64: ; %bb.0: ; %.entry 626; GFX10-64-NEXT: s_mov_b64 s[12:13], exec 627; GFX10-64-NEXT: s_wqm_b64 exec, exec 628; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 629; GFX10-64-NEXT: s_waitcnt vmcnt(0) 630; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 631; GFX10-64-NEXT: s_andn2_b64 s[14:15], exec, vcc 
632; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] 633; GFX10-64-NEXT: s_cbranch_scc0 .LBB5_2 634; GFX10-64-NEXT: ; %bb.1: ; %.entry 635; GFX10-64-NEXT: s_wqm_b64 s[14:15], s[12:13] 636; GFX10-64-NEXT: s_and_b64 exec, exec, s[14:15] 637; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 638; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] 639; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 640; GFX10-64-NEXT: s_waitcnt vmcnt(0) 641; GFX10-64-NEXT: s_branch .LBB5_3 642; GFX10-64-NEXT: .LBB5_2: 643; GFX10-64-NEXT: s_mov_b64 exec, 0 644; GFX10-64-NEXT: exp null off, off, off, off done vm 645; GFX10-64-NEXT: s_endpgm 646; GFX10-64-NEXT: .LBB5_3: 647.entry: 648 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 649 %tex0 = extractelement <4 x float> %tex, i32 0 650 %tex1 = extractelement <4 x float> %tex, i32 0 651 %z.cmp = fcmp olt float %tex0, 0.0 652 call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) 653 %coord1 = fadd float %tex0, %tex1 654 %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 655 656 ret <4 x float> %rtex 657} 658 659define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { 660; SI-LABEL: wqm_deriv: 661; SI: ; %bb.0: ; %.entry 662; SI-NEXT: s_mov_b64 s[0:1], exec 663; SI-NEXT: s_wqm_b64 exec, exec 664; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 665; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 666; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc 667; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 668; SI-NEXT: s_cbranch_execz .LBB6_3 669; SI-NEXT: ; %bb.1: ; %.demote0 670; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 671; SI-NEXT: s_cbranch_scc0 .LBB6_7 672; SI-NEXT: ; %bb.2: ; %.demote0 673; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] 674; SI-NEXT: s_and_b64 exec, exec, s[4:5] 675; SI-NEXT: .LBB6_3: ; %.continue0 676; SI-NEXT: s_or_b64 exec, exec, s[2:3] 
677; SI-NEXT: s_mov_b64 s[2:3], s[0:1] 678; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] 679; SI-NEXT: v_mov_b32_e32 v1, v0 680; SI-NEXT: s_nop 1 681; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 682; SI-NEXT: s_nop 1 683; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 684; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 685; SI-NEXT: s_and_b64 exec, exec, s[0:1] 686; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 687; SI-NEXT: s_and_b64 s[2:3], s[0:1], vcc 688; SI-NEXT: s_xor_b64 s[2:3], s[2:3], -1 689; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] 690; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] 691; SI-NEXT: s_cbranch_execz .LBB6_6 692; SI-NEXT: ; %bb.4: ; %.demote1 693; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 694; SI-NEXT: s_cbranch_scc0 .LBB6_7 695; SI-NEXT: ; %bb.5: ; %.demote1 696; SI-NEXT: s_mov_b64 exec, 0 697; SI-NEXT: .LBB6_6: ; %.continue1 698; SI-NEXT: s_or_b64 exec, exec, s[2:3] 699; SI-NEXT: v_mov_b32_e32 v0, 0x3c00 700; SI-NEXT: v_bfrev_b32_e32 v1, 60 701; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm 702; SI-NEXT: s_endpgm 703; SI-NEXT: .LBB6_7: 704; SI-NEXT: s_mov_b64 exec, 0 705; SI-NEXT: exp null off, off, off, off done vm 706; SI-NEXT: s_endpgm 707; 708; GFX9-LABEL: wqm_deriv: 709; GFX9: ; %bb.0: ; %.entry 710; GFX9-NEXT: s_mov_b64 s[0:1], exec 711; GFX9-NEXT: s_wqm_b64 exec, exec 712; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 713; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 714; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 715; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 716; GFX9-NEXT: s_cbranch_execz .LBB6_3 717; GFX9-NEXT: ; %bb.1: ; %.demote0 718; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 719; GFX9-NEXT: s_cbranch_scc0 .LBB6_7 720; GFX9-NEXT: ; %bb.2: ; %.demote0 721; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] 722; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] 723; GFX9-NEXT: .LBB6_3: ; %.continue0 724; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 725; GFX9-NEXT: s_mov_b64 s[2:3], 
s[0:1] 726; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] 727; GFX9-NEXT: v_mov_b32_e32 v1, v0 728; GFX9-NEXT: s_nop 1 729; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 730; GFX9-NEXT: s_nop 1 731; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 732; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 733; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] 734; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 735; GFX9-NEXT: s_and_b64 s[2:3], s[0:1], vcc 736; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], -1 737; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] 738; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] 739; GFX9-NEXT: s_cbranch_execz .LBB6_6 740; GFX9-NEXT: ; %bb.4: ; %.demote1 741; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 742; GFX9-NEXT: s_cbranch_scc0 .LBB6_7 743; GFX9-NEXT: ; %bb.5: ; %.demote1 744; GFX9-NEXT: s_mov_b64 exec, 0 745; GFX9-NEXT: .LBB6_6: ; %.continue1 746; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 747; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 748; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 749; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm 750; GFX9-NEXT: s_endpgm 751; GFX9-NEXT: .LBB6_7: 752; GFX9-NEXT: s_mov_b64 exec, 0 753; GFX9-NEXT: exp null off, off, off, off done vm 754; GFX9-NEXT: s_endpgm 755; 756; GFX10-32-LABEL: wqm_deriv: 757; GFX10-32: ; %bb.0: ; %.entry 758; GFX10-32-NEXT: s_mov_b32 s0, exec_lo 759; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo 760; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 761; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 762; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo 763; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 764; GFX10-32-NEXT: s_cbranch_execz .LBB6_3 765; GFX10-32-NEXT: ; %bb.1: ; %.demote0 766; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo 767; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_7 768; GFX10-32-NEXT: ; %bb.2: ; %.demote0 769; GFX10-32-NEXT: s_wqm_b32 s2, s0 770; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 771; GFX10-32-NEXT: .LBB6_3: ; %.continue0 
772; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 773; GFX10-32-NEXT: s_mov_b32 s1, s0 774; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 775; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 776; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 777; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 778; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 779; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 780; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 781; GFX10-32-NEXT: s_and_b32 s1, s0, vcc_lo 782; GFX10-32-NEXT: s_xor_b32 s1, s1, -1 783; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 784; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 785; GFX10-32-NEXT: s_cbranch_execz .LBB6_6 786; GFX10-32-NEXT: ; %bb.4: ; %.demote1 787; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo 788; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_7 789; GFX10-32-NEXT: ; %bb.5: ; %.demote1 790; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 791; GFX10-32-NEXT: .LBB6_6: ; %.continue1 792; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 793; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 794; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 795; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm 796; GFX10-32-NEXT: s_endpgm 797; GFX10-32-NEXT: .LBB6_7: 798; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 799; GFX10-32-NEXT: exp null off, off, off, off done vm 800; GFX10-32-NEXT: s_endpgm 801; 802; GFX10-64-LABEL: wqm_deriv: 803; GFX10-64: ; %bb.0: ; %.entry 804; GFX10-64-NEXT: s_mov_b64 s[0:1], exec 805; GFX10-64-NEXT: s_wqm_b64 exec, exec 806; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 807; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 808; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc 809; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 810; GFX10-64-NEXT: s_cbranch_execz .LBB6_3 811; GFX10-64-NEXT: ; %bb.1: ; %.demote0 812; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 813; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_7 814; GFX10-64-NEXT: ; %bb.2: ; %.demote0 815; 
GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] 816; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] 817; GFX10-64-NEXT: .LBB6_3: ; %.continue0 818; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] 819; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] 820; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] 821; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 822; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 823; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 824; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 825; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] 826; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 827; GFX10-64-NEXT: s_and_b64 s[2:3], s[0:1], vcc 828; GFX10-64-NEXT: s_xor_b64 s[2:3], s[2:3], -1 829; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] 830; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] 831; GFX10-64-NEXT: s_cbranch_execz .LBB6_6 832; GFX10-64-NEXT: ; %bb.4: ; %.demote1 833; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 834; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_7 835; GFX10-64-NEXT: ; %bb.5: ; %.demote1 836; GFX10-64-NEXT: s_mov_b64 exec, 0 837; GFX10-64-NEXT: .LBB6_6: ; %.continue1 838; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] 839; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 840; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 841; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm 842; GFX10-64-NEXT: s_endpgm 843; GFX10-64-NEXT: .LBB6_7: 844; GFX10-64-NEXT: s_mov_b64 exec, 0 845; GFX10-64-NEXT: exp null off, off, off, off done vm 846; GFX10-64-NEXT: s_endpgm 847.entry: 848 %p0 = extractelement <2 x float> %input, i32 0 849 %p1 = extractelement <2 x float> %input, i32 1 850 %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 0, i32 0, i32 %index) #2 851 %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 0, i32 0, i32 %index) #2 852 %argi = fptosi float %arg to i32 853 %cond0 = icmp eq i32 %argi, 0 854 br i1 %cond0, label %.continue0, label %.demote0 855 
.demote0:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue0

.continue0:
  %live = call i1 @llvm.amdgcn.live.mask()
  %live.cond = select i1 %live, i32 0, i32 1065353216
  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
  %live.v0f = bitcast i32 %live.v0 to float
  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
  %live.v1f = bitcast i32 %live.v1 to float
  %v0 = fsub float %live.v0f, %live.v1f
  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
  %cond2 = and i1 %live, %cond1
  br i1 %cond2, label %.continue1, label %.demote1

.demote1:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue1

.continue1:
  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 true, i1 true) #3
  ret void
}

; wqm_deriv_loop - amdgpu_ps function that calls llvm.amdgcn.wqm.demote once
; ahead of a loop (%.demote0) and once inside it (%.demote1). Every iteration
; re-reads llvm.amdgcn.live.mask, builds a difference of two quad-permuted DPP
; moves of a live-dependent value and passes it through llvm.amdgcn.wqm.f32
; before deciding whether to demote again. Presumably this exercises demote
; handling across a loop back edge - confirm against the commit that added it.
;
; The per-target assertion blocks that follow are autogenerated by
; utils/update_llc_test_checks.py (see the first line of this file); regenerate
; them with that script rather than editing them by hand.
define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) {
; SI-LABEL: wqm_deriv_loop:
; SI: ; %bb.0: ; %.entry
; SI-NEXT: s_mov_b64 s[0:1], exec
; SI-NEXT: s_wqm_b64 exec, exec
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; SI-NEXT: s_cbranch_execz .LBB7_3
; SI-NEXT: ; %bb.1: ; %.demote0
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; SI-NEXT: s_cbranch_scc0 .LBB7_9
; SI-NEXT: ; %bb.2: ; %.demote0
; SI-NEXT: s_wqm_b64 s[6:7], s[0:1]
; SI-NEXT: s_and_b64 exec, exec, s[6:7]
; SI-NEXT: .LBB7_3: ; %.continue0.preheader
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
; SI-NEXT: s_mov_b64 s[2:3], 0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_branch .LBB7_5
; SI-NEXT: .LBB7_4: ; %.continue1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; SI-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; SI-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; SI-NEXT: s_andn2_b64 exec, exec, s[2:3]
; SI-NEXT: s_cbranch_execz .LBB7_8
; SI-NEXT: .LBB7_5: ; %.continue0
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v3, v2
; SI-NEXT: s_nop 1
; SI-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT: s_nop 1
; SI-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc
; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; SI-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SI-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; SI-NEXT: s_cbranch_execz .LBB7_4
; SI-NEXT: ; %bb.6: ; %.demote1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; SI-NEXT: s_cbranch_scc0 .LBB7_9
; SI-NEXT: ; %bb.7: ; %.demote1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_wqm_b64 s[6:7], s[0:1]
; SI-NEXT: s_and_b64 exec, exec, s[6:7]
; SI-NEXT: s_branch .LBB7_4
; SI-NEXT: .LBB7_8: ; %.return
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
; SI-NEXT: s_and_b64 exec, exec, s[0:1]
; SI-NEXT: v_mov_b32_e32 v0, 0x3c00
; SI-NEXT: v_bfrev_b32_e32 v1, 60
; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; SI-NEXT: s_endpgm
; SI-NEXT: .LBB7_9:
; SI-NEXT: s_mov_b64 exec, 0
; SI-NEXT: exp null off, off, off, off done vm
; SI-NEXT: s_endpgm
;
; GFX9-LABEL: wqm_deriv_loop:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: s_wqm_b64 exec, exec
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX9-NEXT: s_cbranch_execz .LBB7_3
; GFX9-NEXT: ; %bb.1: ; %.demote0
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX9-NEXT: s_cbranch_scc0 .LBB7_9
; GFX9-NEXT: ; %bb.2: ; %.demote0
; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_branch .LBB7_5
; GFX9-NEXT: .LBB7_4: ; %.continue1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_add_u32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execz .LBB7_8
; GFX9-NEXT: .LBB7_5: ; %.continue0
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v3, v2
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
; GFX9-NEXT: s_and_b64 s[4:5], s[0:1], vcc
; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GFX9-NEXT: s_cbranch_execz .LBB7_4
; GFX9-NEXT: ; %bb.6: ; %.demote1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX9-NEXT: s_cbranch_scc0 .LBB7_9
; GFX9-NEXT: ; %bb.7: ; %.demote1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1]
; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
; GFX9-NEXT: s_branch .LBB7_4
; GFX9-NEXT: .LBB7_8: ; %.return
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX9-NEXT: v_bfrev_b32_e32 v1, 60
; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB7_9:
; GFX9-NEXT: s_mov_b64 exec, 0
; GFX9-NEXT: exp null off, off, off, off done vm
; GFX9-NEXT: s_endpgm
;
; GFX10-32-LABEL: wqm_deriv_loop:
; GFX10-32: ; %bb.0: ; %.entry
; GFX10-32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-32-NEXT: s_mov_b32 s1, 0
; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX10-32-NEXT: s_cbranch_execz .LBB7_3
; GFX10-32-NEXT: ; %bb.1: ; %.demote0
; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-32-NEXT: ; %bb.2: ; %.demote0
; GFX10-32-NEXT: s_wqm_b32 s3, s0
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX10-32-NEXT: v_mov_b32_e32 v0, s1
; GFX10-32-NEXT: s_branch .LBB7_5
; GFX10-32-NEXT: .LBB7_4: ; %.continue1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX10-32-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v1
; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_cbranch_execz .LBB7_8
; GFX10-32-NEXT: .LBB7_5: ; %.continue0
; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-32-NEXT: s_mov_b32 s2, s0
; GFX10-32-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2
; GFX10-32-NEXT: v_mov_b32_e32 v3, v2
; GFX10-32-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
; GFX10-32-NEXT: s_and_b32 s2, s0, vcc_lo
; GFX10-32-NEXT: s_xor_b32 s2, s2, -1
; GFX10-32-NEXT: s_and_saveexec_b32 s3, s2
; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s3
; GFX10-32-NEXT: s_cbranch_execz .LBB7_4
; GFX10-32-NEXT: ; %bb.6: ; %.demote1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-32-NEXT: ; %bb.7: ; %.demote1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_wqm_b32 s3, s0
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT: s_branch .LBB7_4
; GFX10-32-NEXT: .LBB7_8: ; %.return
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60
; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; GFX10-32-NEXT: s_endpgm
; GFX10-32-NEXT: .LBB7_9:
; GFX10-32-NEXT: s_mov_b32 exec_lo, 0
; GFX10-32-NEXT: exp null off, off, off, off done vm
; GFX10-32-NEXT: s_endpgm
;
; GFX10-64-LABEL: wqm_deriv_loop:
; GFX10-64: ; %bb.0: ; %.entry
; GFX10-64-NEXT: s_mov_b64 s[0:1], exec
; GFX10-64-NEXT: s_wqm_b64 exec, exec
; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-64-NEXT: s_mov_b32 s4, 0
; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX10-64-NEXT: s_cbranch_execz .LBB7_3
; GFX10-64-NEXT: ; %bb.1: ; %.demote0
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-64-NEXT: ; %bb.2: ; %.demote0
; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader
; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX10-64-NEXT: v_mov_b32_e32 v0, s4
; GFX10-64-NEXT: s_mov_b64 s[2:3], 0
; GFX10-64-NEXT: s_branch .LBB7_5
; GFX10-64-NEXT: .LBB7_4: ; %.continue1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX10-64-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
; GFX10-64-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX10-64-NEXT: s_cbranch_execz .LBB7_8
; GFX10-64-NEXT: .LBB7_5: ; %.continue0
; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
; GFX10-64-NEXT: v_mov_b32_e32 v3, v2
; GFX10-64-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
; GFX10-64-NEXT: s_and_b64 s[4:5], s[0:1], vcc
; GFX10-64-NEXT: s_xor_b64 s[4:5], s[4:5], -1
; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[6:7]
; GFX10-64-NEXT: s_cbranch_execz .LBB7_4
; GFX10-64-NEXT: ; %bb.6: ; %.demote1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-64-NEXT: ; %bb.7: ; %.demote1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
; GFX10-64-NEXT: s_branch .LBB7_4
; GFX10-64-NEXT: .LBB7_8: ; %.return
; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60
; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; GFX10-64-NEXT: s_endpgm
; GFX10-64-NEXT: .LBB7_9:
; GFX10-64-NEXT: s_mov_b64 exec, 0
; GFX10-64-NEXT: exp null off, off, off, off done vm
; GFX10-64-NEXT: s_endpgm
.entry:
  ; %x1 is not referenced again anywhere in this function.
  %p0 = extractelement <2 x float> %input, i32 0
  %p1 = extractelement <2 x float> %input, i32 1
  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 0, i32 0, i32 %index) #2
  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 0, i32 0, i32 %index) #2
  ; A non-zero integer value of %arg takes the %.demote0 path before the loop.
  %argi = fptosi float %arg to i32
  %cond0 = icmp eq i32 %argi, 0
  br i1 %cond0, label %.continue0, label %.demote0

.demote0:
  ; Demote with a constant-false argument. In the expected assembly above this
  ; block clears the active lanes from the register(s) copied from exec in
  ; %bb.0 (s_andn2) and takes the null-export exit block when SCC is 0.
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue0

.continue0:
  ; Loop header. %count is 0 when arriving from %.entry or %.demote0 and %next
  ; when arriving over the back edge from %.continue1.
  %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ]
  %live = call i1 @llvm.amdgcn.live.mask()
  ; Lanes reported live contribute 0; all other lanes contribute %count.
  %live.cond = select i1 %live, i32 0, i32 %count
  ; dpp_ctrl 85 (0x55) and 0 appear in the expected assembly as
  ; quad_perm:[1,1,1,1] and quad_perm:[0,0,0,0]; row_mask and bank_mask are
  ; both 15 (0xf) and bound_ctrl is set.
  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
  %live.v0f = bitcast i32 %live.v0 to float
  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
  %live.v1f = bitcast i32 %live.v1 to float
  ; Both operands are i32 bit patterns reinterpreted as float by the bitcasts.
  %v0 = fsub float %live.v0f, %live.v1f
  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
  ; Skip %.demote1 only when the lane is live and the difference compares
  ; ordered-equal to 0.0.
  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
  %cond2 = and i1 %live, %cond1
  br i1 %cond2, label %.continue1, label %.demote1

.demote1:
  ; Second demote, inside the loop, again with a constant-false argument.
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue1

.continue1:
  ; Loop latch - keep iterating while %next is (signed) less than %limit.
  %next = add i32 %count, 1
  %loop.cond = icmp slt i32 %next, %limit
  br i1 %loop.cond, label %.continue0, label %.return

.return:
  ; Compressed half-precision export; the expected assembly prints it as
  ; "exp mrt0 v0, v0, v1, v1 done compr vm".
  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 true, i1 true) #3
  ret void
}

; Intrinsic declarations. The trailing #N on each line names one of the
; attribute groups defined at the end of the file.
declare void @llvm.amdgcn.wqm.demote(i1) #0
declare i1 @llvm.amdgcn.live.mask() #0
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare float @llvm.amdgcn.wqm.f32(float) #1
declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4

; Attribute groups used by the declarations above; #2 and #3 are also attached
; directly to call sites in the function bodies.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind readnone speculatable }
attributes #3 = { inaccessiblememonly nounwind }
attributes #4 = { convergent nounwind readnone }