1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s 3; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s 5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s 6 7define amdgpu_ps void @static_exact(float %arg0, float %arg1) { 8; SI-LABEL: static_exact: 9; SI: ; %bb.0: ; %.entry 10; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 11; SI-NEXT: s_andn2_b64 exec, exec, exec 12; SI-NEXT: s_cbranch_scc0 .LBB0_2 13; SI-NEXT: ; %bb.1: ; %.entry 14; SI-NEXT: s_mov_b64 exec, 0 15; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 16; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm 17; SI-NEXT: s_endpgm 18; SI-NEXT: .LBB0_2: 19; SI-NEXT: s_mov_b64 exec, 0 20; SI-NEXT: exp null off, off, off, off done vm 21; SI-NEXT: s_endpgm 22; 23; GFX9-LABEL: static_exact: 24; GFX9: ; %bb.0: ; %.entry 25; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 26; GFX9-NEXT: s_andn2_b64 exec, exec, exec 27; GFX9-NEXT: s_cbranch_scc0 .LBB0_2 28; GFX9-NEXT: ; %bb.1: ; %.entry 29; GFX9-NEXT: s_mov_b64 exec, 0 30; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 31; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm 32; GFX9-NEXT: s_endpgm 33; GFX9-NEXT: .LBB0_2: 34; GFX9-NEXT: s_mov_b64 exec, 0 35; GFX9-NEXT: exp null off, off, off, off done vm 36; GFX9-NEXT: s_endpgm 37; 38; GFX10-32-LABEL: static_exact: 39; GFX10-32: ; %bb.0: ; %.entry 40; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 41; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, exec_lo 42; GFX10-32-NEXT: s_cbranch_scc0 .LBB0_2 43; GFX10-32-NEXT: ; %bb.1: ; %.entry 44; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 45; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo 46; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm 
47; GFX10-32-NEXT: s_endpgm 48; GFX10-32-NEXT: .LBB0_2: 49; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 50; GFX10-32-NEXT: exp null off, off, off, off done vm 51; GFX10-32-NEXT: s_endpgm 52; 53; GFX10-64-LABEL: static_exact: 54; GFX10-64: ; %bb.0: ; %.entry 55; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 56; GFX10-64-NEXT: s_andn2_b64 exec, exec, exec 57; GFX10-64-NEXT: s_cbranch_scc0 .LBB0_2 58; GFX10-64-NEXT: ; %bb.1: ; %.entry 59; GFX10-64-NEXT: s_mov_b64 exec, 0 60; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 61; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm 62; GFX10-64-NEXT: s_endpgm 63; GFX10-64-NEXT: .LBB0_2: 64; GFX10-64-NEXT: s_mov_b64 exec, 0 65; GFX10-64-NEXT: exp null off, off, off, off done vm 66; GFX10-64-NEXT: s_endpgm 67.entry: 68 %c0 = fcmp olt float %arg0, 0.000000e+00 69 %c1 = fcmp oge float %arg1, 0.0 70 call void @llvm.amdgcn.wqm.demote(i1 false) 71 %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 72 call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 73 ret void 74} 75 76define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { 77; SI-LABEL: dynamic_exact: 78; SI: ; %bb.0: ; %.entry 79; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 80; SI-NEXT: s_mov_b64 s[2:3], exec 81; SI-NEXT: s_andn2_b64 s[0:1], exec, s[0:1] 82; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] 83; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 84; SI-NEXT: s_cbranch_scc0 .LBB1_2 85; SI-NEXT: ; %bb.1: ; %.entry 86; SI-NEXT: s_and_b64 exec, exec, s[2:3] 87; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 88; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm 89; SI-NEXT: s_endpgm 90; SI-NEXT: .LBB1_2: 91; SI-NEXT: s_mov_b64 exec, 0 92; SI-NEXT: exp null off, off, off, off done vm 93; SI-NEXT: s_endpgm 94; 95; GFX9-LABEL: dynamic_exact: 96; GFX9: ; %bb.0: ; %.entry 97; GFX9-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 98; GFX9-NEXT: s_mov_b64 s[2:3], exec 99; GFX9-NEXT: s_andn2_b64 s[0:1], exec, s[0:1] 100; GFX9-NEXT: s_andn2_b64 
s[2:3], s[2:3], s[0:1] 101; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 102; GFX9-NEXT: s_cbranch_scc0 .LBB1_2 103; GFX9-NEXT: ; %bb.1: ; %.entry 104; GFX9-NEXT: s_and_b64 exec, exec, s[2:3] 105; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 106; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm 107; GFX9-NEXT: s_endpgm 108; GFX9-NEXT: .LBB1_2: 109; GFX9-NEXT: s_mov_b64 exec, 0 110; GFX9-NEXT: exp null off, off, off, off done vm 111; GFX9-NEXT: s_endpgm 112; 113; GFX10-32-LABEL: dynamic_exact: 114; GFX10-32: ; %bb.0: ; %.entry 115; GFX10-32-NEXT: v_cmp_le_f32_e64 s0, 0, v1 116; GFX10-32-NEXT: s_mov_b32 s1, exec_lo 117; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 118; GFX10-32-NEXT: s_andn2_b32 s0, exec_lo, s0 119; GFX10-32-NEXT: s_andn2_b32 s1, s1, s0 120; GFX10-32-NEXT: s_cbranch_scc0 .LBB1_2 121; GFX10-32-NEXT: ; %bb.1: ; %.entry 122; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s1 123; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo 124; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm 125; GFX10-32-NEXT: s_endpgm 126; GFX10-32-NEXT: .LBB1_2: 127; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 128; GFX10-32-NEXT: exp null off, off, off, off done vm 129; GFX10-32-NEXT: s_endpgm 130; 131; GFX10-64-LABEL: dynamic_exact: 132; GFX10-64: ; %bb.0: ; %.entry 133; GFX10-64-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 134; GFX10-64-NEXT: s_mov_b64 s[2:3], exec 135; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 136; GFX10-64-NEXT: s_andn2_b64 s[0:1], exec, s[0:1] 137; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] 138; GFX10-64-NEXT: s_cbranch_scc0 .LBB1_2 139; GFX10-64-NEXT: ; %bb.1: ; %.entry 140; GFX10-64-NEXT: s_and_b64 exec, exec, s[2:3] 141; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 142; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm 143; GFX10-64-NEXT: s_endpgm 144; GFX10-64-NEXT: .LBB1_2: 145; GFX10-64-NEXT: s_mov_b64 exec, 0 146; GFX10-64-NEXT: exp null off, off, off, off done vm 147; GFX10-64-NEXT: s_endpgm 148.entry: 149 %c0 = fcmp olt float %arg0, 0.000000e+00 150 %c1 = fcmp oge 
float %arg1, 0.0 151 call void @llvm.amdgcn.wqm.demote(i1 %c1) 152 %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 153 call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 154 ret void 155} 156 157define amdgpu_ps void @branch(float %arg0, float %arg1) { 158; SI-LABEL: branch: 159; SI: ; %bb.0: ; %.entry 160; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 161; SI-NEXT: v_cvt_i32_f32_e32 v1, v1 162; SI-NEXT: s_mov_b64 s[2:3], exec 163; SI-NEXT: v_or_b32_e32 v0, v0, v1 164; SI-NEXT: v_and_b32_e32 v0, 1, v0 165; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 166; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 167; SI-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] 168; SI-NEXT: s_xor_b64 s[0:1], exec, s[4:5] 169; SI-NEXT: s_cbranch_execz .LBB2_3 170; SI-NEXT: ; %bb.1: ; %.demote 171; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec 172; SI-NEXT: s_cbranch_scc0 .LBB2_4 173; SI-NEXT: ; %bb.2: ; %.demote 174; SI-NEXT: s_mov_b64 exec, 0 175; SI-NEXT: .LBB2_3: ; %.continue 176; SI-NEXT: s_or_b64 exec, exec, s[0:1] 177; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 178; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm 179; SI-NEXT: s_endpgm 180; SI-NEXT: .LBB2_4: 181; SI-NEXT: s_mov_b64 exec, 0 182; SI-NEXT: exp null off, off, off, off done vm 183; SI-NEXT: s_endpgm 184; 185; GFX9-LABEL: branch: 186; GFX9: ; %bb.0: ; %.entry 187; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 188; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 189; GFX9-NEXT: s_mov_b64 s[2:3], exec 190; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 191; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 192; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 193; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 194; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] 195; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[4:5] 196; GFX9-NEXT: s_cbranch_execz .LBB2_3 197; GFX9-NEXT: ; %bb.1: ; %.demote 198; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], exec 199; GFX9-NEXT: s_cbranch_scc0 .LBB2_4 200; GFX9-NEXT: ; %bb.2: ; %.demote 201; GFX9-NEXT: s_mov_b64 exec, 0 202; 
GFX9-NEXT: .LBB2_3: ; %.continue 203; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] 204; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 205; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm 206; GFX9-NEXT: s_endpgm 207; GFX9-NEXT: .LBB2_4: 208; GFX9-NEXT: s_mov_b64 exec, 0 209; GFX9-NEXT: exp null off, off, off, off done vm 210; GFX9-NEXT: s_endpgm 211; 212; GFX10-32-LABEL: branch: 213; GFX10-32: ; %bb.0: ; %.entry 214; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 215; GFX10-32-NEXT: v_cvt_i32_f32_e32 v1, v1 216; GFX10-32-NEXT: s_mov_b32 s1, exec_lo 217; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 218; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 219; GFX10-32-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 220; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 221; GFX10-32-NEXT: s_and_saveexec_b32 s2, s0 222; GFX10-32-NEXT: s_xor_b32 s0, exec_lo, s2 223; GFX10-32-NEXT: s_cbranch_execz .LBB2_3 224; GFX10-32-NEXT: ; %bb.1: ; %.demote 225; GFX10-32-NEXT: s_andn2_b32 s1, s1, exec_lo 226; GFX10-32-NEXT: s_cbranch_scc0 .LBB2_4 227; GFX10-32-NEXT: ; %bb.2: ; %.demote 228; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 229; GFX10-32-NEXT: .LBB2_3: ; %.continue 230; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s0 231; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo 232; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm 233; GFX10-32-NEXT: s_endpgm 234; GFX10-32-NEXT: .LBB2_4: 235; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 236; GFX10-32-NEXT: exp null off, off, off, off done vm 237; GFX10-32-NEXT: s_endpgm 238; 239; GFX10-64-LABEL: branch: 240; GFX10-64: ; %bb.0: ; %.entry 241; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 242; GFX10-64-NEXT: v_cvt_i32_f32_e32 v1, v1 243; GFX10-64-NEXT: s_mov_b64 s[2:3], exec 244; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 245; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 246; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 247; GFX10-64-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 248; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] 249; GFX10-64-NEXT: s_xor_b64 s[0:1], exec, s[4:5] 250; GFX10-64-NEXT: s_cbranch_execz .LBB2_3 251; 
GFX10-64-NEXT: ; %bb.1: ; %.demote 252; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec 253; GFX10-64-NEXT: s_cbranch_scc0 .LBB2_4 254; GFX10-64-NEXT: ; %bb.2: ; %.demote 255; GFX10-64-NEXT: s_mov_b64 exec, 0 256; GFX10-64-NEXT: .LBB2_3: ; %.continue 257; GFX10-64-NEXT: s_or_b64 exec, exec, s[0:1] 258; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc 259; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm 260; GFX10-64-NEXT: s_endpgm 261; GFX10-64-NEXT: .LBB2_4: 262; GFX10-64-NEXT: s_mov_b64 exec, 0 263; GFX10-64-NEXT: exp null off, off, off, off done vm 264; GFX10-64-NEXT: s_endpgm 265.entry: 266 %i0 = fptosi float %arg0 to i32 267 %i1 = fptosi float %arg1 to i32 268 %c0 = or i32 %i0, %i1 269 %c1 = and i32 %c0, 1 270 %c2 = icmp eq i32 %c1, 0 271 br i1 %c2, label %.continue, label %.demote 272 273.demote: 274 call void @llvm.amdgcn.wqm.demote(i1 false) 275 br label %.continue 276 277.continue: 278 %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 279 call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 280 ret void 281} 282 283 284define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { 285; SI-LABEL: wqm_demote_1: 286; SI: ; %bb.0: ; %.entry 287; SI-NEXT: s_mov_b64 s[12:13], exec 288; SI-NEXT: s_wqm_b64 exec, exec 289; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 290; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc 291; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 292; SI-NEXT: s_cbranch_execz .LBB3_3 293; SI-NEXT: ; %bb.1: ; %.demote 294; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 295; SI-NEXT: s_cbranch_scc0 .LBB3_4 296; SI-NEXT: ; %bb.2: ; %.demote 297; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] 298; SI-NEXT: s_and_b64 exec, exec, s[16:17] 299; SI-NEXT: .LBB3_3: ; %.continue 300; SI-NEXT: s_or_b64 exec, exec, s[14:15] 301; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 302; SI-NEXT: s_waitcnt 
vmcnt(0) 303; SI-NEXT: v_add_f32_e32 v0, v0, v0 304; SI-NEXT: s_and_b64 exec, exec, s[12:13] 305; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 306; SI-NEXT: s_waitcnt vmcnt(0) 307; SI-NEXT: s_branch .LBB3_5 308; SI-NEXT: .LBB3_4: 309; SI-NEXT: s_mov_b64 exec, 0 310; SI-NEXT: exp null off, off, off, off done vm 311; SI-NEXT: s_endpgm 312; SI-NEXT: .LBB3_5: 313; 314; GFX9-LABEL: wqm_demote_1: 315; GFX9: ; %bb.0: ; %.entry 316; GFX9-NEXT: s_mov_b64 s[12:13], exec 317; GFX9-NEXT: s_wqm_b64 exec, exec 318; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 319; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc 320; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 321; GFX9-NEXT: s_cbranch_execz .LBB3_3 322; GFX9-NEXT: ; %bb.1: ; %.demote 323; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 324; GFX9-NEXT: s_cbranch_scc0 .LBB3_4 325; GFX9-NEXT: ; %bb.2: ; %.demote 326; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] 327; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] 328; GFX9-NEXT: .LBB3_3: ; %.continue 329; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] 330; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 331; GFX9-NEXT: s_waitcnt vmcnt(0) 332; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 333; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] 334; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 335; GFX9-NEXT: s_waitcnt vmcnt(0) 336; GFX9-NEXT: s_branch .LBB3_5 337; GFX9-NEXT: .LBB3_4: 338; GFX9-NEXT: s_mov_b64 exec, 0 339; GFX9-NEXT: exp null off, off, off, off done vm 340; GFX9-NEXT: s_endpgm 341; GFX9-NEXT: .LBB3_5: 342; 343; GFX10-32-LABEL: wqm_demote_1: 344; GFX10-32: ; %bb.0: ; %.entry 345; GFX10-32-NEXT: s_mov_b32 s12, exec_lo 346; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo 347; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 348; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo 349; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 350; GFX10-32-NEXT: s_cbranch_execz .LBB3_3 351; GFX10-32-NEXT: ; %bb.1: ; %.demote 352; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo 353; GFX10-32-NEXT: 
s_cbranch_scc0 .LBB3_4 354; GFX10-32-NEXT: ; %bb.2: ; %.demote 355; GFX10-32-NEXT: s_wqm_b32 s14, s12 356; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 357; GFX10-32-NEXT: .LBB3_3: ; %.continue 358; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 359; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 360; GFX10-32-NEXT: s_waitcnt vmcnt(0) 361; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 362; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 363; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 364; GFX10-32-NEXT: s_waitcnt vmcnt(0) 365; GFX10-32-NEXT: s_branch .LBB3_5 366; GFX10-32-NEXT: .LBB3_4: 367; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 368; GFX10-32-NEXT: exp null off, off, off, off done vm 369; GFX10-32-NEXT: s_endpgm 370; GFX10-32-NEXT: .LBB3_5: 371; 372; GFX10-64-LABEL: wqm_demote_1: 373; GFX10-64: ; %bb.0: ; %.entry 374; GFX10-64-NEXT: s_mov_b64 s[12:13], exec 375; GFX10-64-NEXT: s_wqm_b64 exec, exec 376; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 377; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc 378; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 379; GFX10-64-NEXT: s_cbranch_execz .LBB3_3 380; GFX10-64-NEXT: ; %bb.1: ; %.demote 381; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 382; GFX10-64-NEXT: s_cbranch_scc0 .LBB3_4 383; GFX10-64-NEXT: ; %bb.2: ; %.demote 384; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] 385; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] 386; GFX10-64-NEXT: .LBB3_3: ; %.continue 387; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] 388; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 389; GFX10-64-NEXT: s_waitcnt vmcnt(0) 390; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 391; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] 392; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 393; GFX10-64-NEXT: s_waitcnt vmcnt(0) 394; GFX10-64-NEXT: s_branch .LBB3_5 395; GFX10-64-NEXT: .LBB3_4: 396; GFX10-64-NEXT: s_mov_b64 
exec, 0 397; GFX10-64-NEXT: exp null off, off, off, off done vm 398; GFX10-64-NEXT: s_endpgm 399; GFX10-64-NEXT: .LBB3_5: 400.entry: 401 %z.cmp = fcmp olt float %z, 0.0 402 br i1 %z.cmp, label %.continue, label %.demote 403 404.demote: 405 call void @llvm.amdgcn.wqm.demote(i1 false) 406 br label %.continue 407 408.continue: 409 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 410 %tex0 = extractelement <4 x float> %tex, i32 0 411 %tex1 = extractelement <4 x float> %tex, i32 0 412 %coord1 = fadd float %tex0, %tex1 413 %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 414 415 ret <4 x float> %rtex 416} 417 418define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { 419; SI-LABEL: wqm_demote_2: 420; SI: ; %bb.0: ; %.entry 421; SI-NEXT: s_mov_b64 s[12:13], exec 422; SI-NEXT: s_wqm_b64 exec, exec 423; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 424; SI-NEXT: s_waitcnt vmcnt(0) 425; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 426; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc 427; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 428; SI-NEXT: s_cbranch_execz .LBB4_3 429; SI-NEXT: ; %bb.1: ; %.demote 430; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 431; SI-NEXT: s_cbranch_scc0 .LBB4_4 432; SI-NEXT: ; %bb.2: ; %.demote 433; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] 434; SI-NEXT: s_and_b64 exec, exec, s[16:17] 435; SI-NEXT: .LBB4_3: ; %.continue 436; SI-NEXT: s_or_b64 exec, exec, s[14:15] 437; SI-NEXT: v_add_f32_e32 v0, v0, v0 438; SI-NEXT: s_and_b64 exec, exec, s[12:13] 439; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 440; SI-NEXT: s_waitcnt vmcnt(0) 441; SI-NEXT: s_branch .LBB4_5 442; SI-NEXT: .LBB4_4: 443; SI-NEXT: s_mov_b64 exec, 0 444; SI-NEXT: exp null off, off, off, off 
done vm 445; SI-NEXT: s_endpgm 446; SI-NEXT: .LBB4_5: 447; 448; GFX9-LABEL: wqm_demote_2: 449; GFX9: ; %bb.0: ; %.entry 450; GFX9-NEXT: s_mov_b64 s[12:13], exec 451; GFX9-NEXT: s_wqm_b64 exec, exec 452; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 453; GFX9-NEXT: s_waitcnt vmcnt(0) 454; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 455; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc 456; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 457; GFX9-NEXT: s_cbranch_execz .LBB4_3 458; GFX9-NEXT: ; %bb.1: ; %.demote 459; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 460; GFX9-NEXT: s_cbranch_scc0 .LBB4_4 461; GFX9-NEXT: ; %bb.2: ; %.demote 462; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] 463; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] 464; GFX9-NEXT: .LBB4_3: ; %.continue 465; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] 466; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 467; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] 468; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 469; GFX9-NEXT: s_waitcnt vmcnt(0) 470; GFX9-NEXT: s_branch .LBB4_5 471; GFX9-NEXT: .LBB4_4: 472; GFX9-NEXT: s_mov_b64 exec, 0 473; GFX9-NEXT: exp null off, off, off, off done vm 474; GFX9-NEXT: s_endpgm 475; GFX9-NEXT: .LBB4_5: 476; 477; GFX10-32-LABEL: wqm_demote_2: 478; GFX10-32: ; %bb.0: ; %.entry 479; GFX10-32-NEXT: s_mov_b32 s12, exec_lo 480; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo 481; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 482; GFX10-32-NEXT: s_waitcnt vmcnt(0) 483; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 484; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo 485; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 486; GFX10-32-NEXT: s_cbranch_execz .LBB4_3 487; GFX10-32-NEXT: ; %bb.1: ; %.demote 488; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo 489; GFX10-32-NEXT: s_cbranch_scc0 .LBB4_4 490; GFX10-32-NEXT: ; %bb.2: ; %.demote 491; GFX10-32-NEXT: s_wqm_b32 s14, s12 492; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s14 493; GFX10-32-NEXT: .LBB4_3: ; %.continue 494; 
GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 495; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 496; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 497; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 498; GFX10-32-NEXT: s_waitcnt vmcnt(0) 499; GFX10-32-NEXT: s_branch .LBB4_5 500; GFX10-32-NEXT: .LBB4_4: 501; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 502; GFX10-32-NEXT: exp null off, off, off, off done vm 503; GFX10-32-NEXT: s_endpgm 504; GFX10-32-NEXT: .LBB4_5: 505; 506; GFX10-64-LABEL: wqm_demote_2: 507; GFX10-64: ; %bb.0: ; %.entry 508; GFX10-64-NEXT: s_mov_b64 s[12:13], exec 509; GFX10-64-NEXT: s_wqm_b64 exec, exec 510; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 511; GFX10-64-NEXT: s_waitcnt vmcnt(0) 512; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 513; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc 514; GFX10-64-NEXT: s_xor_b64 s[14:15], exec, s[14:15] 515; GFX10-64-NEXT: s_cbranch_execz .LBB4_3 516; GFX10-64-NEXT: ; %bb.1: ; %.demote 517; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec 518; GFX10-64-NEXT: s_cbranch_scc0 .LBB4_4 519; GFX10-64-NEXT: ; %bb.2: ; %.demote 520; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] 521; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] 522; GFX10-64-NEXT: .LBB4_3: ; %.continue 523; GFX10-64-NEXT: s_or_b64 exec, exec, s[14:15] 524; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 525; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] 526; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 527; GFX10-64-NEXT: s_waitcnt vmcnt(0) 528; GFX10-64-NEXT: s_branch .LBB4_5 529; GFX10-64-NEXT: .LBB4_4: 530; GFX10-64-NEXT: s_mov_b64 exec, 0 531; GFX10-64-NEXT: exp null off, off, off, off done vm 532; GFX10-64-NEXT: s_endpgm 533; GFX10-64-NEXT: .LBB4_5: 534.entry: 535 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 536 %tex0 = extractelement <4 x float> %tex, 
i32 0 537 %tex1 = extractelement <4 x float> %tex, i32 0 538 %z.cmp = fcmp olt float %tex0, 0.0 539 br i1 %z.cmp, label %.continue, label %.demote 540 541.demote: 542 call void @llvm.amdgcn.wqm.demote(i1 false) 543 br label %.continue 544 545.continue: 546 %coord1 = fadd float %tex0, %tex1 547 %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 548 549 ret <4 x float> %rtex 550} 551 552define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { 553; SI-LABEL: wqm_demote_dynamic: 554; SI: ; %bb.0: ; %.entry 555; SI-NEXT: s_mov_b64 s[12:13], exec 556; SI-NEXT: s_wqm_b64 exec, exec 557; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 558; SI-NEXT: s_waitcnt vmcnt(0) 559; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 560; SI-NEXT: s_andn2_b64 s[14:15], exec, vcc 561; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] 562; SI-NEXT: s_cbranch_scc0 .LBB5_2 563; SI-NEXT: ; %bb.1: ; %.entry 564; SI-NEXT: s_wqm_b64 s[14:15], s[12:13] 565; SI-NEXT: s_and_b64 exec, exec, s[14:15] 566; SI-NEXT: v_add_f32_e32 v0, v0, v0 567; SI-NEXT: s_and_b64 exec, exec, s[12:13] 568; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 569; SI-NEXT: s_waitcnt vmcnt(0) 570; SI-NEXT: s_branch .LBB5_3 571; SI-NEXT: .LBB5_2: 572; SI-NEXT: s_mov_b64 exec, 0 573; SI-NEXT: exp null off, off, off, off done vm 574; SI-NEXT: s_endpgm 575; SI-NEXT: .LBB5_3: 576; 577; GFX9-LABEL: wqm_demote_dynamic: 578; GFX9: ; %bb.0: ; %.entry 579; GFX9-NEXT: s_mov_b64 s[12:13], exec 580; GFX9-NEXT: s_wqm_b64 exec, exec 581; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 582; GFX9-NEXT: s_waitcnt vmcnt(0) 583; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 584; GFX9-NEXT: s_andn2_b64 s[14:15], exec, vcc 585; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], s[14:15] 586; GFX9-NEXT: s_cbranch_scc0 .LBB5_2 587; GFX9-NEXT: ; 
%bb.1: ; %.entry 588; GFX9-NEXT: s_wqm_b64 s[14:15], s[12:13] 589; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] 590; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 591; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] 592; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf 593; GFX9-NEXT: s_waitcnt vmcnt(0) 594; GFX9-NEXT: s_branch .LBB5_3 595; GFX9-NEXT: .LBB5_2: 596; GFX9-NEXT: s_mov_b64 exec, 0 597; GFX9-NEXT: exp null off, off, off, off done vm 598; GFX9-NEXT: s_endpgm 599; GFX9-NEXT: .LBB5_3: 600; 601; GFX10-32-LABEL: wqm_demote_dynamic: 602; GFX10-32: ; %bb.0: ; %.entry 603; GFX10-32-NEXT: s_mov_b32 s12, exec_lo 604; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo 605; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 606; GFX10-32-NEXT: s_waitcnt vmcnt(0) 607; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0 608; GFX10-32-NEXT: s_andn2_b32 s13, exec_lo, vcc_lo 609; GFX10-32-NEXT: s_andn2_b32 s12, s12, s13 610; GFX10-32-NEXT: s_cbranch_scc0 .LBB5_2 611; GFX10-32-NEXT: ; %bb.1: ; %.entry 612; GFX10-32-NEXT: s_wqm_b32 s13, s12 613; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s13 614; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 615; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 616; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 617; GFX10-32-NEXT: s_waitcnt vmcnt(0) 618; GFX10-32-NEXT: s_branch .LBB5_3 619; GFX10-32-NEXT: .LBB5_2: 620; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 621; GFX10-32-NEXT: exp null off, off, off, off done vm 622; GFX10-32-NEXT: s_endpgm 623; GFX10-32-NEXT: .LBB5_3: 624; 625; GFX10-64-LABEL: wqm_demote_dynamic: 626; GFX10-64: ; %bb.0: ; %.entry 627; GFX10-64-NEXT: s_mov_b64 s[12:13], exec 628; GFX10-64-NEXT: s_wqm_b64 exec, exec 629; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D 630; GFX10-64-NEXT: s_waitcnt vmcnt(0) 631; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0 632; GFX10-64-NEXT: s_andn2_b64 s[14:15], exec, vcc 633; GFX10-64-NEXT: s_andn2_b64 s[12:13], 
s[12:13], s[14:15] 634; GFX10-64-NEXT: s_cbranch_scc0 .LBB5_2 635; GFX10-64-NEXT: ; %bb.1: ; %.entry 636; GFX10-64-NEXT: s_wqm_b64 s[14:15], s[12:13] 637; GFX10-64-NEXT: s_and_b64 exec, exec, s[14:15] 638; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 639; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] 640; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D 641; GFX10-64-NEXT: s_waitcnt vmcnt(0) 642; GFX10-64-NEXT: s_branch .LBB5_3 643; GFX10-64-NEXT: .LBB5_2: 644; GFX10-64-NEXT: s_mov_b64 exec, 0 645; GFX10-64-NEXT: exp null off, off, off, off done vm 646; GFX10-64-NEXT: s_endpgm 647; GFX10-64-NEXT: .LBB5_3: 648.entry: 649 %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 650 %tex0 = extractelement <4 x float> %tex, i32 0 651 %tex1 = extractelement <4 x float> %tex, i32 0 652 %z.cmp = fcmp olt float %tex0, 0.0 653 call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) 654 %coord1 = fadd float %tex0, %tex1 655 %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 656 657 ret <4 x float> %rtex 658} 659 660 661define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { 662; SI-LABEL: wqm_deriv: 663; SI: ; %bb.0: ; %.entry 664; SI-NEXT: s_mov_b64 s[0:1], exec 665; SI-NEXT: s_wqm_b64 exec, exec 666; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 667; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 668; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc 669; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 670; SI-NEXT: s_cbranch_execz .LBB6_3 671; SI-NEXT: ; %bb.1: ; %.demote0 672; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 673; SI-NEXT: s_cbranch_scc0 .LBB6_7 674; SI-NEXT: ; %bb.2: ; %.demote0 675; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] 676; SI-NEXT: s_and_b64 exec, exec, s[4:5] 677; SI-NEXT: .LBB6_3: ; %.continue0 678; SI-NEXT: s_or_b64 exec, exec, s[2:3] 679; SI-NEXT: s_mov_b64 s[2:3], s[0:1] 
680; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] 681; SI-NEXT: v_mov_b32_e32 v1, v0 682; SI-NEXT: s_xor_b64 s[2:3], s[0:1], -1 683; SI-NEXT: s_nop 0 684; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 685; SI-NEXT: s_nop 1 686; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 687; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 688; SI-NEXT: s_and_b64 exec, exec, s[0:1] 689; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 690; SI-NEXT: s_or_b64 s[2:3], s[2:3], vcc 691; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] 692; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] 693; SI-NEXT: s_cbranch_execz .LBB6_6 694; SI-NEXT: ; %bb.4: ; %.demote1 695; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 696; SI-NEXT: s_cbranch_scc0 .LBB6_7 697; SI-NEXT: ; %bb.5: ; %.demote1 698; SI-NEXT: s_mov_b64 exec, 0 699; SI-NEXT: .LBB6_6: ; %.continue1 700; SI-NEXT: s_or_b64 exec, exec, s[2:3] 701; SI-NEXT: v_bfrev_b32_e32 v0, 60 702; SI-NEXT: v_mov_b32_e32 v1, 0x3c00 703; SI-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm 704; SI-NEXT: s_endpgm 705; SI-NEXT: .LBB6_7: 706; SI-NEXT: s_mov_b64 exec, 0 707; SI-NEXT: exp null off, off, off, off done vm 708; SI-NEXT: s_endpgm 709; 710; GFX9-LABEL: wqm_deriv: 711; GFX9: ; %bb.0: ; %.entry 712; GFX9-NEXT: s_mov_b64 s[0:1], exec 713; GFX9-NEXT: s_wqm_b64 exec, exec 714; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 715; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 716; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc 717; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 718; GFX9-NEXT: s_cbranch_execz .LBB6_3 719; GFX9-NEXT: ; %bb.1: ; %.demote0 720; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 721; GFX9-NEXT: s_cbranch_scc0 .LBB6_7 722; GFX9-NEXT: ; %bb.2: ; %.demote0 723; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] 724; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] 725; GFX9-NEXT: .LBB6_3: ; %.continue0 726; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 727; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] 728; GFX9-NEXT: v_cndmask_b32_e64 
v0, 1.0, 0, s[2:3] 729; GFX9-NEXT: v_mov_b32_e32 v1, v0 730; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1 731; GFX9-NEXT: s_nop 0 732; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 733; GFX9-NEXT: s_nop 1 734; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 735; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 736; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] 737; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 738; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], vcc 739; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] 740; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] 741; GFX9-NEXT: s_cbranch_execz .LBB6_6 742; GFX9-NEXT: ; %bb.4: ; %.demote1 743; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 744; GFX9-NEXT: s_cbranch_scc0 .LBB6_7 745; GFX9-NEXT: ; %bb.5: ; %.demote1 746; GFX9-NEXT: s_mov_b64 exec, 0 747; GFX9-NEXT: .LBB6_6: ; %.continue1 748; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] 749; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 750; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 751; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm 752; GFX9-NEXT: s_endpgm 753; GFX9-NEXT: .LBB6_7: 754; GFX9-NEXT: s_mov_b64 exec, 0 755; GFX9-NEXT: exp null off, off, off, off done vm 756; GFX9-NEXT: s_endpgm 757; 758; GFX10-32-LABEL: wqm_deriv: 759; GFX10-32: ; %bb.0: ; %.entry 760; GFX10-32-NEXT: s_mov_b32 s0, exec_lo 761; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo 762; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 763; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 764; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo 765; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 766; GFX10-32-NEXT: s_cbranch_execz .LBB6_3 767; GFX10-32-NEXT: ; %bb.1: ; %.demote0 768; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo 769; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_7 770; GFX10-32-NEXT: ; %bb.2: ; %.demote0 771; GFX10-32-NEXT: s_wqm_b32 s2, s0 772; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 773; GFX10-32-NEXT: .LBB6_3: ; %.continue0 774; GFX10-32-NEXT: s_or_b32 exec_lo, 
exec_lo, s1 775; GFX10-32-NEXT: s_mov_b32 s1, s0 776; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 777; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 778; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 779; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 780; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 781; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 782; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 783; GFX10-32-NEXT: s_xor_b32 s1, s0, -1 784; GFX10-32-NEXT: s_or_b32 s1, s1, vcc_lo 785; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 786; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 787; GFX10-32-NEXT: s_cbranch_execz .LBB6_6 788; GFX10-32-NEXT: ; %bb.4: ; %.demote1 789; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo 790; GFX10-32-NEXT: s_cbranch_scc0 .LBB6_7 791; GFX10-32-NEXT: ; %bb.5: ; %.demote1 792; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 793; GFX10-32-NEXT: .LBB6_6: ; %.continue1 794; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 795; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 796; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 797; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm 798; GFX10-32-NEXT: s_endpgm 799; GFX10-32-NEXT: .LBB6_7: 800; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 801; GFX10-32-NEXT: exp null off, off, off, off done vm 802; GFX10-32-NEXT: s_endpgm 803; 804; GFX10-64-LABEL: wqm_deriv: 805; GFX10-64: ; %bb.0: ; %.entry 806; GFX10-64-NEXT: s_mov_b64 s[0:1], exec 807; GFX10-64-NEXT: s_wqm_b64 exec, exec 808; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 809; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 810; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc 811; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] 812; GFX10-64-NEXT: s_cbranch_execz .LBB6_3 813; GFX10-64-NEXT: ; %bb.1: ; %.demote0 814; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 815; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_7 816; GFX10-64-NEXT: ; %bb.2: ; %.demote0 817; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] 
818; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] 819; GFX10-64-NEXT: .LBB6_3: ; %.continue0 820; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] 821; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] 822; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] 823; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 824; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 825; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 826; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 827; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] 828; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 829; GFX10-64-NEXT: s_xor_b64 s[2:3], s[0:1], -1 830; GFX10-64-NEXT: s_or_b64 s[2:3], s[2:3], vcc 831; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] 832; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] 833; GFX10-64-NEXT: s_cbranch_execz .LBB6_6 834; GFX10-64-NEXT: ; %bb.4: ; %.demote1 835; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec 836; GFX10-64-NEXT: s_cbranch_scc0 .LBB6_7 837; GFX10-64-NEXT: ; %bb.5: ; %.demote1 838; GFX10-64-NEXT: s_mov_b64 exec, 0 839; GFX10-64-NEXT: .LBB6_6: ; %.continue1 840; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] 841; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 842; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 843; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm 844; GFX10-64-NEXT: s_endpgm 845; GFX10-64-NEXT: .LBB6_7: 846; GFX10-64-NEXT: s_mov_b64 exec, 0 847; GFX10-64-NEXT: exp null off, off, off, off done vm 848; GFX10-64-NEXT: s_endpgm 849.entry: 850 %p0 = extractelement <2 x float> %input, i32 0 851 %p1 = extractelement <2 x float> %input, i32 1 852 %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 0, i32 0, i32 %index) #2 853 %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 0, i32 0, i32 %index) #2 854 %argi = fptosi float %arg to i32 855 %cond0 = icmp eq i32 %argi, 0 856 br i1 %cond0, label %.continue0, label %.demote0 857 858.demote0: 859 call void 
@llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue0

.continue0:
  %live = call i1 @llvm.amdgcn.live.mask()
  %live.cond = select i1 %live, i32 0, i32 1065353216
  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
  %live.v0f = bitcast i32 %live.v0 to float
  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
  %live.v1f = bitcast i32 %live.v1 to float
  %v0 = fsub float %live.v0f, %live.v1f
  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
  %cond2 = and i1 %live, %cond1
  br i1 %cond2, label %.continue1, label %.demote1

.demote1:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue1

.continue1:
  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 true, i1 true) #3
  ret void
}

; Loop variant of @wqm_deriv: one wqm.demote(i1 false) ahead of the loop
; (reached when fptosi(%arg) is non-zero) and one inside the loop body.
; Each iteration reads llvm.amdgcn.live.mask, feeds "0 if live, else %count"
; through two mov.dpp reads (dpp_ctrl 85 and 0), subtracts them, wraps the
; difference in llvm.amdgcn.wqm.f32 and demotes unless the lane is live and
; the difference compares oeq 0.0. The loop repeats while %count + 1 is
; signed-less-than %limit; afterwards the function exports and returns.
; The check lines below are autogenerated (see the first line of this file);
; regenerate them with the update script instead of editing them by hand.
define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) {
; SI-LABEL: wqm_deriv_loop:
; SI: ; %bb.0: ; %.entry
; SI-NEXT: s_mov_b64 s[0:1], exec
; SI-NEXT: s_wqm_b64 exec, exec
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; SI-NEXT: s_cbranch_execz .LBB7_3
; SI-NEXT: ; %bb.1: ; %.demote0
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; SI-NEXT: s_cbranch_scc0 .LBB7_9
; SI-NEXT: ; %bb.2: ; %.demote0
; SI-NEXT: s_wqm_b64 s[4:5], s[0:1]
; SI-NEXT: s_and_b64 exec, exec, s[4:5]
; SI-NEXT: .LBB7_3: ; %.continue0.preheader
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
; SI-NEXT: s_mov_b64 s[2:3], 0
; SI-NEXT: s_branch .LBB7_5
; SI-NEXT: .LBB7_4: ; %.continue1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
; SI-NEXT: s_add_i32 s6, s6, 1
; SI-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1
; SI-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; SI-NEXT: s_andn2_b64 exec, exec, s[2:3]
; SI-NEXT: s_cbranch_execz .LBB7_8
; SI-NEXT: .LBB7_5: ; %.continue0
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v2, v0
; SI-NEXT: s_xor_b64 s[4:5], s[0:1], -1
; SI-NEXT: s_nop 0
; SI-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT: s_nop 1
; SI-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; SI-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; SI-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; SI-NEXT: s_cbranch_execz .LBB7_4
; SI-NEXT: ; %bb.6: ; %.demote1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; SI-NEXT: s_cbranch_scc0 .LBB7_9
; SI-NEXT: ; %bb.7: ; %.demote1
; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
; SI-NEXT: s_wqm_b64 s[8:9], s[0:1]
; SI-NEXT: s_and_b64 exec, exec, s[8:9]
; SI-NEXT: s_branch .LBB7_4
; SI-NEXT: .LBB7_8: ; %.return
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
; SI-NEXT: s_and_b64 exec, exec, s[0:1]
; SI-NEXT: v_bfrev_b32_e32 v0, 60
; SI-NEXT: v_mov_b32_e32 v1, 0x3c00
; SI-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm
; SI-NEXT: s_endpgm
; SI-NEXT: .LBB7_9:
; SI-NEXT: s_mov_b64 exec, 0
; SI-NEXT: exp null off, off, off, off done vm
; SI-NEXT: s_endpgm
;
; GFX9-LABEL: wqm_deriv_loop:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_mov_b64 s[0:1], exec
; GFX9-NEXT: s_wqm_b64 exec, exec
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s6, 0
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX9-NEXT: s_cbranch_execz .LBB7_3
; GFX9-NEXT: ; %bb.1: ; %.demote0
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX9-NEXT: s_cbranch_scc0 .LBB7_9
; GFX9-NEXT: ; %bb.2: ; %.demote0
; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1]
; GFX9-NEXT: s_and_b64 exec, exec, s[4:5]
; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_branch .LBB7_5
; GFX9-NEXT: .LBB7_4: ; %.continue1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_add_i32 s6, s6, 1
; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1
; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execz .LBB7_8
; GFX9-NEXT: .LBB7_5: ; %.continue0
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], -1
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: s_nop 1
; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GFX9-NEXT: s_cbranch_execz .LBB7_4
; GFX9-NEXT: ; %bb.6: ; %.demote1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX9-NEXT: s_cbranch_scc0 .LBB7_9
; GFX9-NEXT: ; %bb.7: ; %.demote1
; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX9-NEXT: s_wqm_b64 s[8:9], s[0:1]
; GFX9-NEXT: s_and_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_branch .LBB7_4
; GFX9-NEXT: .LBB7_8: ; %.return
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX9-NEXT: v_bfrev_b32_e32 v1, 60
; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB7_9:
; GFX9-NEXT: s_mov_b64 exec, 0
; GFX9-NEXT: exp null off, off, off, off done vm
; GFX9-NEXT: s_endpgm
;
; GFX10-32-LABEL: wqm_deriv_loop:
; GFX10-32: ; %bb.0: ; %.entry
; GFX10-32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-32-NEXT: s_mov_b32 s1, 0
; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2
; GFX10-32-NEXT: s_cbranch_execz .LBB7_3
; GFX10-32-NEXT: ; %bb.1: ; %.demote0
; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-32-NEXT: ; %bb.2: ; %.demote0
; GFX10-32-NEXT: s_wqm_b32 s3, s0
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
; GFX10-32-NEXT: s_mov_b32 s2, 0
; GFX10-32-NEXT: s_branch .LBB7_5
; GFX10-32-NEXT: .LBB7_4: ; %.continue1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX10-32-NEXT: s_add_i32 s2, s2, 1
; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s2, v1
; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_cbranch_execz .LBB7_8
; GFX10-32-NEXT: .LBB7_5: ; %.continue0
; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-32-NEXT: s_mov_b32 s3, s0
; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s3
; GFX10-32-NEXT: s_xor_b32 s3, s0, -1
; GFX10-32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-32-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: s_or_b32 s3, s3, vcc_lo
; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3
; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4
; GFX10-32-NEXT: s_cbranch_execz .LBB7_4
; GFX10-32-NEXT: ; %bb.6: ; %.demote1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-32-NEXT: ; %bb.7: ; %.demote1
; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-32-NEXT: s_wqm_b32 s4, s0
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4
; GFX10-32-NEXT: s_branch .LBB7_4
; GFX10-32-NEXT: .LBB7_8: ; %.return
; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60
; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; GFX10-32-NEXT: s_endpgm
; GFX10-32-NEXT: .LBB7_9:
; GFX10-32-NEXT: s_mov_b32 exec_lo, 0
; GFX10-32-NEXT: exp null off, off, off, off done vm
; GFX10-32-NEXT: s_endpgm
;
; GFX10-64-LABEL: wqm_deriv_loop:
; GFX10-64: ; %bb.0: ; %.entry
; GFX10-64-NEXT: s_mov_b64 s[0:1], exec
; GFX10-64-NEXT: s_wqm_b64 exec, exec
; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-64-NEXT: s_mov_b32 s6, 0
; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; GFX10-64-NEXT: s_cbranch_execz .LBB7_3
; GFX10-64-NEXT: ; %bb.1: ; %.demote0
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-64-NEXT: ; %bb.2: ; %.demote0
; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5]
; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader
; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX10-64-NEXT: s_mov_b64 s[2:3], 0
; GFX10-64-NEXT: s_branch .LBB7_5
; GFX10-64-NEXT: .LBB7_4: ; %.continue1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX10-64-NEXT: s_add_i32 s6, s6, 1
; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1
; GFX10-64-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX10-64-NEXT: s_cbranch_execz .LBB7_8
; GFX10-64-NEXT: .LBB7_5: ; %.continue0
; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s6, 0, s[4:5]
; GFX10-64-NEXT: s_xor_b64 s[4:5], s[0:1], -1
; GFX10-64-NEXT: v_mov_b32_e32 v2, v0
; GFX10-64-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0
; GFX10-64-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[8:9]
; GFX10-64-NEXT: s_cbranch_execz .LBB7_4
; GFX10-64-NEXT: ; %bb.6: ; %.demote1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9
; GFX10-64-NEXT: ; %bb.7: ; %.demote1
; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
; GFX10-64-NEXT: s_wqm_b64 s[8:9], s[0:1]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9]
; GFX10-64-NEXT: s_branch .LBB7_4
; GFX10-64-NEXT: .LBB7_8: ; %.return
; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60
; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
; GFX10-64-NEXT: s_endpgm
; GFX10-64-NEXT: .LBB7_9:
; GFX10-64-NEXT: s_mov_b64 exec, 0
; GFX10-64-NEXT: exp null off, off, off, off done vm
; GFX10-64-NEXT: s_endpgm
.entry:
  %p0 = extractelement <2 x float> %input, i32 0
  %p1 = extractelement <2 x float> %input, i32 1
  ; %x1 has no users in this function, and no interp instruction appears in
  ; the expected output above.
  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 0, i32 0, i32 %index) #2
  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 0, i32 0, i32 %index) #2
  ; First demote: reached only when fptosi(%arg) is non-zero.
  %argi = fptosi float %arg to i32
  %cond0 = icmp eq i32 %argi, 0
  br i1 %cond0, label %.continue0, label %.demote0

.demote0:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue0

.continue0:
  ; Loop header: %count is 0 on entry from .entry or .demote0 and %next on
  ; the back edge from .continue1.
  %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ]
  %live = call i1 @llvm.amdgcn.live.mask()
  ; Lanes reported live contribute 0; the others contribute the current %count.
  %live.cond = select i1 %live, i32 0, i32 %count
  ; dpp_ctrl 85 and 0 show up as quad_perm:[1,1,1,1] and quad_perm:[0,0,0,0]
  ; in the expected output above.
  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
  %live.v0f = bitcast i32 %live.v0 to float
  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
  %live.v1f = bitcast i32 %live.v1 to float
  %v0 = fsub float %live.v0f, %live.v1f
  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
  ; Skip the second demote only when the lane is live and the difference is 0.0.
  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
  %cond2 = and i1 %live, %cond1
  br i1 %cond2, label %.continue1, label %.demote1

.demote1:
  call void @llvm.amdgcn.wqm.demote(i1 false)
  br label %.continue1

.continue1:
  ; Loop latch: continue while the incremented counter is signed-less-than %limit.
  %next = add i32 %count, 1
  %loop.cond = icmp slt i32 %next, %limit
  br i1 %loop.cond, label %.continue0, label %.return

.return:
  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 true, i1 true) #3
  ret void
}

; Demote with a constant-true condition. The expected output for every
; checked configuration is only the compare, select, export and s_endpgm:
; no exec-mask update and no early-exit block is emitted for the demote.
define amdgpu_ps void @static_exact_nop(float %arg0, float %arg1) {
; SI-LABEL: static_exact_nop:
; SI: ; %bb.0: ; %.entry
; SI-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm
; SI-NEXT: s_endpgm
;
; GFX9-LABEL: static_exact_nop:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm
; GFX9-NEXT: s_endpgm
;
; GFX10-32-LABEL: static_exact_nop:
; GFX10-32: ; %bb.0: ; %.entry
; GFX10-32-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v0
; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm
; GFX10-32-NEXT: s_endpgm
;
; GFX10-64-LABEL: static_exact_nop:
; GFX10-64: ; %bb.0: ; %.entry
; GFX10-64-NEXT: v_cmp_gt_f32_e32 vcc, 0, v0
; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm
; GFX10-64-NEXT: s_endpgm
.entry:
  %c0 = fcmp olt float %arg0, 0.000000e+00
  ; %c1 has no users in this function.
  %c1 = fcmp oge float %arg1, 0.0
  call void @llvm.amdgcn.wqm.demote(i1 true)
  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
  ret void
}


; Intrinsic declarations for this test file.
declare void @llvm.amdgcn.wqm.demote(i1) #0
declare i1 @llvm.amdgcn.live.mask() #0
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1,
i32, i32) #1
declare float @llvm.amdgcn.wqm.f32(float) #1
declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4

; Attribute groups referenced by the #N suffixes on the declarations above.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind readnone speculatable }
attributes #3 = { inaccessiblememonly nounwind }
attributes #4 = { convergent nounwind readnone }