; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032,GFX10DEFWAVE %s

; Every function below carries two autogenerated assertion groups: one for the
; runs that enable wavefrontsize32 (or use the gfx1010 default) and one for the
; runs that enable wavefrontsize64. The groups differ mainly in the width of
; the lane-mask registers (vcc_lo/exec_lo/single SGPR versus vcc/exec/SGPR pair).

; Signed i32 compare against zero feeding a select between two constants.
; The expected code is a VOPC compare into the condition register followed by
; v_cndmask_b32.
define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) {
; GFX1032-LABEL: test_vopc_i32:
; GFX1032:       ; %bb.0:
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0, v1
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc_lo
; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1032-NEXT:    s_endpgm
;
; GFX1064-LABEL: test_vopc_i32:
; GFX1064:       ; %bb.0:
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v1
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc
; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1064-NEXT:    s_endpgm
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
  %load = load i32, ptr addrspace(1) %gep, align 4
  %cmp = icmp sgt i32 %load, 0
  %sel = select i1 %cmp, i32 1, i32 2
  store i32 %sel, ptr addrspace(1) %gep, align 4
  ret void
}

; Unordered float compare (fcmp ugt) against 0.0 feeding a select between two
; float constants. The assertions expect v_cmp_nge_f32 with the operands
; swapped, followed by v_cndmask_b32.
define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) {
; GFX1032-LABEL: test_vopc_f32:
; GFX1032:       ; %bb.0:
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_cmp_nge_f32_e32 vcc_lo, 0, v1
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 2.0, 1.0, vcc_lo
; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1032-NEXT:    s_endpgm
;
; GFX1064-LABEL: test_vopc_f32:
; GFX1064:       ; %bb.0:
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_cmp_nge_f32_e32 vcc, 0, v1
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 2.0, 1.0, vcc
; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1064-NEXT:    s_endpgm
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid
  %load = load float, ptr addrspace(1) %gep, align 4
  %cmp = fcmp ugt float %load, 0.0
  %sel = select i1 %cmp, float 1.0, float 2.0
  store float %sel, ptr addrspace(1) %gep, align 4
  ret void
}

; llvm.amdgcn.kill driven by a float compare in an amdgpu_ps function. The
; assertions expect the compare result to be removed from the exec mask with
; s_andn2 (32-bit or 64-bit form), with a null export on the all-lanes-dead path.
define amdgpu_ps void @test_vopc_vcmp(float %x) {
; GFX1032-LABEL: test_vopc_vcmp:
; GFX1032:       ; %bb.0:
; GFX1032-NEXT:    v_cmp_nle_f32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
; GFX1032-NEXT:    s_cbranch_scc0 .LBB2_1
; GFX1032-NEXT:    s_endpgm
; GFX1032-NEXT:  .LBB2_1:
; GFX1032-NEXT:    s_mov_b32 exec_lo, 0
; GFX1032-NEXT:    exp null off, off, off, off done vm
; GFX1032-NEXT:    s_endpgm
;
; GFX1064-LABEL: test_vopc_vcmp:
; GFX1064:       ; %bb.0:
; GFX1064-NEXT:    v_cmp_nle_f32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_andn2_b64 exec, exec, vcc
; GFX1064-NEXT:    s_cbranch_scc0 .LBB2_1
; GFX1064-NEXT:    s_endpgm
; GFX1064-NEXT:  .LBB2_1:
; GFX1064-NEXT:    s_mov_b64 exec, 0
; GFX1064-NEXT:    exp null off, off, off, off done vm
; GFX1064-NEXT:    s_endpgm
  %cmp = fcmp oge float %x, 0.0
  call void @llvm.amdgcn.kill(i1 %cmp)
  ret void
}

; Compare on element 1 of a loaded <2 x half>, selecting between a constant
; vector and the loaded value. The assertions expect an SDWA compare that reads
; WORD_1 of the loaded dword.
define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) {
; GFX1032-LABEL: test_vopc_2xf16:
; GFX1032:       ; %bb.0:
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_cmp_le_f16_sdwa vcc_lo, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
; GFX1032-NEXT:    v_cndmask_b32_e32 v1, 0x3c003c00, v1, vcc_lo
; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1032-NEXT:    s_endpgm
;
; GFX1064-LABEL: test_vopc_2xf16:
; GFX1064:       ; %bb.0:
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
; GFX1064-NEXT:    v_cndmask_b32_e32 v1, 0x3c003c00, v1, vcc
; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1064-NEXT:    s_endpgm
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %lid
  %load = load <2 x half>, ptr addrspace(1) %gep, align 4
  %elt = extractelement <2 x half> %load, i32 1
  %cmp = fcmp ugt half %elt, 0.0
  %sel = select i1 %cmp, <2 x half> <half 1.0, half 1.0>, <2 x half> %load
  store <2 x half> %sel, ptr addrspace(1) %gep, align 4
  ret void
}

; fabs followed by an ordered-equal compare against +infinity. The assertions
; expect a single v_cmp_class_f32 with mask 0x204 whose result is written to one
; SGPR in the wave32 group and to an SGPR pair in the wave64 group.
define amdgpu_kernel void @test_vopc_class(ptr addrspace(1) %out, float %x) #0 {
; GFX1032-LABEL: test_vopc_class:
; GFX1032:       ; %bb.0:
; GFX1032-NEXT:    s_clause 0x1
; GFX1032-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    v_cmp_class_f32_e64 s2, s2, 0x204
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
; GFX1032-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1032-NEXT:    s_endpgm
;
; GFX1064-LABEL: test_vopc_class:
; GFX1064:       ; %bb.0:
; GFX1064-NEXT:    s_clause 0x1
; GFX1064-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    v_cmp_class_f32_e64 s[2:3], s2, 0x204
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; GFX1064-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX1064-NEXT:    s_endpgm
  %fabs = tail call float @llvm.fabs.f32(float %x)
  %cmp = fcmp oeq float %fabs, 0x7FF0000000000000
  %ext = zext i1 %cmp to i32
  store i32 %ext, ptr addrspace(1) %out, align 4
  ret void
}

; half compare against infinity feeding a select between 1.0 and the input.
; The assertions expect v_cmp_neq_f16 against 0x7c00 followed by v_cndmask_b32
; and a 16-bit store.
define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 {
; GFX1032-LABEL: test_vcmp_vcnd_f16:
; GFX1032:       ; %bb.0:
; GFX1032-NEXT:    s_clause 0x1
; GFX1032-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    v_mov_b32_e32 v0, s2
; GFX1032-NEXT:    v_cmp_neq_f16_e64 vcc_lo, 0x7c00, s2
; GFX1032-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v0, vcc_lo
; GFX1032-NEXT:    global_store_short v1, v0, s[0:1]
; GFX1032-NEXT:    s_endpgm
;
; GFX1064-LABEL: test_vcmp_vcnd_f16:
; GFX1064:       ; %bb.0:
; GFX1064-NEXT:    s_clause 0x1
; GFX1064-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    v_mov_b32_e32 v0, s2
; GFX1064-NEXT:    v_cmp_neq_f16_e64 vcc, 0x7c00, s2
; GFX1064-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v0, vcc
; GFX1064-NEXT:    global_store_short v1, v0, s[0:1]
; GFX1064-NEXT:    s_endpgm
  %cmp = fcmp oeq half %x, 0x7FF0000000000000
  %sel = select i1 %cmp, half 1.0, half %x
  store half %sel, ptr addrspace(1) %out, align 2
  ret void
}

; Two float compares combined with `and` before a select. The assertions expect
; the second compare in VOP3 form writing an SGPR (pair) and the two lane masks
; merged with s_and_b32 / s_and_b64.
define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) {
; GFX1032-LABEL: test_vop3_cmp_f32_sop_and:
; GFX1032:       ; %bb.0:
; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_cmp_nge_f32_e32 vcc_lo, 0, v1
; GFX1032-NEXT:    v_cmp_nle_f32_e64 s0, 1.0, v1
; GFX1032-NEXT:    s_and_b32 s0, vcc_lo, s0
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 2.0, 1.0, s0
; GFX1032-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX1032-NEXT:    s_endpgm
;
; GFX1064-LABEL: test_vop3_cmp_f32_sop_and:
; GFX1064:       ; %bb.0:
; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_cmp_nge_f32_e32 vcc, 0, v1
; GFX1064-NEXT:    v_cmp_nle_f32_e64 s[0:1], 1.0, v1
; GFX1064-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 2.0, 1.0, s[0:1]
; GFX1064-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX1064-NEXT:    s_endpgm
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid
  %load = load float, ptr addrspace(1) %gep, align 4
  %cmp = fcmp ugt float %load, 0.0
  %cmp2 = fcmp ult float %load, 1.0
  %and = and i1 %cmp, %cmp2
  %sel = select i1 %and, float 1.0, float 2.0
  store float %sel, ptr addrspace(1) %gep, align 4
  ret void
}

; Two signed i32 compares combined with `xor` before a select. The assertions
; expect the lane masks merged with s_xor_b32 / s_xor_b64.
define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) {
; GFX1032-LABEL: test_vop3_cmp_i32_sop_xor:
; GFX1032:       ; %bb.0:
; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0, v1
; GFX1032-NEXT:    v_cmp_gt_i32_e64 s0, 1, v1
; GFX1032-NEXT:    s_xor_b32 s0, vcc_lo, s0
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 2, 1, s0
; GFX1032-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX1032-NEXT:    s_endpgm
;
; GFX1064-LABEL: test_vop3_cmp_i32_sop_xor:
; GFX1064:       ; %bb.0:
; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v1
; GFX1064-NEXT:    v_cmp_gt_i32_e64 s[0:1], 1, v1
; GFX1064-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 2, 1, s[0:1]
; GFX1064-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX1064-NEXT:    s_endpgm
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
  %load = load i32, ptr addrspace(1) %gep, align 4
  %cmp = icmp sgt i32 %load, 0
  %cmp2 = icmp slt i32 %load, 1
  %xor = xor i1 %cmp, %cmp2
  %sel = select i1 %xor, i32 1, i32 2
  store i32 %sel, ptr addrspace(1) %gep, align 4
  ret void
}

; Two unsigned i32 compares combined with `or` before a select. The assertions
; expect the lane masks merged with s_or_b32 / s_or_b64.
define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) {
; GFX1032-LABEL: test_vop3_cmp_u32_sop_or:
; GFX1032:       ; %bb.0:
; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX1032-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 3, v1
; GFX1032-NEXT:    v_cmp_gt_u32_e64 s0, 2, v1
; GFX1032-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 2, 1, s0
; GFX1032-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX1032-NEXT:    s_endpgm
;
; GFX1064-LABEL: test_vop3_cmp_u32_sop_or:
; GFX1064:       ; %bb.0:
; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX1064-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_cmp_lt_u32_e32 vcc, 3, v1
; GFX1064-NEXT:    v_cmp_gt_u32_e64 s[0:1], 2, v1
; GFX1064-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 2, 1, s[0:1]
; GFX1064-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX1064-NEXT:    s_endpgm
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
  %load = load i32, ptr addrspace(1) %gep, align 4
  %cmp = icmp ugt i32 %load, 3
  %cmp2 = icmp ult i32 %load, 2
  %or = or i1 %cmp, %cmp2
  %sel = select i1 %or, i32 1, i32 2
  store i32 %sel, ptr addrspace(1) %gep, align 4
  ret void
}

; Conditional branch on a compare of the workitem id. The assertions expect the
; branch to be lowered to s_and_saveexec (32-bit or 64-bit form) followed by
; s_cbranch_execz around the %if block.
define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 {
; GFX1032-LABEL: test_mask_if:
; GFX1032:       ; %bb.0:
; GFX1032-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 10, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB9_2
; GFX1032-NEXT:  ; %bb.1: ; %if
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    global_store_dword v0, v0, s[0:1]
; GFX1032-NEXT:  .LBB9_2: ; %endif
; GFX1032-NEXT:    s_endpgm
;
; GFX1064-LABEL: test_mask_if:
; GFX1064:       ; %bb.0:
; GFX1064-NEXT:    v_cmp_lt_u32_e32 vcc, 10, v0
; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB9_2
; GFX1064-NEXT:  ; %bb.1: ; %if
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    global_store_dword v0, v0, s[0:1]
; GFX1064-NEXT:  .LBB9_2: ; %endif
; GFX1064-NEXT:    s_endpgm
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %cmp = icmp ugt i32 %lid, 10
  br i1 %cmp, label %if, label %endif

if:
  store i32 0, ptr addrspace(1) %arg, align 4
  br label %endif

endif:
  ret void
}

; Loop whose body contains two nested conditional branches (%bb2 and %bb5) that
; rejoin in %bb13. The assertions expect all exec-mask bookkeeping to use 32-bit
; scalar operations on exec_lo in the wave32 group and 64-bit operations on exec
; in the wave64 group.
define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1032-LABEL: test_loop_with_if:
; GFX1032:       ; %bb.0: ; %bb
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
; GFX1032-NEXT:    s_mov_b32 s2, 0
; GFX1032-NEXT:    ; implicit-def: $vgpr2_vgpr3
; GFX1032-NEXT:    s_branch .LBB10_2
; GFX1032-NEXT:  .LBB10_1: ; %bb13
; GFX1032-NEXT:    ; in Loop: Header=BB10_2 Depth=1
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 0xfe, v4
; GFX1032-NEXT:    v_add_nc_u32_e32 v1, 1, v4
; GFX1032-NEXT:    s_or_b32 s2, vcc_lo, s2
; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT:    s_cbranch_execz .LBB10_8
; GFX1032-NEXT:  .LBB10_2: ; %bb2
; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT:    v_cmp_ge_i32_e64 s4, v1, v0
; GFX1032-NEXT:    s_mov_b32 s3, 0
; GFX1032-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v1, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s5, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB10_4
; GFX1032-NEXT:  ; %bb.3: ; %bb5
; GFX1032-NEXT:    ; in Loop: Header=BB10_2 Depth=1
; GFX1032-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
; GFX1032-NEXT:    s_andn2_b32 s4, s4, exec_lo
; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
; GFX1032-NEXT:    v_lshlrev_b64 v[2:3], 2, v[1:2]
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    v_add_co_u32 v2, vcc_lo, s0, v2
; GFX1032-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
; GFX1032-NEXT:    global_load_dword v4, v[2:3], off
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 11, v4
; GFX1032-NEXT:    s_and_b32 s6, vcc_lo, exec_lo
; GFX1032-NEXT:    s_or_b32 s4, s4, s6
; GFX1032-NEXT:  .LBB10_4: ; %Flow
; GFX1032-NEXT:    ; in Loop: Header=BB10_2 Depth=1
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s5
; GFX1032-NEXT:    ; implicit-def: $vgpr4
; GFX1032-NEXT:    s_and_saveexec_b32 s5, s4
; GFX1032-NEXT:    s_xor_b32 s4, exec_lo, s5
; GFX1032-NEXT:  ; %bb.5: ; %bb11
; GFX1032-NEXT:    ; in Loop: Header=BB10_2 Depth=1
; GFX1032-NEXT:    v_lshrrev_b32_e32 v4, 31, v1
; GFX1032-NEXT:    s_andn2_b32 s3, s3, exec_lo
; GFX1032-NEXT:    v_add_nc_u32_e32 v4, v1, v4
; GFX1032-NEXT:    v_ashrrev_i32_e32 v4, 1, v4
; GFX1032-NEXT:  ; %bb.6: ; %Flow1
; GFX1032-NEXT:    ; in Loop: Header=BB10_2 Depth=1
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT:    s_and_saveexec_b32 s4, s3
; GFX1032-NEXT:    s_cbranch_execz .LBB10_1
; GFX1032-NEXT:  ; %bb.7: ; %bb10
; GFX1032-NEXT:    ; in Loop: Header=BB10_2 Depth=1
; GFX1032-NEXT:    v_mov_b32_e32 v4, v1
; GFX1032-NEXT:    global_store_dword v[2:3], v0, off
; GFX1032-NEXT:    s_branch .LBB10_1
; GFX1032-NEXT:  .LBB10_8: ; %bb1
; GFX1032-NEXT:    s_endpgm
;
; GFX1064-LABEL: test_loop_with_if:
; GFX1064:       ; %bb.0: ; %bb
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
; GFX1064-NEXT:    ; implicit-def: $vgpr2_vgpr3
; GFX1064-NEXT:    s_branch .LBB10_2
; GFX1064-NEXT:  .LBB10_1: ; %bb13
; GFX1064-NEXT:    ; in Loop: Header=BB10_2 Depth=1
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX1064-NEXT:    v_cmp_lt_i32_e32 vcc, 0xfe, v4
; GFX1064-NEXT:    v_add_nc_u32_e32 v1, 1, v4
; GFX1064-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT:    s_cbranch_execz .LBB10_8
; GFX1064-NEXT:  .LBB10_2: ; %bb2
; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT:    v_cmp_ge_i32_e64 s[6:7], v1, v0
; GFX1064-NEXT:    v_cmp_lt_i32_e32 vcc, v1, v0
; GFX1064-NEXT:    s_mov_b64 s[4:5], 0
; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB10_4
; GFX1064-NEXT:  ; %bb.3: ; %bb5
; GFX1064-NEXT:    ; in Loop: Header=BB10_2 Depth=1
; GFX1064-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
; GFX1064-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
; GFX1064-NEXT:    v_lshlrev_b64 v[2:3], 2, v[1:2]
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    v_add_co_u32 v2, vcc, s0, v2
; GFX1064-NEXT:    v_add_co_ci_u32_e32 v3, vcc, s1, v3, vcc
; GFX1064-NEXT:    global_load_dword v4, v[2:3], off
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_cmp_gt_i32_e32 vcc, 11, v4
; GFX1064-NEXT:    s_and_b64 s[10:11], vcc, exec
; GFX1064-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
; GFX1064-NEXT:  .LBB10_4: ; %Flow
; GFX1064-NEXT:    ; in Loop: Header=BB10_2 Depth=1
; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
; GFX1064-NEXT:    ; implicit-def: $vgpr4
; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
; GFX1064-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
; GFX1064-NEXT:  ; %bb.5: ; %bb11
; GFX1064-NEXT:    ; in Loop: Header=BB10_2 Depth=1
; GFX1064-NEXT:    v_lshrrev_b32_e32 v4, 31, v1
; GFX1064-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
; GFX1064-NEXT:    v_add_nc_u32_e32 v4, v1, v4
; GFX1064-NEXT:    v_ashrrev_i32_e32 v4, 1, v4
; GFX1064-NEXT:  ; %bb.6: ; %Flow1
; GFX1064-NEXT:    ; in Loop: Header=BB10_2 Depth=1
; GFX1064-NEXT:    s_or_b64 exec, exec, s[6:7]
; GFX1064-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
; GFX1064-NEXT:    s_cbranch_execz .LBB10_1
; GFX1064-NEXT:  ; %bb.7: ; %bb10
; GFX1064-NEXT:    ; in Loop: Header=BB10_2 Depth=1
; GFX1064-NEXT:    v_mov_b32_e32 v4, v1
; GFX1064-NEXT:    global_store_dword v[2:3], v0, off
; GFX1064-NEXT:    s_branch .LBB10_1
; GFX1064-NEXT:  .LBB10_8: ; %bb1
; GFX1064-NEXT:    s_endpgm
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  br label %bb2

bb1:
  ret void

bb2:
  %tmp3 = phi i32 [ 0, %bb ], [ %tmp15, %bb13 ]
  %tmp4 = icmp slt i32 %tmp3, %tmp
  br i1 %tmp4, label %bb5, label %bb11

bb5:
  %tmp6 = sext i32 %tmp3 to i64
  %tmp7 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp6
  %tmp8 = load i32, ptr addrspace(1) %tmp7, align 4
  %tmp9 = icmp sgt i32 %tmp8, 10
  br i1 %tmp9, label %bb10, label %bb11

bb10:
  store i32 %tmp, ptr addrspace(1) %tmp7, align 4
  br label %bb13

bb11:
  %tmp12 = sdiv i32 %tmp3, 2
  br label %bb13

bb13:
  %tmp14 = phi i32 [ %tmp3, %bb10 ], [ %tmp12, %bb11 ]
  %tmp15 = add nsw i32 %tmp14, 1
  %tmp16 = icmp slt i32 %tmp14, 255
  br i1 %tmp16, label %bb2, label %bb1
}



; Loop with two exits to %.loopexit: one from the loop header (%bb2) and one
; from the latch (%bb8). The assertions expect the break condition to be
; accumulated in a scalar lane mask and removed from exec with s_andn2
; (32-bit or 64-bit form).
define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #0 {
; GFX1032-LABEL: test_loop_with_if_else_break:
; GFX1032:       ; %bb.0: ; %bb
; GFX1032-NEXT:    s_mov_b32 s2, 0
; GFX1032-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB11_6
; GFX1032-NEXT:  ; %bb.1: ; %.preheader
; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032-NEXT:    v_min_u32_e32 v1, 0x100, v0
; GFX1032-NEXT:    v_mov_b32_e32 v2, 0
; GFX1032-NEXT:    s_mov_b32 s3, 0
; GFX1032-NEXT:    ; implicit-def: $sgpr4
; GFX1032-NEXT:    s_branch .LBB11_4
; GFX1032-NEXT:  .LBB11_2: ; %bb8
; GFX1032-NEXT:    ; in Loop: Header=BB11_4 Depth=1
; GFX1032-NEXT:    s_add_i32 s3, s3, 1
; GFX1032-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX1032-NEXT:    v_cmp_ge_u32_e32 vcc_lo, s3, v1
; GFX1032-NEXT:    s_add_u32 s0, s0, 4
; GFX1032-NEXT:    s_addc_u32 s1, s1, 0
; GFX1032-NEXT:    s_andn2_b32 s4, s4, exec_lo
; GFX1032-NEXT:    s_and_b32 s5, vcc_lo, exec_lo
; GFX1032-NEXT:    s_or_b32 s4, s4, s5
; GFX1032-NEXT:  .LBB11_3: ; %Flow
; GFX1032-NEXT:    ; in Loop: Header=BB11_4 Depth=1
; GFX1032-NEXT:    s_and_b32 s5, exec_lo, s4
; GFX1032-NEXT:    s_or_b32 s2, s5, s2
; GFX1032-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
; GFX1032-NEXT:    s_cbranch_execz .LBB11_6
; GFX1032-NEXT:  .LBB11_4: ; %bb2
; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    global_load_dword v3, v2, s[0:1]
; GFX1032-NEXT:    s_or_b32 s4, s4, exec_lo
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 11, v3
; GFX1032-NEXT:    s_cbranch_vccz .LBB11_2
; GFX1032-NEXT:  ; %bb.5: ; in Loop: Header=BB11_4 Depth=1
; GFX1032-NEXT:    ; implicit-def: $sgpr3
; GFX1032-NEXT:    ; implicit-def: $sgpr0_sgpr1
; GFX1032-NEXT:    s_branch .LBB11_3
; GFX1032-NEXT:  .LBB11_6: ; %.loopexit
; GFX1032-NEXT:    s_endpgm
;
; GFX1064-LABEL: test_loop_with_if_else_break:
; GFX1064:       ; %bb.0: ; %bb
; GFX1064-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_mov_b32 s6, 0
; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB11_6
; GFX1064-NEXT:  ; %bb.1: ; %.preheader
; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064-NEXT:    v_min_u32_e32 v1, 0x100, v0
; GFX1064-NEXT:    v_mov_b32_e32 v2, 0
; GFX1064-NEXT:    s_mov_b64 s[2:3], 0
; GFX1064-NEXT:    ; implicit-def: $sgpr4_sgpr5
; GFX1064-NEXT:    s_branch .LBB11_4
; GFX1064-NEXT:  .LBB11_2: ; %bb8
; GFX1064-NEXT:    ; in Loop: Header=BB11_4 Depth=1
; GFX1064-NEXT:    s_add_i32 s6, s6, 1
; GFX1064-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX1064-NEXT:    v_cmp_ge_u32_e32 vcc, s6, v1
; GFX1064-NEXT:    s_add_u32 s0, s0, 4
; GFX1064-NEXT:    s_addc_u32 s1, s1, 0
; GFX1064-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
; GFX1064-NEXT:    s_and_b64 s[8:9], vcc, exec
; GFX1064-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
; GFX1064-NEXT:  .LBB11_3: ; %Flow
; GFX1064-NEXT:    ; in Loop: Header=BB11_4 Depth=1
; GFX1064-NEXT:    s_and_b64 s[8:9], exec, s[4:5]
; GFX1064-NEXT:    s_or_b64 s[2:3], s[8:9], s[2:3]
; GFX1064-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; GFX1064-NEXT:    s_cbranch_execz .LBB11_6
; GFX1064-NEXT:  .LBB11_4: ; %bb2
; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    global_load_dword v3, v2, s[0:1]
; GFX1064-NEXT:    s_or_b64 s[4:5], s[4:5], exec
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_cmp_gt_i32_e32 vcc, 11, v3
; GFX1064-NEXT:    s_cbranch_vccz .LBB11_2
; GFX1064-NEXT:  ; %bb.5: ; in Loop: Header=BB11_4 Depth=1
; GFX1064-NEXT:    ; implicit-def: $sgpr6
; GFX1064-NEXT:    ; implicit-def: $sgpr0_sgpr1
; GFX1064-NEXT:    s_branch .LBB11_3
; GFX1064-NEXT:  .LBB11_6: ; %.loopexit
; GFX1064-NEXT:    s_endpgm
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = icmp eq i32 %tmp, 0
  br i1 %tmp1, label %.loopexit, label %.preheader

.preheader:
  br label %bb2

bb2:
  %tmp3 = phi i32 [ %tmp9, %bb8 ], [ 0, %.preheader ]
  %tmp4 = zext i32 %tmp3 to i64
  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp4
  %tmp6 = load i32, ptr addrspace(1) %tmp5, align 4
  %tmp7 = icmp sgt i32 %tmp6, 10
  br i1 %tmp7, label %bb8, label %.loopexit

bb8:
  store i32 %tmp, ptr addrspace(1) %tmp5, align 4
  %tmp9 = add nuw nsw i32 %tmp3, 1
  %tmp10 = icmp ult i32 %tmp9, 256
  %tmp11 = icmp ult i32 %tmp9, %tmp
  %tmp12 = and i1 %tmp10, %tmp11
  br i1 %tmp12, label %bb2, label %.loopexit

.loopexit:
  ret void
}

; 64-bit add of a loaded value and a kernel argument. The assertions expect
; v_add_co_u32 followed by v_add_co_ci_u32 with the carry held in vcc_lo
; (wave32 group) or vcc (wave64 group).
define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
; GFX1032-LABEL: test_addc_vop2b:
; GFX1032:       ; %bb.0: ; %bb
; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_add_co_u32 v0, vcc_lo, v0, s2
; GFX1032-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1032-NEXT:    s_endpgm
;
; GFX1064-LABEL: test_addc_vop2b:
; GFX1064:       ; %bb.0: ; %bb
; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_add_co_u32 v0, vcc, v0, s2
; GFX1064-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1064-NEXT:    s_endpgm
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
  %tmp4 = load i64, ptr addrspace(1) %tmp3, align 8
  %tmp5 = add nsw i64 %tmp4, %arg1
  store i64 %tmp5, ptr addrspace(1) %tmp3, align 8
  ret void
}

; 64-bit subtract of the kernel argument from the loaded value. The assertions
; expect v_sub_co_u32 followed by v_subrev_co_ci_u32 for the high half.
define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
; GFX1032-LABEL: test_subbrev_vop2b:
; GFX1032:       ; %bb.0: ; %bb
; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, s2
; GFX1032-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1032-NEXT:    s_endpgm
;
; GFX1064-LABEL: test_subbrev_vop2b:
; GFX1064:       ; %bb.0: ; %bb
; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, v0, s2
; GFX1064-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1064-NEXT:    s_endpgm
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
  %tmp4 = load i64, ptr addrspace(1) %tmp3, align 8
  %tmp5 = sub nsw i64 %tmp4, %arg1
  store i64 %tmp5, ptr addrspace(1) %tmp3, align 8
  ret void
}

; 64-bit subtract of the loaded value from the kernel argument (operands in the
; opposite order to test_subbrev_vop2b). The assertions expect v_sub_co_u32
; followed by v_sub_co_ci_u32 for the high half.
define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
; GFX1032-LABEL: test_subb_vop2b:
; GFX1032:       ; %bb.0: ; %bb
; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX1032-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
; GFX1032-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1032-NEXT:    s_endpgm
;
; GFX1064-LABEL: test_subb_vop2b:
; GFX1064:       ; %bb.0: ; %bb
; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX1064-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX1064-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX1064-NEXT:    s_endpgm
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp3 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tmp
  %tmp4 = load i64, ptr addrspace(1) %tmp3, align 8
  %tmp5 = sub nsw i64 %arg1, %tmp4
  store i64 %tmp5, ptr addrspace(1) %tmp3, align 8
  ret void
}

define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-LABEL: test_udiv64:
; GFX1032:       ; %bb.0: ; %bb
; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    s_or_b64 s[8:9], s[6:7], s[4:5]
; GFX1032-NEXT:    s_mov_b32 s8, 0
; GFX1032-NEXT:    s_cmp_lg_u64 s[8:9], 0
; GFX1032-NEXT:    s_cbranch_scc0 .LBB15_4
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    v_cvt_f32_u32_e32 v0, s4
; GFX1032-NEXT:    v_cvt_f32_u32_e32 v1, s5
; GFX1032-NEXT:    s_sub_u32 s9, 0, s4
; GFX1032-NEXT:    s_subb_u32 s10, 0, s5
; GFX1032-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX1032-NEXT:    v_rcp_f32_e32 v0, v0
; GFX1032-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX1032-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
; GFX1032-NEXT:    v_trunc_f32_e32 v1, v1
; GFX1032-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX1032-NEXT:    v_cvt_u32_f32_e32 v1, v1
; GFX1032-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
; GFX1032-NEXT:    v_readfirstlane_b32 s1, v0
; GFX1032-NEXT:    s_mul_i32 s11, s9, s0
; GFX1032-NEXT:    s_mul_hi_u32 s13, s9, s1
; GFX1032-NEXT:    s_mul_i32 s12, s10, s1
; GFX1032-NEXT:    s_add_i32 s11, s13, s11
; GFX1032-NEXT:    s_mul_i32 s14, s9, s1
; GFX1032-NEXT:    s_add_i32 s11, s11, s12
; GFX1032-NEXT:    s_mul_hi_u32 s13, s1, s14
; GFX1032-NEXT:    s_mul_hi_u32 s15, s0, s14
; GFX1032-NEXT:    s_mul_i32 s12, s0, s14
; GFX1032-NEXT:    s_mul_hi_u32 s14, s1, s11
; GFX1032-NEXT:    s_mul_i32 s1, s1, 
s11 767; GFX1032-NEXT: s_mul_hi_u32 s16, s0, s11 768; GFX1032-NEXT: s_add_u32 s1, s13, s1 769; GFX1032-NEXT: s_addc_u32 s13, 0, s14 770; GFX1032-NEXT: s_add_u32 s1, s1, s12 771; GFX1032-NEXT: s_mul_i32 s11, s0, s11 772; GFX1032-NEXT: s_addc_u32 s1, s13, s15 773; GFX1032-NEXT: s_addc_u32 s12, s16, 0 774; GFX1032-NEXT: s_add_u32 s1, s1, s11 775; GFX1032-NEXT: s_addc_u32 s11, 0, s12 776; GFX1032-NEXT: v_add_co_u32 v0, s1, v0, s1 777; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 778; GFX1032-NEXT: s_addc_u32 s0, s0, s11 779; GFX1032-NEXT: v_readfirstlane_b32 s1, v0 780; GFX1032-NEXT: s_mul_i32 s11, s9, s0 781; GFX1032-NEXT: s_mul_hi_u32 s12, s9, s1 782; GFX1032-NEXT: s_mul_i32 s10, s10, s1 783; GFX1032-NEXT: s_add_i32 s11, s12, s11 784; GFX1032-NEXT: s_mul_i32 s9, s9, s1 785; GFX1032-NEXT: s_add_i32 s11, s11, s10 786; GFX1032-NEXT: s_mul_hi_u32 s12, s0, s9 787; GFX1032-NEXT: s_mul_i32 s13, s0, s9 788; GFX1032-NEXT: s_mul_hi_u32 s9, s1, s9 789; GFX1032-NEXT: s_mul_hi_u32 s14, s1, s11 790; GFX1032-NEXT: s_mul_i32 s1, s1, s11 791; GFX1032-NEXT: s_mul_hi_u32 s10, s0, s11 792; GFX1032-NEXT: s_add_u32 s1, s9, s1 793; GFX1032-NEXT: s_addc_u32 s9, 0, s14 794; GFX1032-NEXT: s_add_u32 s1, s1, s13 795; GFX1032-NEXT: s_mul_i32 s11, s0, s11 796; GFX1032-NEXT: s_addc_u32 s1, s9, s12 797; GFX1032-NEXT: s_addc_u32 s9, s10, 0 798; GFX1032-NEXT: s_add_u32 s1, s1, s11 799; GFX1032-NEXT: s_addc_u32 s9, 0, s9 800; GFX1032-NEXT: v_add_co_u32 v0, s1, v0, s1 801; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 802; GFX1032-NEXT: s_addc_u32 s0, s0, s9 803; GFX1032-NEXT: v_readfirstlane_b32 s1, v0 804; GFX1032-NEXT: s_mul_i32 s10, s6, s0 805; GFX1032-NEXT: s_mul_hi_u32 s9, s6, s0 806; GFX1032-NEXT: s_mul_hi_u32 s11, s7, s0 807; GFX1032-NEXT: s_mul_i32 s0, s7, s0 808; GFX1032-NEXT: s_mul_hi_u32 s12, s6, s1 809; GFX1032-NEXT: s_mul_hi_u32 s13, s7, s1 810; GFX1032-NEXT: s_mul_i32 s1, s7, s1 811; GFX1032-NEXT: s_add_u32 s10, s12, s10 812; GFX1032-NEXT: s_addc_u32 s9, 0, s9 813; GFX1032-NEXT: s_add_u32 s1, s10, s1 814; 
GFX1032-NEXT: s_addc_u32 s1, s9, s13 815; GFX1032-NEXT: s_addc_u32 s9, s11, 0 816; GFX1032-NEXT: s_add_u32 s1, s1, s0 817; GFX1032-NEXT: s_addc_u32 s9, 0, s9 818; GFX1032-NEXT: s_mul_hi_u32 s0, s4, s1 819; GFX1032-NEXT: s_mul_i32 s11, s4, s9 820; GFX1032-NEXT: s_mul_i32 s12, s4, s1 821; GFX1032-NEXT: s_add_i32 s0, s0, s11 822; GFX1032-NEXT: v_sub_co_u32 v0, s11, s6, s12 823; GFX1032-NEXT: s_mul_i32 s10, s5, s1 824; GFX1032-NEXT: s_add_i32 s0, s0, s10 825; GFX1032-NEXT: v_sub_co_u32 v1, s12, v0, s4 826; GFX1032-NEXT: s_sub_i32 s10, s7, s0 827; GFX1032-NEXT: s_cmp_lg_u32 s11, 0 828; GFX1032-NEXT: s_subb_u32 s10, s10, s5 829; GFX1032-NEXT: s_cmp_lg_u32 s12, 0 830; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 831; GFX1032-NEXT: s_subb_u32 s10, s10, 0 832; GFX1032-NEXT: s_cmp_ge_u32 s10, s5 833; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 834; GFX1032-NEXT: s_cselect_b32 s12, -1, 0 835; GFX1032-NEXT: s_cmp_eq_u32 s10, s5 836; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 837; GFX1032-NEXT: s_add_u32 s10, s1, 1 838; GFX1032-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo 839; GFX1032-NEXT: s_addc_u32 s12, s9, 0 840; GFX1032-NEXT: s_add_u32 s13, s1, 2 841; GFX1032-NEXT: s_addc_u32 s14, s9, 0 842; GFX1032-NEXT: s_cmp_lg_u32 s11, 0 843; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v0 844; GFX1032-NEXT: s_subb_u32 s0, s7, s0 845; GFX1032-NEXT: v_mov_b32_e32 v2, s13 846; GFX1032-NEXT: s_cmp_ge_u32 s0, s5 847; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 848; GFX1032-NEXT: s_cselect_b32 s7, -1, 0 849; GFX1032-NEXT: s_cmp_eq_u32 s0, s5 850; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 851; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 852; GFX1032-NEXT: v_mov_b32_e32 v1, s14 853; GFX1032-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0 854; GFX1032-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo 855; GFX1032-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo 856; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 857; GFX1032-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo 858; GFX1032-NEXT: v_cndmask_b32_e32 v0, 
s1, v2, vcc_lo 859; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 860; GFX1032-NEXT: s_cbranch_vccnz .LBB15_3 861; GFX1032-NEXT: .LBB15_2: 862; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s4 863; GFX1032-NEXT: s_sub_i32 s1, 0, s4 864; GFX1032-NEXT: v_rcp_iflag_f32_e32 v0, v0 865; GFX1032-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 866; GFX1032-NEXT: v_cvt_u32_f32_e32 v0, v0 867; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 868; GFX1032-NEXT: s_mul_i32 s1, s1, s0 869; GFX1032-NEXT: s_mul_hi_u32 s1, s0, s1 870; GFX1032-NEXT: s_add_i32 s0, s0, s1 871; GFX1032-NEXT: s_mul_hi_u32 s0, s6, s0 872; GFX1032-NEXT: s_mul_i32 s1, s0, s4 873; GFX1032-NEXT: s_add_i32 s5, s0, 1 874; GFX1032-NEXT: s_sub_i32 s1, s6, s1 875; GFX1032-NEXT: s_sub_i32 s6, s1, s4 876; GFX1032-NEXT: s_cmp_ge_u32 s1, s4 877; GFX1032-NEXT: s_cselect_b32 s0, s5, s0 878; GFX1032-NEXT: s_cselect_b32 s1, s6, s1 879; GFX1032-NEXT: s_add_i32 s5, s0, 1 880; GFX1032-NEXT: s_cmp_ge_u32 s1, s4 881; GFX1032-NEXT: s_mov_b32 s1, 0 882; GFX1032-NEXT: s_cselect_b32 s0, s5, s0 883; GFX1032-NEXT: v_mov_b32_e32 v0, s0 884; GFX1032-NEXT: v_mov_b32_e32 v1, s1 885; GFX1032-NEXT: .LBB15_3: 886; GFX1032-NEXT: v_mov_b32_e32 v2, 0 887; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:16 888; GFX1032-NEXT: s_endpgm 889; GFX1032-NEXT: .LBB15_4: 890; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 891; GFX1032-NEXT: s_branch .LBB15_2 892; 893; GFX1064-LABEL: test_udiv64: 894; GFX1064: ; %bb.0: ; %bb 895; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 896; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 897; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 898; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 899; GFX1064-NEXT: s_or_b64 s[0:1], s[6:7], s[4:5] 900; GFX1064-NEXT: s_mov_b32 s0, 0 901; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 902; GFX1064-NEXT: s_cbranch_scc0 .LBB15_4 903; GFX1064-NEXT: ; %bb.1: 904; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s4 905; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s5 906; GFX1064-NEXT: s_sub_u32 s9, 0, s4 907; GFX1064-NEXT: s_subb_u32 s10, 0, 
s5 908; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 909; GFX1064-NEXT: v_rcp_f32_e32 v0, v0 910; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 911; GFX1064-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 912; GFX1064-NEXT: v_trunc_f32_e32 v1, v1 913; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 914; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1 915; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0 916; GFX1064-NEXT: v_readfirstlane_b32 s8, v1 917; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 918; GFX1064-NEXT: s_mul_i32 s1, s9, s8 919; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s0 920; GFX1064-NEXT: s_mul_i32 s11, s10, s0 921; GFX1064-NEXT: s_add_i32 s1, s12, s1 922; GFX1064-NEXT: s_mul_i32 s13, s9, s0 923; GFX1064-NEXT: s_add_i32 s1, s1, s11 924; GFX1064-NEXT: s_mul_hi_u32 s12, s0, s13 925; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13 926; GFX1064-NEXT: s_mul_i32 s11, s8, s13 927; GFX1064-NEXT: s_mul_hi_u32 s13, s0, s1 928; GFX1064-NEXT: s_mul_i32 s0, s0, s1 929; GFX1064-NEXT: s_mul_hi_u32 s15, s8, s1 930; GFX1064-NEXT: s_add_u32 s0, s12, s0 931; GFX1064-NEXT: s_addc_u32 s12, 0, s13 932; GFX1064-NEXT: s_add_u32 s0, s0, s11 933; GFX1064-NEXT: s_mul_i32 s1, s8, s1 934; GFX1064-NEXT: s_addc_u32 s0, s12, s14 935; GFX1064-NEXT: s_addc_u32 s11, s15, 0 936; GFX1064-NEXT: s_add_u32 s0, s0, s1 937; GFX1064-NEXT: s_addc_u32 s11, 0, s11 938; GFX1064-NEXT: v_add_co_u32 v0, s[0:1], v0, s0 939; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 940; GFX1064-NEXT: s_addc_u32 s8, s8, s11 941; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 942; GFX1064-NEXT: s_mul_i32 s1, s9, s8 943; GFX1064-NEXT: s_mul_hi_u32 s11, s9, s0 944; GFX1064-NEXT: s_mul_i32 s10, s10, s0 945; GFX1064-NEXT: s_add_i32 s1, s11, s1 946; GFX1064-NEXT: s_mul_i32 s9, s9, s0 947; GFX1064-NEXT: s_add_i32 s1, s1, s10 948; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s9 949; GFX1064-NEXT: s_mul_i32 s12, s8, s9 950; GFX1064-NEXT: s_mul_hi_u32 s9, s0, s9 951; GFX1064-NEXT: s_mul_hi_u32 s13, s0, s1 952; GFX1064-NEXT: s_mul_i32 s0, s0, s1 953; GFX1064-NEXT: s_mul_hi_u32 s10, s8, 
s1 954; GFX1064-NEXT: s_add_u32 s0, s9, s0 955; GFX1064-NEXT: s_addc_u32 s9, 0, s13 956; GFX1064-NEXT: s_add_u32 s0, s0, s12 957; GFX1064-NEXT: s_mul_i32 s1, s8, s1 958; GFX1064-NEXT: s_addc_u32 s0, s9, s11 959; GFX1064-NEXT: s_addc_u32 s9, s10, 0 960; GFX1064-NEXT: s_add_u32 s0, s0, s1 961; GFX1064-NEXT: s_addc_u32 s9, 0, s9 962; GFX1064-NEXT: v_add_co_u32 v0, s[0:1], v0, s0 963; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 964; GFX1064-NEXT: s_addc_u32 s0, s8, s9 965; GFX1064-NEXT: v_readfirstlane_b32 s1, v0 966; GFX1064-NEXT: s_mul_i32 s9, s6, s0 967; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s0 968; GFX1064-NEXT: s_mul_hi_u32 s10, s7, s0 969; GFX1064-NEXT: s_mul_i32 s0, s7, s0 970; GFX1064-NEXT: s_mul_hi_u32 s11, s6, s1 971; GFX1064-NEXT: s_mul_hi_u32 s12, s7, s1 972; GFX1064-NEXT: s_mul_i32 s1, s7, s1 973; GFX1064-NEXT: s_add_u32 s9, s11, s9 974; GFX1064-NEXT: s_addc_u32 s8, 0, s8 975; GFX1064-NEXT: s_add_u32 s1, s9, s1 976; GFX1064-NEXT: s_addc_u32 s1, s8, s12 977; GFX1064-NEXT: s_addc_u32 s8, s10, 0 978; GFX1064-NEXT: s_add_u32 s10, s1, s0 979; GFX1064-NEXT: s_addc_u32 s11, 0, s8 980; GFX1064-NEXT: s_mul_hi_u32 s0, s4, s10 981; GFX1064-NEXT: s_mul_i32 s1, s4, s11 982; GFX1064-NEXT: s_mul_i32 s9, s4, s10 983; GFX1064-NEXT: s_add_i32 s12, s0, s1 984; GFX1064-NEXT: v_sub_co_u32 v0, s[0:1], s6, s9 985; GFX1064-NEXT: s_mul_i32 s8, s5, s10 986; GFX1064-NEXT: s_add_i32 s12, s12, s8 987; GFX1064-NEXT: v_sub_co_u32 v1, s[8:9], v0, s4 988; GFX1064-NEXT: s_sub_i32 s13, s7, s12 989; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 990; GFX1064-NEXT: s_subb_u32 s13, s13, s5 991; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0 992; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 993; GFX1064-NEXT: s_subb_u32 s8, s13, 0 994; GFX1064-NEXT: s_cmp_ge_u32 s8, s5 995; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 996; GFX1064-NEXT: s_cselect_b32 s9, -1, 0 997; GFX1064-NEXT: s_cmp_eq_u32 s8, s5 998; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 999; GFX1064-NEXT: s_add_u32 s8, s10, 1 1000; GFX1064-NEXT: v_cndmask_b32_e32 v1, 
s9, v1, vcc 1001; GFX1064-NEXT: s_addc_u32 s9, s11, 0 1002; GFX1064-NEXT: s_add_u32 s13, s10, 2 1003; GFX1064-NEXT: s_addc_u32 s14, s11, 0 1004; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 1005; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 1006; GFX1064-NEXT: s_subb_u32 s0, s7, s12 1007; GFX1064-NEXT: v_mov_b32_e32 v2, s13 1008; GFX1064-NEXT: s_cmp_ge_u32 s0, s5 1009; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1010; GFX1064-NEXT: s_cselect_b32 s7, -1, 0 1011; GFX1064-NEXT: s_cmp_eq_u32 s0, s5 1012; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 1013; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 1014; GFX1064-NEXT: v_mov_b32_e32 v1, s14 1015; GFX1064-NEXT: v_cndmask_b32_e64 v0, s7, v0, s[0:1] 1016; GFX1064-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc 1017; GFX1064-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc 1018; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 1019; GFX1064-NEXT: v_cndmask_b32_e32 v1, s11, v1, vcc 1020; GFX1064-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc 1021; GFX1064-NEXT: s_cbranch_execnz .LBB15_3 1022; GFX1064-NEXT: .LBB15_2: 1023; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s4 1024; GFX1064-NEXT: s_sub_i32 s1, 0, s4 1025; GFX1064-NEXT: v_rcp_iflag_f32_e32 v0, v0 1026; GFX1064-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1027; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0 1028; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 1029; GFX1064-NEXT: s_mul_i32 s1, s1, s0 1030; GFX1064-NEXT: s_mul_hi_u32 s1, s0, s1 1031; GFX1064-NEXT: s_add_i32 s0, s0, s1 1032; GFX1064-NEXT: s_mul_hi_u32 s0, s6, s0 1033; GFX1064-NEXT: s_mul_i32 s1, s0, s4 1034; GFX1064-NEXT: s_add_i32 s5, s0, 1 1035; GFX1064-NEXT: s_sub_i32 s1, s6, s1 1036; GFX1064-NEXT: s_sub_i32 s6, s1, s4 1037; GFX1064-NEXT: s_cmp_ge_u32 s1, s4 1038; GFX1064-NEXT: s_cselect_b32 s0, s5, s0 1039; GFX1064-NEXT: s_cselect_b32 s1, s6, s1 1040; GFX1064-NEXT: s_add_i32 s5, s0, 1 1041; GFX1064-NEXT: s_cmp_ge_u32 s1, s4 1042; GFX1064-NEXT: s_mov_b32 s1, 0 1043; GFX1064-NEXT: s_cselect_b32 s0, s5, s0 1044; GFX1064-NEXT: v_mov_b32_e32 v0, s0 1045; GFX1064-NEXT: 
v_mov_b32_e32 v1, s1 1046; GFX1064-NEXT: .LBB15_3: 1047; GFX1064-NEXT: v_mov_b32_e32 v2, 0 1048; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:16 1049; GFX1064-NEXT: s_endpgm 1050; GFX1064-NEXT: .LBB15_4: 1051; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 1052; GFX1064-NEXT: s_branch .LBB15_2 1053bb: 1054 %tmp = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 1 1055 %tmp1 = load i64, ptr addrspace(1) %tmp, align 8 1056 %tmp2 = load i64, ptr addrspace(1) %arg, align 8 1057 %tmp3 = udiv i64 %tmp1, %tmp2 1058 %tmp4 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 2 1059 store i64 %tmp3, ptr addrspace(1) %tmp4, align 8 1060 ret void 1061} 1062 1063define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1064; GFX1032-LABEL: test_div_scale_f32: 1065; GFX1032: ; %bb.0: 1066; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1067; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1068; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1069; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] glc dlc 1070; GFX1032-NEXT: s_waitcnt vmcnt(0) 1071; GFX1032-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc 1072; GFX1032-NEXT: s_waitcnt vmcnt(0) 1073; GFX1032-NEXT: v_mov_b32_e32 v0, 0 1074; GFX1032-NEXT: v_div_scale_f32 v1, s2, v2, v2, v1 1075; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] 1076; GFX1032-NEXT: s_endpgm 1077; 1078; GFX1064-LABEL: test_div_scale_f32: 1079; GFX1064: ; %bb.0: 1080; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1081; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1082; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1083; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] glc dlc 1084; GFX1064-NEXT: s_waitcnt vmcnt(0) 1085; GFX1064-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc 1086; GFX1064-NEXT: s_waitcnt vmcnt(0) 1087; GFX1064-NEXT: v_mov_b32_e32 v0, 0 1088; GFX1064-NEXT: v_div_scale_f32 v1, s[2:3], v2, v2, v1 1089; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] 1090; GFX1064-NEXT: s_endpgm 1091 %tid = call 
i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 1092 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid 1093 %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 1094 1095 %a = load volatile float, ptr addrspace(1) %gep.0, align 4 1096 %b = load volatile float, ptr addrspace(1) %gep.1, align 4 1097 1098 %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone 1099 %result0 = extractvalue { float, i1 } %result, 0 1100 store float %result0, ptr addrspace(1) %out, align 4 1101 ret void 1102} 1103 1104define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %in) #0 { 1105; GFX1032-LABEL: test_div_scale_f64: 1106; GFX1032: ; %bb.0: 1107; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1108; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1109; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1110; GFX1032-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc 1111; GFX1032-NEXT: s_waitcnt vmcnt(0) 1112; GFX1032-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc 1113; GFX1032-NEXT: s_waitcnt vmcnt(0) 1114; GFX1032-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[2:3], v[0:1] 1115; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1116; GFX1032-NEXT: v_mov_b32_e32 v2, 0 1117; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1118; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1119; GFX1032-NEXT: s_endpgm 1120; 1121; GFX1064-LABEL: test_div_scale_f64: 1122; GFX1064: ; %bb.0: 1123; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1124; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1125; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1126; GFX1064-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc dlc 1127; GFX1064-NEXT: s_waitcnt vmcnt(0) 1128; GFX1064-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc dlc 1129; GFX1064-NEXT: s_waitcnt vmcnt(0) 1130; GFX1064-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[2:3], v[0:1] 1131; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1132; 
GFX1064-NEXT: v_mov_b32_e32 v2, 0 1133; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1134; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1135; GFX1064-NEXT: s_endpgm 1136 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 1137 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid 1138 %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 1139 1140 %a = load volatile double, ptr addrspace(1) %gep.0, align 8 1141 %b = load volatile double, ptr addrspace(1) %gep.1, align 8 1142 1143 %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone 1144 %result0 = extractvalue { double, i1 } %result, 0 1145 store double %result0, ptr addrspace(1) %out, align 8 1146 ret void 1147} 1148 1149define i64 @test_mad_i64_i32(i32 %arg0, i32 %arg1, i64 %arg2) #0 { 1150; GFX1032-LABEL: test_mad_i64_i32: 1151; GFX1032: ; %bb.0: 1152; GFX1032-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1153; GFX1032-NEXT: v_mad_i64_i32 v[0:1], s4, v0, v1, v[2:3] 1154; GFX1032-NEXT: s_setpc_b64 s[30:31] 1155; 1156; GFX1064-LABEL: test_mad_i64_i32: 1157; GFX1064: ; %bb.0: 1158; GFX1064-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1159; GFX1064-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3] 1160; GFX1064-NEXT: s_setpc_b64 s[30:31] 1161 %sext0 = sext i32 %arg0 to i64 1162 %sext1 = sext i32 %arg1 to i64 1163 %mul = mul i64 %sext0, %sext1 1164 %mad = add i64 %mul, %arg2 1165 ret i64 %mad 1166} 1167 1168define i64 @test_mad_u64_u32(i32 %arg0, i32 %arg1, i64 %arg2) #0 { 1169; GFX1032-LABEL: test_mad_u64_u32: 1170; GFX1032: ; %bb.0: 1171; GFX1032-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1172; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s4, v0, v1, v[2:3] 1173; GFX1032-NEXT: s_setpc_b64 s[30:31] 1174; 1175; GFX1064-LABEL: test_mad_u64_u32: 1176; GFX1064: ; %bb.0: 1177; GFX1064-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1178; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3] 1179; GFX1064-NEXT: s_setpc_b64 s[30:31] 1180 %sext0 = 
zext i32 %arg0 to i64 1181 %sext1 = zext i32 %arg1 to i64 1182 %mul = mul i64 %sext0, %sext1 1183 %mad = add i64 %mul, %arg2 1184 ret i64 %mad 1185} 1186 1187define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, float %a, float %b, float %c, i1 %d) nounwind { 1188; GFX1032-LABEL: test_div_fmas_f32: 1189; GFX1032: ; %bb.0: 1190; GFX1032-NEXT: s_clause 0x1 1191; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 1192; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 1193; GFX1032-NEXT: v_mov_b32_e32 v2, 0 1194; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1195; GFX1032-NEXT: v_mov_b32_e32 v0, s1 1196; GFX1032-NEXT: v_mov_b32_e32 v1, s2 1197; GFX1032-NEXT: s_bitcmp1_b32 s3, 0 1198; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 1199; GFX1032-NEXT: v_div_fmas_f32 v0, s0, v0, v1 1200; GFX1032-NEXT: global_store_dword v2, v0, s[6:7] 1201; GFX1032-NEXT: s_endpgm 1202; 1203; GFX1064-LABEL: test_div_fmas_f32: 1204; GFX1064: ; %bb.0: 1205; GFX1064-NEXT: s_clause 0x1 1206; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 1207; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 1208; GFX1064-NEXT: v_mov_b32_e32 v2, 0 1209; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1210; GFX1064-NEXT: v_mov_b32_e32 v0, s1 1211; GFX1064-NEXT: v_mov_b32_e32 v1, s2 1212; GFX1064-NEXT: s_bitcmp1_b32 s3, 0 1213; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 1214; GFX1064-NEXT: v_div_fmas_f32 v0, s0, v0, v1 1215; GFX1064-NEXT: global_store_dword v2, v0, s[6:7] 1216; GFX1064-NEXT: s_endpgm 1217 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone 1218 store float %result, ptr addrspace(1) %out, align 4 1219 ret void 1220} 1221 1222define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) nounwind { 1223; GFX1032-LABEL: test_div_fmas_f64: 1224; GFX1032: ; %bb.0: 1225; GFX1032-NEXT: s_clause 0x1 1226; GFX1032-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 1227; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x44 1228; GFX1032-NEXT: 
s_waitcnt lgkmcnt(0) 1229; GFX1032-NEXT: v_mov_b32_e32 v0, s12 1230; GFX1032-NEXT: v_mov_b32_e32 v1, s13 1231; GFX1032-NEXT: v_mov_b32_e32 v2, s14 1232; GFX1032-NEXT: v_mov_b32_e32 v3, s15 1233; GFX1032-NEXT: s_bitcmp1_b32 s0, 0 1234; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 1235; GFX1032-NEXT: v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3] 1236; GFX1032-NEXT: v_mov_b32_e32 v2, 0 1237; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] 1238; GFX1032-NEXT: s_endpgm 1239; 1240; GFX1064-LABEL: test_div_fmas_f64: 1241; GFX1064: ; %bb.0: 1242; GFX1064-NEXT: s_clause 0x1 1243; GFX1064-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 1244; GFX1064-NEXT: s_load_dword s0, s[4:5], 0x44 1245; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1246; GFX1064-NEXT: v_mov_b32_e32 v0, s12 1247; GFX1064-NEXT: v_mov_b32_e32 v1, s13 1248; GFX1064-NEXT: v_mov_b32_e32 v2, s14 1249; GFX1064-NEXT: v_mov_b32_e32 v3, s15 1250; GFX1064-NEXT: s_bitcmp1_b32 s0, 0 1251; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 1252; GFX1064-NEXT: v_div_fmas_f64 v[0:1], s[10:11], v[0:1], v[2:3] 1253; GFX1064-NEXT: v_mov_b32_e32 v2, 0 1254; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] 1255; GFX1064-NEXT: s_endpgm 1256 %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone 1257 store double %result, ptr addrspace(1) %out, align 8 1258 ret void 1259} 1260 1261 1262 1263define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) #0 { 1264; GFX1032-LABEL: test_div_fmas_f32_i1_phi_vcc: 1265; GFX1032: ; %bb.0: ; %entry 1266; GFX1032-NEXT: s_clause 0x1 1267; GFX1032-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 1268; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 1269; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1270; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 1271; GFX1032-NEXT: s_mov_b32 vcc_lo, 0 1272; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1273; GFX1032-NEXT: global_load_dwordx3 v[1:3], v1, s[10:11] 1274; 
GFX1032-NEXT: s_and_saveexec_b32 s1, s0 1275; GFX1032-NEXT: s_cbranch_execz .LBB22_2 1276; GFX1032-NEXT: ; %bb.1: ; %bb 1277; GFX1032-NEXT: v_mov_b32_e32 v0, 0 1278; GFX1032-NEXT: global_load_dword v0, v0, s[2:3] glc dlc 1279; GFX1032-NEXT: s_waitcnt vmcnt(0) 1280; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 1281; GFX1032-NEXT: s_and_b32 vcc_lo, vcc_lo, exec_lo 1282; GFX1032-NEXT: .LBB22_2: ; %exit 1283; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 1284; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 1285; GFX1032-NEXT: v_mov_b32_e32 v0, 0 1286; GFX1032-NEXT: s_waitcnt vmcnt(0) 1287; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v3 1288; GFX1032-NEXT: global_store_dword v0, v1, s[8:9] offset:8 1289; GFX1032-NEXT: s_endpgm 1290; 1291; GFX1064-LABEL: test_div_fmas_f32_i1_phi_vcc: 1292; GFX1064: ; %bb.0: ; %entry 1293; GFX1064-NEXT: s_clause 0x1 1294; GFX1064-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 1295; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1296; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1297; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 1298; GFX1064-NEXT: s_mov_b64 vcc, 0 1299; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1300; GFX1064-NEXT: global_load_dwordx3 v[1:3], v1, s[10:11] 1301; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] 1302; GFX1064-NEXT: s_cbranch_execz .LBB22_2 1303; GFX1064-NEXT: ; %bb.1: ; %bb 1304; GFX1064-NEXT: v_mov_b32_e32 v0, 0 1305; GFX1064-NEXT: global_load_dword v0, v0, s[6:7] glc dlc 1306; GFX1064-NEXT: s_waitcnt vmcnt(0) 1307; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 1308; GFX1064-NEXT: s_and_b64 vcc, vcc, exec 1309; GFX1064-NEXT: .LBB22_2: ; %exit 1310; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 1311; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 1312; GFX1064-NEXT: v_mov_b32_e32 v0, 0 1313; GFX1064-NEXT: s_waitcnt vmcnt(0) 1314; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3 1315; GFX1064-NEXT: global_store_dword v0, v1, s[8:9] offset:8 1316; GFX1064-NEXT: s_endpgm 1317entry: 1318 %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 1319 
%gep.out = getelementptr float, ptr addrspace(1) %out, i32 2 1320 %gep.a = getelementptr float, ptr addrspace(1) %in, i32 %tid 1321 %gep.b = getelementptr float, ptr addrspace(1) %gep.a, i32 1 1322 %gep.c = getelementptr float, ptr addrspace(1) %gep.a, i32 2 1323 1324 %a = load float, ptr addrspace(1) %gep.a 1325 %b = load float, ptr addrspace(1) %gep.b 1326 %c = load float, ptr addrspace(1) %gep.c 1327 1328 %cmp0 = icmp eq i32 %tid, 0 1329 br i1 %cmp0, label %bb, label %exit 1330 1331bb: 1332 %val = load volatile i32, ptr addrspace(1) %dummy 1333 %cmp1 = icmp ne i32 %val, 0 1334 br label %exit 1335 1336exit: 1337 %cond = phi i1 [false, %entry], [%cmp1, %bb] 1338 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond) nounwind readnone 1339 store float %result, ptr addrspace(1) %gep.out, align 4 1340 ret void 1341} 1342 1343 1344define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 { 1345; GFX1032-LABEL: fdiv_f32: 1346; GFX1032: ; %bb.0: ; %entry 1347; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1348; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1349; GFX1032-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 1350; GFX1032-NEXT: v_rcp_f32_e32 v1, v0 1351; GFX1032-NEXT: v_fma_f32 v2, -v0, v1, 1.0 1352; GFX1032-NEXT: v_fmac_f32_e32 v1, v2, v1 1353; GFX1032-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 1354; GFX1032-NEXT: v_mul_f32_e32 v3, v2, v1 1355; GFX1032-NEXT: v_fma_f32 v4, -v0, v3, v2 1356; GFX1032-NEXT: v_fmac_f32_e32 v3, v4, v1 1357; GFX1032-NEXT: v_fma_f32 v0, -v0, v3, v2 1358; GFX1032-NEXT: v_div_fmas_f32 v0, v0, v1, v3 1359; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1360; GFX1032-NEXT: v_div_fixup_f32 v0, v0, s3, s2 1361; GFX1032-NEXT: global_store_dword v1, v0, s[0:1] 1362; GFX1032-NEXT: s_endpgm 1363; 1364; GFX1064-LABEL: fdiv_f32: 1365; GFX1064: ; %bb.0: ; %entry 1366; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1367; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1368; GFX1064-NEXT: v_div_scale_f32 v0, s[4:5], s3, s3, s2 
1369; GFX1064-NEXT: v_rcp_f32_e32 v1, v0 1370; GFX1064-NEXT: v_fma_f32 v2, -v0, v1, 1.0 1371; GFX1064-NEXT: v_fmac_f32_e32 v1, v2, v1 1372; GFX1064-NEXT: v_div_scale_f32 v2, vcc, s2, s3, s2 1373; GFX1064-NEXT: v_mul_f32_e32 v3, v2, v1 1374; GFX1064-NEXT: v_fma_f32 v4, -v0, v3, v2 1375; GFX1064-NEXT: v_fmac_f32_e32 v3, v4, v1 1376; GFX1064-NEXT: v_fma_f32 v0, -v0, v3, v2 1377; GFX1064-NEXT: v_div_fmas_f32 v0, v0, v1, v3 1378; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1379; GFX1064-NEXT: v_div_fixup_f32 v0, v0, s3, s2 1380; GFX1064-NEXT: global_store_dword v1, v0, s[0:1] 1381; GFX1064-NEXT: s_endpgm 1382entry: 1383 %fdiv = fdiv float %a, %b 1384 store float %fdiv, ptr addrspace(1) %out 1385 ret void 1386} 1387 1388define amdgpu_kernel void @test_br_cc_f16( 1389; GFX1032-LABEL: test_br_cc_f16: 1390; GFX1032: ; %bb.0: ; %entry 1391; GFX1032-NEXT: s_clause 0x1 1392; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1393; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1394; GFX1032-NEXT: v_mov_b32_e32 v0, 0 1395; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1396; GFX1032-NEXT: s_clause 0x1 1397; GFX1032-NEXT: global_load_ushort v1, v0, s[2:3] 1398; GFX1032-NEXT: global_load_ushort v2, v0, s[6:7] 1399; GFX1032-NEXT: s_waitcnt vmcnt(0) 1400; GFX1032-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v2 1401; GFX1032-NEXT: s_cbranch_vccnz .LBB24_2 1402; GFX1032-NEXT: ; %bb.1: ; %one 1403; GFX1032-NEXT: global_store_short v0, v1, s[0:1] 1404; GFX1032-NEXT: s_endpgm 1405; GFX1032-NEXT: .LBB24_2: ; %two 1406; GFX1032-NEXT: global_store_short v0, v2, s[0:1] 1407; GFX1032-NEXT: s_endpgm 1408; 1409; GFX1064-LABEL: test_br_cc_f16: 1410; GFX1064: ; %bb.0: ; %entry 1411; GFX1064-NEXT: s_clause 0x1 1412; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1413; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1414; GFX1064-NEXT: v_mov_b32_e32 v0, 0 1415; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1416; GFX1064-NEXT: s_clause 0x1 1417; GFX1064-NEXT: global_load_ushort v1, v0, s[2:3] 1418; GFX1064-NEXT: global_load_ushort 
v2, v0, s[6:7] 1419; GFX1064-NEXT: s_waitcnt vmcnt(0) 1420; GFX1064-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v2 1421; GFX1064-NEXT: s_cbranch_vccnz .LBB24_2 1422; GFX1064-NEXT: ; %bb.1: ; %one 1423; GFX1064-NEXT: global_store_short v0, v1, s[0:1] 1424; GFX1064-NEXT: s_endpgm 1425; GFX1064-NEXT: .LBB24_2: ; %two 1426; GFX1064-NEXT: global_store_short v0, v2, s[0:1] 1427; GFX1064-NEXT: s_endpgm 1428 ptr addrspace(1) %r, 1429 ptr addrspace(1) %a, 1430 ptr addrspace(1) %b) { 1431entry: 1432 %a.val = load half, ptr addrspace(1) %a 1433 %b.val = load half, ptr addrspace(1) %b 1434 %fcmp = fcmp olt half %a.val, %b.val 1435 br i1 %fcmp, label %one, label %two 1436 1437one: 1438 store half %a.val, ptr addrspace(1) %r 1439 ret void 1440 1441two: 1442 store half %b.val, ptr addrspace(1) %r 1443 ret void 1444} 1445 1446define amdgpu_kernel void @test_brcc_i1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %val) #0 { 1447; GCN-LABEL: test_brcc_i1: 1448; GCN: ; %bb.0: 1449; GCN-NEXT: s_load_dword s0, s[4:5], 0x34 1450; GCN-NEXT: s_waitcnt lgkmcnt(0) 1451; GCN-NEXT: s_bitcmp0_b32 s0, 0 1452; GCN-NEXT: s_cbranch_scc1 .LBB25_2 1453; GCN-NEXT: ; %bb.1: ; %store 1454; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1455; GCN-NEXT: v_mov_b32_e32 v0, 0 1456; GCN-NEXT: v_mov_b32_e32 v1, 0xde 1457; GCN-NEXT: s_waitcnt lgkmcnt(0) 1458; GCN-NEXT: global_store_dword v0, v1, s[0:1] 1459; GCN-NEXT: .LBB25_2: ; %end 1460; GCN-NEXT: s_endpgm 1461 %cmp0 = icmp ne i1 %val, 0 1462 br i1 %cmp0, label %store, label %end 1463 1464store: 1465 store i32 222, ptr addrspace(1) %out 1466 ret void 1467 1468end: 1469 ret void 1470} 1471 1472define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) #0 { 1473; GFX1032-LABEL: test_preserve_condition_undef_flag: 1474; GFX1032: ; %bb.0: ; %bb0 1475; GFX1032-NEXT: s_clause 0x1 1476; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x2c 1477; GFX1032-NEXT: s_load_dword s1, s[4:5], 0x24 1478; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 
1479; GFX1032-NEXT: v_cmp_nlt_f32_e64 s2, s0, 1.0 1480; GFX1032-NEXT: v_cmp_nlt_f32_e64 s1, s1, 1.0 1481; GFX1032-NEXT: v_cmp_ngt_f32_e64 s0, s0, 0 1482; GFX1032-NEXT: s_or_b32 s1, s2, s1 1483; GFX1032-NEXT: s_or_b32 s0, s1, s0 1484; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s0 1485; GFX1032-NEXT: s_cbranch_vccnz .LBB26_2 1486; GFX1032-NEXT: ; %bb.1: ; %bb1 1487; GFX1032-NEXT: v_mov_b32_e32 v0, 0 1488; GFX1032-NEXT: global_store_dword v[0:1], v0, off 1489; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1490; GFX1032-NEXT: .LBB26_2: ; %bb2 1491; GFX1032-NEXT: s_endpgm 1492; 1493; GFX1064-LABEL: test_preserve_condition_undef_flag: 1494; GFX1064: ; %bb.0: ; %bb0 1495; GFX1064-NEXT: s_clause 0x1 1496; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x2c 1497; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x24 1498; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1499; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[0:1], s6, 1.0 1500; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, 1.0 1501; GFX1064-NEXT: v_cmp_ngt_f32_e64 s[4:5], s6, 0 1502; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] 1503; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] 1504; GFX1064-NEXT: s_and_b64 vcc, exec, s[0:1] 1505; GFX1064-NEXT: s_cbranch_vccnz .LBB26_2 1506; GFX1064-NEXT: ; %bb.1: ; %bb1 1507; GFX1064-NEXT: v_mov_b32_e32 v0, 0 1508; GFX1064-NEXT: global_store_dword v[0:1], v0, off 1509; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1510; GFX1064-NEXT: .LBB26_2: ; %bb2 1511; GFX1064-NEXT: s_endpgm 1512bb0: 1513 %tmp = icmp sgt i32 %arg1, 4 1514 %undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef) 1515 %tmp4 = select i1 %undef, float %arg, float 1.000000e+00 1516 %tmp5 = fcmp ogt float %arg2, 0.000000e+00 1517 %tmp6 = fcmp olt float %arg2, 1.000000e+00 1518 %tmp7 = fcmp olt float %arg, %tmp4 1519 %tmp8 = and i1 %tmp5, %tmp6 1520 %tmp9 = and i1 %tmp8, %tmp7 1521 br i1 %tmp9, label %bb1, label %bb2 1522 1523bb1: 1524 store volatile i32 0, ptr addrspace(1) undef 1525 br label %bb2 1526 1527bb2: 1528 ret void 1529} 1530 1531define amdgpu_kernel 
void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { 1532; GFX1032-LABEL: test_invert_true_phi_cond_break_loop: 1533; GFX1032: ; %bb.0: ; %bb 1534; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x24 1535; GFX1032-NEXT: ; implicit-def: $sgpr1 1536; GFX1032-NEXT: ; implicit-def: $sgpr2 1537; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1538; GFX1032-NEXT: v_subrev_nc_u32_e32 v0, s0, v0 1539; GFX1032-NEXT: s_mov_b32 s0, 0 1540; GFX1032-NEXT: s_branch .LBB27_2 1541; GFX1032-NEXT: .LBB27_1: ; %Flow 1542; GFX1032-NEXT: ; in Loop: Header=BB27_2 Depth=1 1543; GFX1032-NEXT: s_xor_b32 s3, s1, -1 1544; GFX1032-NEXT: s_add_i32 s2, s2, 1 1545; GFX1032-NEXT: s_and_b32 s3, exec_lo, s3 1546; GFX1032-NEXT: s_or_b32 s0, s3, s0 1547; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 1548; GFX1032-NEXT: s_cbranch_execz .LBB27_4 1549; GFX1032-NEXT: .LBB27_2: ; %bb1 1550; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 1551; GFX1032-NEXT: s_or_b32 s1, s1, exec_lo 1552; GFX1032-NEXT: s_cmp_gt_i32 s2, -1 1553; GFX1032-NEXT: s_cbranch_scc1 .LBB27_1 1554; GFX1032-NEXT: ; %bb.3: ; %bb4 1555; GFX1032-NEXT: ; in Loop: Header=BB27_2 Depth=1 1556; GFX1032-NEXT: global_load_dword v1, v[0:1], off glc dlc 1557; GFX1032-NEXT: s_waitcnt vmcnt(0) 1558; GFX1032-NEXT: s_andn2_b32 s1, s1, exec_lo 1559; GFX1032-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v1 1560; GFX1032-NEXT: s_and_b32 s3, vcc_lo, exec_lo 1561; GFX1032-NEXT: s_or_b32 s1, s1, s3 1562; GFX1032-NEXT: s_branch .LBB27_1 1563; GFX1032-NEXT: .LBB27_4: ; %bb9 1564; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 1565; GFX1032-NEXT: v_mov_b32_e32 v0, 7 1566; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 1567; GFX1032-NEXT: ds_write_b32 v0, v0 1568; GFX1032-NEXT: s_endpgm 1569; 1570; GFX1064-LABEL: test_invert_true_phi_cond_break_loop: 1571; GFX1064: ; %bb.0: ; %bb 1572; GFX1064-NEXT: s_load_dword s0, s[4:5], 0x24 1573; GFX1064-NEXT: ; implicit-def: $sgpr2_sgpr3 1574; GFX1064-NEXT: ; implicit-def: $sgpr4 1575; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1576; GFX1064-NEXT: 
v_subrev_nc_u32_e32 v0, s0, v0 1577; GFX1064-NEXT: s_mov_b64 s[0:1], 0 1578; GFX1064-NEXT: s_branch .LBB27_2 1579; GFX1064-NEXT: .LBB27_1: ; %Flow 1580; GFX1064-NEXT: ; in Loop: Header=BB27_2 Depth=1 1581; GFX1064-NEXT: s_xor_b64 s[6:7], s[2:3], -1 1582; GFX1064-NEXT: s_add_i32 s4, s4, 1 1583; GFX1064-NEXT: s_and_b64 s[6:7], exec, s[6:7] 1584; GFX1064-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] 1585; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] 1586; GFX1064-NEXT: s_cbranch_execz .LBB27_4 1587; GFX1064-NEXT: .LBB27_2: ; %bb1 1588; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 1589; GFX1064-NEXT: s_or_b64 s[2:3], s[2:3], exec 1590; GFX1064-NEXT: s_cmp_gt_i32 s4, -1 1591; GFX1064-NEXT: s_cbranch_scc1 .LBB27_1 1592; GFX1064-NEXT: ; %bb.3: ; %bb4 1593; GFX1064-NEXT: ; in Loop: Header=BB27_2 Depth=1 1594; GFX1064-NEXT: global_load_dword v1, v[0:1], off glc dlc 1595; GFX1064-NEXT: s_waitcnt vmcnt(0) 1596; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], exec 1597; GFX1064-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 1598; GFX1064-NEXT: s_and_b64 s[6:7], vcc, exec 1599; GFX1064-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] 1600; GFX1064-NEXT: s_branch .LBB27_1 1601; GFX1064-NEXT: .LBB27_4: ; %bb9 1602; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] 1603; GFX1064-NEXT: v_mov_b32_e32 v0, 7 1604; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 1605; GFX1064-NEXT: ds_write_b32 v0, v0 1606; GFX1064-NEXT: s_endpgm 1607bb: 1608 %id = call i32 @llvm.amdgcn.workitem.id.x() 1609 %tmp = sub i32 %id, %arg 1610 br label %bb1 1611 1612bb1: ; preds = %Flow, %bb 1613 %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] 1614 %lsr.iv.next = add i32 %lsr.iv, 1 1615 %cmp0 = icmp slt i32 %lsr.iv.next, 0 1616 br i1 %cmp0, label %bb4, label %Flow 1617 1618bb4: ; preds = %bb1 1619 %load = load volatile i32, ptr addrspace(1) undef, align 4 1620 %cmp1 = icmp sge i32 %tmp, %load 1621 br label %Flow 1622 1623Flow: ; preds = %bb4, %bb1 1624 %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] 1625 %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, 
%bb1 ] 1626 br i1 %tmp3, label %bb1, label %bb9 1627 1628bb9: ; preds = %Flow 1629 store volatile i32 7, ptr addrspace(3) undef 1630 ret void 1631} 1632; Extracts an element of the constant vector <0, 1, 2, 3> at index workitem.id.x - 512. The checks expect a chain of v_cmp and v_cndmask instructions using vcc_lo under wave32 and vcc under wave64. 1633define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1) %out) #0 { 1634; GFX1032-LABEL: test_movrels_extract_neg_offset_vgpr: 1635; GFX1032: ; %bb.0: ; %entry 1636; GFX1032-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0 1637; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1638; GFX1032-NEXT: v_mov_b32_e32 v2, 0 1639; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 1640; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 1641; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 2, v0 1642; GFX1032-NEXT: v_cndmask_b32_e32 v1, 2, v1, vcc_lo 1643; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 3, v0 1644; GFX1032-NEXT: v_cndmask_b32_e32 v0, 3, v1, vcc_lo 1645; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1646; GFX1032-NEXT: global_store_dword v2, v0, s[0:1] 1647; GFX1032-NEXT: s_endpgm 1648; 1649; GFX1064-LABEL: test_movrels_extract_neg_offset_vgpr: 1650; GFX1064: ; %bb.0: ; %entry 1651; GFX1064-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0 1652; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1653; GFX1064-NEXT: v_mov_b32_e32 v2, 0 1654; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 1655; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 1656; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 1657; GFX1064-NEXT: v_cndmask_b32_e32 v1, 2, v1, vcc 1658; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0 1659; GFX1064-NEXT: v_cndmask_b32_e32 v0, 3, v1, vcc 1660; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1661; GFX1064-NEXT: global_store_dword v2, v0, s[0:1] 1662; GFX1064-NEXT: s_endpgm 1663entry: 1664 %id = call i32 @llvm.amdgcn.workitem.id.x() #1 1665 %index = add i32 %id, -512 1666 %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index 1667 store i32 %value, ptr addrspace(1) %out 1668 ret void 1669} 1670; llvm.amdgcn.set.inactive.i32 with inactive value 42, wrapped in llvm.amdgcn.strict.wwm.i32. The checks expect exec to be saved by s_or_saveexec and restored by s_mov at the wave's mask width. 1671define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0 { 1672; GFX1032-LABEL: test_set_inactive: 1673; GFX1032: 
; %bb.0: 1674; GFX1032-NEXT: s_clause 0x1 1675; GFX1032-NEXT: s_load_dword s2, s[4:5], 0x2c 1676; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1677; GFX1032-NEXT: s_or_saveexec_b32 s3, -1 1678; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1679; GFX1032-NEXT: v_cndmask_b32_e64 v0, 42, s2, s3 1680; GFX1032-NEXT: s_mov_b32 exec_lo, s3 1681; GFX1032-NEXT: v_mov_b32_e32 v1, 0 1682; GFX1032-NEXT: v_mov_b32_e32 v2, v0 1683; GFX1032-NEXT: global_store_dword v1, v2, s[0:1] 1684; GFX1032-NEXT: s_endpgm 1685; 1686; GFX1064-LABEL: test_set_inactive: 1687; GFX1064: ; %bb.0: 1688; GFX1064-NEXT: s_clause 0x1 1689; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x2c 1690; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1691; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 1692; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1693; GFX1064-NEXT: v_cndmask_b32_e64 v0, 42, s6, s[2:3] 1694; GFX1064-NEXT: s_mov_b64 exec, s[2:3] 1695; GFX1064-NEXT: v_mov_b32_e32 v1, 0 1696; GFX1064-NEXT: v_mov_b32_e32 v2, v0 1697; GFX1064-NEXT: global_store_dword v1, v2, s[0:1] 1698; GFX1064-NEXT: s_endpgm 1699 %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) 1700 %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) 1701 store i32 %tmp, ptr addrspace(1) %out 1702 ret void 1703} 1704; 64-bit variant of test_set_inactive with inactive value 0. The checks expect one v_cndmask_b32 per 32-bit half inside the s_or_saveexec region. 1705define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 { 1706; GFX1032-LABEL: test_set_inactive_64: 1707; GFX1032: ; %bb.0: 1708; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1709; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 1710; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 1711; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, s4 1712; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, s2, s4 1713; GFX1032-NEXT: s_mov_b32 exec_lo, s4 1714; GFX1032-NEXT: v_mov_b32_e32 v2, v0 1715; GFX1032-NEXT: v_mov_b32_e32 v4, 0 1716; GFX1032-NEXT: v_mov_b32_e32 v3, v1 1717; GFX1032-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] 1718; GFX1032-NEXT: s_endpgm 1719; 1720; GFX1064-LABEL: test_set_inactive_64: 1721; GFX1064: ; %bb.0: 1722; 
GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1723; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1724; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 1725; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[4:5] 1726; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, s2, s[4:5] 1727; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1728; GFX1064-NEXT: v_mov_b32_e32 v2, v0 1729; GFX1064-NEXT: v_mov_b32_e32 v4, 0 1730; GFX1064-NEXT: v_mov_b32_e32 v3, v1 1731; GFX1064-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] 1732; GFX1064-NEXT: s_endpgm 1733 %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) 1734 %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) 1735 store i64 %tmp, ptr addrspace(1) %out 1736 ret void 1737} 1738; llvm.amdgcn.kill with a constant false condition in an amdgpu_ps function. The checks expect exec to be cleared via s_andn2 at the wave's mask width and a null export on the branch target. 1739define amdgpu_ps void @test_kill_i1_terminator_float() #0 { 1740; GFX1032-LABEL: test_kill_i1_terminator_float: 1741; GFX1032: ; %bb.0: 1742; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, exec_lo 1743; GFX1032-NEXT: s_cbranch_scc0 .LBB31_1 1744; GFX1032-NEXT: s_endpgm 1745; GFX1032-NEXT: .LBB31_1: 1746; GFX1032-NEXT: s_mov_b32 exec_lo, 0 1747; GFX1032-NEXT: exp null off, off, off, off done vm 1748; GFX1032-NEXT: s_endpgm 1749; 1750; GFX1064-LABEL: test_kill_i1_terminator_float: 1751; GFX1064: ; %bb.0: 1752; GFX1064-NEXT: s_andn2_b64 exec, exec, exec 1753; GFX1064-NEXT: s_cbranch_scc0 .LBB31_1 1754; GFX1064-NEXT: s_endpgm 1755; GFX1064-NEXT: .LBB31_1: 1756; GFX1064-NEXT: s_mov_b64 exec, 0 1757; GFX1064-NEXT: exp null off, off, off, off done vm 1758; GFX1064-NEXT: s_endpgm 1759 call void @llvm.amdgcn.kill(i1 false) 1760 ret void 1761} 1762; llvm.amdgcn.kill on the or of two signed integer compares in an amdgpu_gs function, followed by an export. The checks expect the compare masks to be combined and applied to exec with scalar ops at the wave's mask width. 1763define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d) #0 { 1764; GFX1032-LABEL: test_kill_i1_terminator_i1: 1765; GFX1032: ; %bb.0: 1766; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v0, v1 1767; GFX1032-NEXT: v_cmp_lt_i32_e64 s0, v2, v3 1768; GFX1032-NEXT: s_mov_b32 s1, exec_lo 1769; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 1770; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s0 1771; GFX1032-NEXT: s_andn2_b32 s1, s1, s0 
1772; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s1 1773; GFX1032-NEXT: v_mov_b32_e32 v0, 0 1774; GFX1032-NEXT: exp mrt0 off, off, off, off 1775; GFX1032-NEXT: s_endpgm 1776; GFX1032-NEXT: ; %bb.1: 1777; GFX1032-NEXT: s_mov_b32 exec_lo, 0 1778; GFX1032-NEXT: s_endpgm 1779; 1780; GFX1064-LABEL: test_kill_i1_terminator_i1: 1781; GFX1064: ; %bb.0: 1782; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 1783; GFX1064-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v3 1784; GFX1064-NEXT: s_mov_b64 s[2:3], exec 1785; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1786; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[0:1] 1787; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] 1788; GFX1064-NEXT: s_and_b64 exec, exec, s[2:3] 1789; GFX1064-NEXT: v_mov_b32_e32 v0, 0 1790; GFX1064-NEXT: exp mrt0 off, off, off, off 1791; GFX1064-NEXT: s_endpgm 1792; GFX1064-NEXT: ; %bb.1: 1793; GFX1064-NEXT: s_mov_b64 exec, 0 1794; GFX1064-NEXT: s_endpgm 1795 %c1 = icmp slt i32 %a, %b 1796 %c2 = icmp slt i32 %c, %d 1797 %x = or i1 %c1, %c2 1798 call void @llvm.amdgcn.kill(i1 %x) 1799 call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false) 1800 ret void 1801} 1802; amdgpu_ps loop that exits once the float counter %ctr.iv exceeds 7.0 and otherwise samples a 1D image. The checks expect the loop compare to write vcc_lo under wave32 and vcc under wave64, with s_wqm applied to exec on entry. 1803define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) #0 { 1804; GFX1032-LABEL: test_loop_vcc: 1805; GFX1032: ; %bb.0: ; %entry 1806; GFX1032-NEXT: s_mov_b32 s0, exec_lo 1807; GFX1032-NEXT: s_wqm_b32 exec_lo, exec_lo 1808; GFX1032-NEXT: v_mov_b32_e32 v7, v3 1809; GFX1032-NEXT: v_mov_b32_e32 v6, v2 1810; GFX1032-NEXT: v_mov_b32_e32 v5, v1 1811; GFX1032-NEXT: v_mov_b32_e32 v4, v0 1812; GFX1032-NEXT: v_mov_b32_e32 v8, 0 1813; GFX1032-NEXT: s_branch .LBB33_2 1814; GFX1032-NEXT: .LBB33_1: ; %body 1815; GFX1032-NEXT: ; in Loop: Header=BB33_2 Depth=1 1816; GFX1032-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D 1817; GFX1032-NEXT: v_add_f32_e32 v8, 2.0, v8 1818; GFX1032-NEXT: s_cbranch_execz .LBB33_4 1819; GFX1032-NEXT: .LBB33_2: ; %loop 1820; GFX1032-NEXT: ; =>This Inner Loop 
Header: Depth=1 1821; GFX1032-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8 1822; GFX1032-NEXT: s_waitcnt vmcnt(0) 1823; GFX1032-NEXT: v_mov_b32_e32 v0, v4 1824; GFX1032-NEXT: v_mov_b32_e32 v1, v5 1825; GFX1032-NEXT: v_mov_b32_e32 v2, v6 1826; GFX1032-NEXT: v_mov_b32_e32 v3, v7 1827; GFX1032-NEXT: s_cbranch_vccz .LBB33_1 1828; GFX1032-NEXT: ; %bb.3: 1829; GFX1032-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 1830; GFX1032-NEXT: ; implicit-def: $vgpr8 1831; GFX1032-NEXT: .LBB33_4: ; %break 1832; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s0 1833; GFX1032-NEXT: s_waitcnt vmcnt(0) 1834; GFX1032-NEXT: ; return to shader part epilog 1835; 1836; GFX1064-LABEL: test_loop_vcc: 1837; GFX1064: ; %bb.0: ; %entry 1838; GFX1064-NEXT: s_mov_b64 s[0:1], exec 1839; GFX1064-NEXT: s_wqm_b64 exec, exec 1840; GFX1064-NEXT: v_mov_b32_e32 v7, v3 1841; GFX1064-NEXT: v_mov_b32_e32 v6, v2 1842; GFX1064-NEXT: v_mov_b32_e32 v5, v1 1843; GFX1064-NEXT: v_mov_b32_e32 v4, v0 1844; GFX1064-NEXT: v_mov_b32_e32 v8, 0 1845; GFX1064-NEXT: s_branch .LBB33_2 1846; GFX1064-NEXT: .LBB33_1: ; %body 1847; GFX1064-NEXT: ; in Loop: Header=BB33_2 Depth=1 1848; GFX1064-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D 1849; GFX1064-NEXT: v_add_f32_e32 v8, 2.0, v8 1850; GFX1064-NEXT: s_cbranch_execz .LBB33_4 1851; GFX1064-NEXT: .LBB33_2: ; %loop 1852; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 1853; GFX1064-NEXT: v_cmp_lt_f32_e32 vcc, 0x40e00000, v8 1854; GFX1064-NEXT: s_waitcnt vmcnt(0) 1855; GFX1064-NEXT: v_mov_b32_e32 v0, v4 1856; GFX1064-NEXT: v_mov_b32_e32 v1, v5 1857; GFX1064-NEXT: v_mov_b32_e32 v2, v6 1858; GFX1064-NEXT: v_mov_b32_e32 v3, v7 1859; GFX1064-NEXT: s_cbranch_vccz .LBB33_1 1860; GFX1064-NEXT: ; %bb.3: 1861; GFX1064-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 1862; GFX1064-NEXT: ; implicit-def: $vgpr8 1863; GFX1064-NEXT: .LBB33_4: ; %break 1864; GFX1064-NEXT: s_and_b64 exec, exec, s[0:1] 1865; GFX1064-NEXT: s_waitcnt vmcnt(0) 1866; GFX1064-NEXT: ; return to 
shader part epilog 1867entry: 1868 br label %loop 1869 1870loop: 1871 %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ] 1872 %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ] 1873 %cc = fcmp ogt float %ctr.iv, 7.0 1874 br i1 %cc, label %break, label %body 1875 1876body: 1877 %c.iv0 = extractelement <4 x float> %c.iv, i32 0 1878 %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 0, i32 0, i32 0) 1879 %ctr.next = fadd float %ctr.iv, 2.0 1880 br label %loop 1881 1882break: 1883 ret <4 x float> %c.iv 1884} 1885; fadd whose result is passed through llvm.amdgcn.wwm.f32. The checks expect the add to sit between s_or_saveexec with -1 and an s_mov that restores exec. 1886define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0, float %src1) { 1887; GFX1032-LABEL: test_wwm1: 1888; GFX1032: ; %bb.0: ; %main_body 1889; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 1890; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1891; GFX1032-NEXT: v_mov_b32_e32 v3, v0 1892; GFX1032-NEXT: v_add_f32_e32 v2, v3, v2 1893; GFX1032-NEXT: s_mov_b32 exec_lo, s0 1894; GFX1032-NEXT: v_mov_b32_e32 v0, v2 1895; GFX1032-NEXT: ; return to shader part epilog 1896; 1897; GFX1064-LABEL: test_wwm1: 1898; GFX1064: ; %bb.0: ; %main_body 1899; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 1900; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1901; GFX1064-NEXT: v_mov_b32_e32 v3, v0 1902; GFX1064-NEXT: v_add_f32_e32 v2, v3, v2 1903; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 1904; GFX1064-NEXT: v_mov_b32_e32 v0, v2 1905; GFX1064-NEXT: ; return to shader part epilog 1906main_body: 1907 %out = fadd float %src0, %src1 1908 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) 1909 ret float %out.0 1910} 1911; Same llvm.amdgcn.wwm.f32 pattern placed in the %if block of a branch on the mbcnt result. The checks expect s_and_saveexec for the branch and a nested s_or_saveexec with -1 for the wwm region. 1912define amdgpu_ps float @test_wwm2(i32 inreg %idx) { 1913; GFX1032-LABEL: test_wwm2: 1914; GFX1032: ; %bb.0: ; %main_body 1915; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 1916; GFX1032-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 1917; GFX1032-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 1918; GFX1032-NEXT: v_mov_b32_e32 v0, 0 1919; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 1920; GFX1032-NEXT: 
s_cbranch_execz .LBB35_2 1921; GFX1032-NEXT: ; %bb.1: ; %if 1922; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 1923; GFX1032-NEXT: v_mov_b32_e32 v1, s0 1924; GFX1032-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 1925; GFX1032-NEXT: s_waitcnt vmcnt(0) 1926; GFX1032-NEXT: v_add_f32_e32 v2, v1, v1 1927; GFX1032-NEXT: s_mov_b32 exec_lo, s2 1928; GFX1032-NEXT: v_mov_b32_e32 v0, v2 1929; GFX1032-NEXT: v_add_f32_e32 v0, v1, v0 1930; GFX1032-NEXT: .LBB35_2: ; %endif 1931; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 1932; GFX1032-NEXT: ; return to shader part epilog 1933; 1934; GFX1064-LABEL: test_wwm2: 1935; GFX1064: ; %bb.0: ; %main_body 1936; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 1937; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 1938; GFX1064-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 1939; GFX1064-NEXT: v_mov_b32_e32 v0, 0 1940; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 1941; GFX1064-NEXT: s_cbranch_execz .LBB35_2 1942; GFX1064-NEXT: ; %bb.1: ; %if 1943; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 1944; GFX1064-NEXT: v_mov_b32_e32 v1, s0 1945; GFX1064-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 1946; GFX1064-NEXT: s_waitcnt vmcnt(0) 1947; GFX1064-NEXT: v_add_f32_e32 v2, v1, v1 1948; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 1949; GFX1064-NEXT: v_mov_b32_e32 v0, v2 1950; GFX1064-NEXT: v_add_f32_e32 v0, v1, v0 1951; GFX1064-NEXT: .LBB35_2: ; %endif 1952; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 1953; GFX1064-NEXT: ; return to shader part epilog 1954main_body: 1955 ; use mbcnt to make sure the branch is divergent 1956 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 1957 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 1958 %cc = icmp uge i32 %hi, 16 1959 br i1 %cc, label %endif, label %if 1960 1961if: 1962 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) 1963 %out = fadd float %src, %src 1964 %out.0 = call float @llvm.amdgcn.wwm.f32(float %out) 1965 %out.1 = fadd float %src, %out.0 1966 br label %endif 
1967 1968endif: 1969 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ] 1970 ret float %out.2 1971} 1972; Counterpart of test_wwm1 that uses llvm.amdgcn.strict.wwm.f32. The expected code has the same shape, with s_or_saveexec and the exec restore at the wave's mask width. 1973define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0, float %src1) { 1974; GFX1032-LABEL: test_strict_wwm1: 1975; GFX1032: ; %bb.0: ; %main_body 1976; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 1977; GFX1032-NEXT: v_mov_b32_e32 v2, v1 1978; GFX1032-NEXT: v_mov_b32_e32 v3, v0 1979; GFX1032-NEXT: v_add_f32_e32 v2, v3, v2 1980; GFX1032-NEXT: s_mov_b32 exec_lo, s0 1981; GFX1032-NEXT: v_mov_b32_e32 v0, v2 1982; GFX1032-NEXT: ; return to shader part epilog 1983; 1984; GFX1064-LABEL: test_strict_wwm1: 1985; GFX1064: ; %bb.0: ; %main_body 1986; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 1987; GFX1064-NEXT: v_mov_b32_e32 v2, v1 1988; GFX1064-NEXT: v_mov_b32_e32 v3, v0 1989; GFX1064-NEXT: v_add_f32_e32 v2, v3, v2 1990; GFX1064-NEXT: s_mov_b64 exec, s[0:1] 1991; GFX1064-NEXT: v_mov_b32_e32 v0, v2 1992; GFX1064-NEXT: ; return to shader part epilog 1993main_body: 1994 %out = fadd float %src0, %src1 1995 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) 1996 ret float %out.0 1997} 1998; Counterpart of test_wwm2 that uses llvm.amdgcn.strict.wwm.f32 inside the %if block of the mbcnt-based branch. 1999define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx) { 2000; GFX1032-LABEL: test_strict_wwm2: 2001; GFX1032: ; %bb.0: ; %main_body 2002; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 2003; GFX1032-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 2004; GFX1032-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0 2005; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2006; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo 2007; GFX1032-NEXT: s_cbranch_execz .LBB37_2 2008; GFX1032-NEXT: ; %bb.1: ; %if 2009; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 2010; GFX1032-NEXT: v_mov_b32_e32 v1, s0 2011; GFX1032-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2012; GFX1032-NEXT: s_waitcnt vmcnt(0) 2013; GFX1032-NEXT: v_add_f32_e32 v2, v1, v1 2014; GFX1032-NEXT: s_mov_b32 exec_lo, s2 2015; GFX1032-NEXT: v_mov_b32_e32 v0, v2 2016; GFX1032-NEXT: v_add_f32_e32 v0, v1, v0 2017; GFX1032-NEXT: .LBB37_2: ; %endif 
2018; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 2019; GFX1032-NEXT: ; return to shader part epilog 2020; 2021; GFX1064-LABEL: test_strict_wwm2: 2022; GFX1064: ; %bb.0: ; %main_body 2023; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 2024; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 2025; GFX1064-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0 2026; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2027; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc 2028; GFX1064-NEXT: s_cbranch_execz .LBB37_2 2029; GFX1064-NEXT: ; %bb.1: ; %if 2030; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 2031; GFX1064-NEXT: v_mov_b32_e32 v1, s0 2032; GFX1064-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen 2033; GFX1064-NEXT: s_waitcnt vmcnt(0) 2034; GFX1064-NEXT: v_add_f32_e32 v2, v1, v1 2035; GFX1064-NEXT: s_mov_b64 exec, s[4:5] 2036; GFX1064-NEXT: v_mov_b32_e32 v0, v2 2037; GFX1064-NEXT: v_add_f32_e32 v0, v1, v0 2038; GFX1064-NEXT: .LBB37_2: ; %endif 2039; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] 2040; GFX1064-NEXT: ; return to shader part epilog 2041main_body: 2042 ; use mbcnt to make sure the branch is divergent 2043 %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) 2044 %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) 2045 %cc = icmp uge i32 %hi, 16 2046 br i1 %cc, label %endif, label %if 2047 2048if: 2049 %src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0) 2050 %out = fadd float %src, %src 2051 %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out) 2052 %out.1 = fadd float %src, %out.0 2053 br label %endif 2054 2055endif: 2056 %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ] 2057 ret float %out.2 2058} 2059 2060; Interpolates two attributes with interp.p1 and interp.p2 and uses them as coordinates for a 2D image sample. The checks expect s_wqm on exec before the interpolation and exec restored by s_and before image_sample. 2061define amdgpu_ps <4 x float> @test_wqm1(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #0 { 2062; GFX1032-LABEL: test_wqm1: 2063; GFX1032: ; %bb.0: ; %main_body 2064; GFX1032-NEXT: s_mov_b32 s0, exec_lo 2065; GFX1032-NEXT: s_wqm_b32 exec_lo, exec_lo 2066; 
GFX1032-NEXT: s_mov_b32 m0, s3 2067; GFX1032-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x 2068; GFX1032-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y 2069; GFX1032-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x 2070; GFX1032-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y 2071; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s0 2072; GFX1032-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D 2073; GFX1032-NEXT: s_waitcnt vmcnt(0) 2074; GFX1032-NEXT: ; return to shader part epilog 2075; 2076; GFX1064-LABEL: test_wqm1: 2077; GFX1064: ; %bb.0: ; %main_body 2078; GFX1064-NEXT: s_mov_b64 s[0:1], exec 2079; GFX1064-NEXT: s_wqm_b64 exec, exec 2080; GFX1064-NEXT: s_mov_b32 m0, s3 2081; GFX1064-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x 2082; GFX1064-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y 2083; GFX1064-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x 2084; GFX1064-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y 2085; GFX1064-NEXT: s_and_b64 exec, exec, s[0:1] 2086; GFX1064-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D 2087; GFX1064-NEXT: s_waitcnt vmcnt(0) 2088; GFX1064-NEXT: ; return to shader part epilog 2089main_body: 2090 %inst23 = extractelement <2 x float> %pos, i32 0 2091 %inst24 = extractelement <2 x float> %pos, i32 1 2092 %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0) 2093 %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0) 2094 %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0) 2095 %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0) 2096 %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) 2097 ret <4 x float> %tex 2098} 2099; Sum of two buffer loads passed through llvm.amdgcn.wqm.i32 via bitcasts. The checks expect s_wqm on exec before the loads and exec restored by s_and after the add. 2100define amdgpu_ps float @test_wqm2(i32 inreg %idx0, i32 inreg %idx1) #0 { 2101; GFX1032-LABEL: test_wqm2: 2102; GFX1032: ; 
%bb.0: ; %main_body 2103; GFX1032-NEXT: s_mov_b32 s2, exec_lo 2104; GFX1032-NEXT: s_wqm_b32 exec_lo, exec_lo 2105; GFX1032-NEXT: v_mov_b32_e32 v0, s0 2106; GFX1032-NEXT: v_mov_b32_e32 v1, s1 2107; GFX1032-NEXT: s_clause 0x1 2108; GFX1032-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen 2109; GFX1032-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen 2110; GFX1032-NEXT: s_waitcnt vmcnt(0) 2111; GFX1032-NEXT: v_add_f32_e32 v0, v2, v3 2112; GFX1032-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 2113; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s2 2114; GFX1032-NEXT: ; return to shader part epilog 2115; 2116; GFX1064-LABEL: test_wqm2: 2117; GFX1064: ; %bb.0: ; %main_body 2118; GFX1064-NEXT: s_mov_b64 s[2:3], exec 2119; GFX1064-NEXT: s_wqm_b64 exec, exec 2120; GFX1064-NEXT: v_mov_b32_e32 v0, s0 2121; GFX1064-NEXT: v_mov_b32_e32 v1, s1 2122; GFX1064-NEXT: s_clause 0x1 2123; GFX1064-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen 2124; GFX1064-NEXT: buffer_load_dword v3, v1, s[0:3], 0 idxen 2125; GFX1064-NEXT: s_waitcnt vmcnt(0) 2126; GFX1064-NEXT: v_add_f32_e32 v0, v2, v3 2127; GFX1064-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec 2128; GFX1064-NEXT: s_and_b64 exec, exec, s[2:3] 2129; GFX1064-NEXT: ; return to shader part epilog 2130main_body: 2131 %src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0) 2132 %src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0) 2133 %out = fadd float %src0, %src1 2134 %out.0 = bitcast float %out to i32 2135 %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0) 2136 %out.2 = bitcast i32 %out.1 to float 2137 ret float %out.2 2138} 2139; llvm.amdgcn.fcmp.i64.f32 comparing %src with fabs(%a). Under wave32 the checks expect a 32-bit compare result stored with a zero high dword, and under wave64 both halves of the 64-bit mask are stored. 2140define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) { 2141; GFX1032-LABEL: test_intr_fcmp_i64: 2142; GFX1032: ; %bb.0: 2143; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2144; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2145; GFX1032-NEXT: s_waitcnt 
lgkmcnt(0) 2146; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| 2147; GFX1032-NEXT: v_mov_b32_e32 v0, s2 2148; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 2149; GFX1032-NEXT: s_endpgm 2150; 2151; GFX1064-LABEL: test_intr_fcmp_i64: 2152; GFX1064: ; %bb.0: 2153; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2154; GFX1064-NEXT: v_mov_b32_e32 v2, 0 2155; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2156; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| 2157; GFX1064-NEXT: v_mov_b32_e32 v0, s2 2158; GFX1064-NEXT: v_mov_b32_e32 v1, s3 2159; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 2160; GFX1064-NEXT: s_endpgm 2161 %temp = call float @llvm.fabs.f32(float %a) 2162 %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1) 2163 store i64 %result, ptr addrspace(1) %out 2164 ret void 2165} 2166; llvm.amdgcn.icmp.i64.i32 comparing %src with 100 using condition code 32, which the checks show as v_cmp_eq_u32. Under wave32 the high dword of the stored i64 is zero, and under wave64 both SGPR halves are stored. 2167define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { 2168; GFX1032-LABEL: test_intr_icmp_i64: 2169; GFX1032: ; %bb.0: 2170; GFX1032-NEXT: s_clause 0x1 2171; GFX1032-NEXT: s_load_dword s2, s[4:5], 0x2c 2172; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2173; GFX1032-NEXT: v_mov_b32_e32 v1, 0 2174; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2175; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 2176; GFX1032-NEXT: v_mov_b32_e32 v0, s2 2177; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 2178; GFX1032-NEXT: s_endpgm 2179; 2180; GFX1064-LABEL: test_intr_icmp_i64: 2181; GFX1064: ; %bb.0: 2182; GFX1064-NEXT: s_clause 0x1 2183; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x2c 2184; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2185; GFX1064-NEXT: v_mov_b32_e32 v2, 0 2186; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2187; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s2 2188; GFX1064-NEXT: v_mov_b32_e32 v0, s2 2189; GFX1064-NEXT: v_mov_b32_e32 v1, s3 2190; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 2191; GFX1064-NEXT: s_endpgm 2192 %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32) 2193 store i64 %result, ptr 
addrspace(1) %out 2194 ret void 2195} 2196; llvm.amdgcn.fcmp.i32.f32 comparing %src with fabs(%a). Under wave64 the checks expect only the low half of the 64-bit compare result to be stored. 2197define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) { 2198; GFX1032-LABEL: test_intr_fcmp_i32: 2199; GFX1032: ; %bb.0: 2200; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2201; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2202; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2203; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| 2204; GFX1032-NEXT: v_mov_b32_e32 v1, s2 2205; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] 2206; GFX1032-NEXT: s_endpgm 2207; 2208; GFX1064-LABEL: test_intr_fcmp_i32: 2209; GFX1064: ; %bb.0: 2210; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2211; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2212; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2213; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| 2214; GFX1064-NEXT: v_mov_b32_e32 v1, s2 2215; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] 2216; GFX1064-NEXT: s_endpgm 2217 %temp = call float @llvm.fabs.f32(float %a) 2218 %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1) 2219 store i32 %result, ptr addrspace(1) %out 2220 ret void 2221} 2222; llvm.amdgcn.icmp.i32.i32 comparing %src with 100. Under wave64 the checks expect only the low half of the 64-bit compare result to be stored. 2223define amdgpu_kernel void @test_intr_icmp_i32(ptr addrspace(1) %out, i32 %src) { 2224; GFX1032-LABEL: test_intr_icmp_i32: 2225; GFX1032: ; %bb.0: 2226; GFX1032-NEXT: s_clause 0x1 2227; GFX1032-NEXT: s_load_dword s2, s[4:5], 0x2c 2228; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2229; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2230; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2231; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2 2232; GFX1032-NEXT: v_mov_b32_e32 v1, s2 2233; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] 2234; GFX1032-NEXT: s_endpgm 2235; 2236; GFX1064-LABEL: test_intr_icmp_i32: 2237; GFX1064: ; %bb.0: 2238; GFX1064-NEXT: s_clause 0x1 2239; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x2c 2240; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2241; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2242; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2243; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s2 2244; 
GFX1064-NEXT: v_mov_b32_e32 v1, s2 2245; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] 2246; GFX1064-NEXT: s_endpgm 2247 %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 32) 2248 store i32 %result, ptr addrspace(1) %out 2249 ret void 2250} 2251; Result of llvm.amdgcn.wqm.vote on an fcmp une feeds llvm.amdgcn.kill before an export. The checks expect s_wqm applied to the compare mask in vcc_lo under wave32 and vcc under wave64. 2252define amdgpu_ps void @test_wqm_vote(float %a) { 2253; GFX1032-LABEL: test_wqm_vote: 2254; GFX1032: ; %bb.0: 2255; GFX1032-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 2256; GFX1032-NEXT: s_mov_b32 s0, exec_lo 2257; GFX1032-NEXT: v_mov_b32_e32 v0, 0 2258; GFX1032-NEXT: s_wqm_b32 s1, vcc_lo 2259; GFX1032-NEXT: s_andn2_b32 s1, exec_lo, s1 2260; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 2261; GFX1032-NEXT: s_cbranch_scc0 .LBB44_2 2262; GFX1032-NEXT: ; %bb.1: 2263; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s0 2264; GFX1032-NEXT: exp mrt0 off, off, off, off 2265; GFX1032-NEXT: s_endpgm 2266; GFX1032-NEXT: .LBB44_2: 2267; GFX1032-NEXT: s_mov_b32 exec_lo, 0 2268; GFX1032-NEXT: exp null off, off, off, off done vm 2269; GFX1032-NEXT: s_endpgm 2270; 2271; GFX1064-LABEL: test_wqm_vote: 2272; GFX1064: ; %bb.0: 2273; GFX1064-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 2274; GFX1064-NEXT: s_mov_b64 s[0:1], exec 2275; GFX1064-NEXT: v_mov_b32_e32 v0, 0 2276; GFX1064-NEXT: s_wqm_b64 s[2:3], vcc 2277; GFX1064-NEXT: s_andn2_b64 s[2:3], exec, s[2:3] 2278; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 2279; GFX1064-NEXT: s_cbranch_scc0 .LBB44_2 2280; GFX1064-NEXT: ; %bb.1: 2281; GFX1064-NEXT: s_and_b64 exec, exec, s[0:1] 2282; GFX1064-NEXT: exp mrt0 off, off, off, off 2283; GFX1064-NEXT: s_endpgm 2284; GFX1064-NEXT: .LBB44_2: 2285; GFX1064-NEXT: s_mov_b64 exec, 0 2286; GFX1064-NEXT: exp null off, off, off, off done vm 2287; GFX1064-NEXT: s_endpgm 2288 %c1 = fcmp une float %a, 0.0 2289 %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1) 2290 call void @llvm.amdgcn.kill(i1 %c2) 2291 call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false) 2292 ret void 2293} 2294; Constant-true branch to %for.end around a loop whose exit condition is undef. The checks expect vcc to be seeded from exec with s_mov at the wave's mask width. 2295define amdgpu_kernel void 
@test_branch_true() #2 { 2296; GFX1032-LABEL: test_branch_true: 2297; GFX1032: ; %bb.0: ; %entry 2298; GFX1032-NEXT: s_mov_b32 vcc_lo, exec_lo 2299; GFX1032-NEXT: s_cbranch_execnz .LBB45_2 2300; GFX1032-NEXT: ; %bb.1: ; %for.body.lr.ph 2301; GFX1032-NEXT: s_branch .LBB45_3 2302; GFX1032-NEXT: .LBB45_2: ; %Flow 2303; GFX1032-NEXT: s_branch .LBB45_5 2304; GFX1032-NEXT: .LBB45_3: ; %for.body 2305; GFX1032-NEXT: s_mov_b32 vcc_lo, 0 2306; GFX1032-NEXT: ; %bb.4: ; %for.end.loopexit 2307; GFX1032-NEXT: s_branch .LBB45_2 2308; GFX1032-NEXT: .LBB45_5: ; %for.end 2309; GFX1032-NEXT: s_endpgm 2310; 2311; GFX1064-LABEL: test_branch_true: 2312; GFX1064: ; %bb.0: ; %entry 2313; GFX1064-NEXT: s_mov_b64 vcc, exec 2314; GFX1064-NEXT: s_cbranch_execnz .LBB45_2 2315; GFX1064-NEXT: ; %bb.1: ; %for.body.lr.ph 2316; GFX1064-NEXT: s_branch .LBB45_3 2317; GFX1064-NEXT: .LBB45_2: ; %Flow 2318; GFX1064-NEXT: s_branch .LBB45_5 2319; GFX1064-NEXT: .LBB45_3: ; %for.body 2320; GFX1064-NEXT: s_mov_b64 vcc, 0 2321; GFX1064-NEXT: ; %bb.4: ; %for.end.loopexit 2322; GFX1064-NEXT: s_branch .LBB45_2 2323; GFX1064-NEXT: .LBB45_5: ; %for.end 2324; GFX1064-NEXT: s_endpgm 2325entry: 2326 br i1 true, label %for.end, label %for.body.lr.ph 2327 2328for.body.lr.ph: ; preds = %entry 2329 br label %for.body 2330 2331for.body: ; preds = %for.body, %for.body.lr.ph 2332 br i1 undef, label %for.end, label %for.body 2333 2334for.end: ; preds = %for.body, %entry 2335 ret void 2336} 2337 2338define amdgpu_ps float @test_ps_live() #0 { 2339; GFX1032-LABEL: test_ps_live: 2340; GFX1032: ; %bb.0: 2341; GFX1032-NEXT: s_mov_b32 s0, exec_lo 2342; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 2343; GFX1032-NEXT: ; return to shader part epilog 2344; 2345; GFX1064-LABEL: test_ps_live: 2346; GFX1064: ; %bb.0: 2347; GFX1064-NEXT: s_mov_b64 s[0:1], exec 2348; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] 2349; GFX1064-NEXT: ; return to shader part epilog 2350 %live = call i1 @llvm.amdgcn.ps.live() 2351 %live.32 = zext i1 %live 
to i32 2352 %r = bitcast i32 %live.32 to float 2353 ret float %r 2354} 2355 2356define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 2357; GFX1032-LABEL: test_vccnz_ifcvt_triangle64: 2358; GFX1032: ; %bb.0: ; %entry 2359; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2360; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2361; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2362; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2363; GFX1032-NEXT: v_cmp_neq_f64_e64 s4, s[2:3], 1.0 2364; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s4 2365; GFX1032-NEXT: s_cbranch_vccnz .LBB47_2 2366; GFX1032-NEXT: ; %bb.1: ; %if 2367; GFX1032-NEXT: v_add_f64 v[0:1], s[2:3], s[2:3] 2368; GFX1032-NEXT: s_branch .LBB47_3 2369; GFX1032-NEXT: .LBB47_2: 2370; GFX1032-NEXT: v_mov_b32_e32 v0, s2 2371; GFX1032-NEXT: v_mov_b32_e32 v1, s3 2372; GFX1032-NEXT: .LBB47_3: ; %endif 2373; GFX1032-NEXT: v_mov_b32_e32 v2, 0 2374; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 2375; GFX1032-NEXT: s_endpgm 2376; 2377; GFX1064-LABEL: test_vccnz_ifcvt_triangle64: 2378; GFX1064: ; %bb.0: ; %entry 2379; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2380; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2381; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2382; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2383; GFX1064-NEXT: v_cmp_neq_f64_e64 s[4:5], s[2:3], 1.0 2384; GFX1064-NEXT: s_and_b64 vcc, exec, s[4:5] 2385; GFX1064-NEXT: s_cbranch_vccnz .LBB47_2 2386; GFX1064-NEXT: ; %bb.1: ; %if 2387; GFX1064-NEXT: v_add_f64 v[0:1], s[2:3], s[2:3] 2388; GFX1064-NEXT: s_branch .LBB47_3 2389; GFX1064-NEXT: .LBB47_2: 2390; GFX1064-NEXT: v_mov_b32_e32 v0, s2 2391; GFX1064-NEXT: v_mov_b32_e32 v1, s3 2392; GFX1064-NEXT: .LBB47_3: ; %endif 2393; GFX1064-NEXT: v_mov_b32_e32 v2, 0 2394; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 2395; GFX1064-NEXT: s_endpgm 2396entry: 2397 %v = load double, ptr addrspace(1) %in 2398 %cc = fcmp oeq double %v, 1.000000e+00 2399 br i1 %cc, label %if, label %endif 2400 
2401if: 2402 %u = fadd double %v, %v 2403 br label %endif 2404 2405endif: 2406 %r = phi double [ %v, %entry ], [ %u, %if ] 2407 store double %r, ptr addrspace(1) %out 2408 ret void 2409} 2410 2411define amdgpu_gs float @test_vgprblocks_w32_attr(float %a, float %b, float %c, float %d, float %e, 2412; GCN-LABEL: test_vgprblocks_w32_attr: 2413; GCN: ; %bb.0: ; %main_body 2414; GCN-NEXT: v_add_f32_e32 v0, v0, v1 2415; GCN-NEXT: v_add_f32_e32 v0, v0, v2 2416; GCN-NEXT: v_add_f32_e32 v0, v0, v3 2417; GCN-NEXT: v_add_f32_e32 v0, v0, v4 2418; GCN-NEXT: v_add_f32_e32 v0, v0, v5 2419; GCN-NEXT: v_add_f32_e32 v0, v0, v6 2420; GCN-NEXT: v_add_f32_e32 v0, v0, v7 2421; GCN-NEXT: v_add_f32_e32 v0, v0, v8 2422; GCN-NEXT: v_add_f32_e32 v0, v0, v9 2423; GCN-NEXT: v_add_f32_e32 v0, v0, v10 2424; GCN-NEXT: v_add_f32_e32 v0, v0, v11 2425; GCN-NEXT: ; return to shader part epilog 2426 float %f, float %g, float %h, float %i, float %j, float %k, float %l) #3 { 2427main_body: 2428 %s = fadd float %a, %b 2429 %s.1 = fadd float %s, %c 2430 %s.2 = fadd float %s.1, %d 2431 %s.3 = fadd float %s.2, %e 2432 %s.4 = fadd float %s.3, %f 2433 %s.5 = fadd float %s.4, %g 2434 %s.6 = fadd float %s.5, %h 2435 %s.7 = fadd float %s.6, %i 2436 %s.8 = fadd float %s.7, %j 2437 %s.9 = fadd float %s.8, %k 2438 %s.10 = fadd float %s.9, %l 2439 ret float %s.10 2440} 2441 2442define amdgpu_gs float @test_vgprblocks_w64_attr(float %a, float %b, float %c, float %d, float %e, 2443; GCN-LABEL: test_vgprblocks_w64_attr: 2444; GCN: ; %bb.0: ; %main_body 2445; GCN-NEXT: v_add_f32_e32 v0, v0, v1 2446; GCN-NEXT: v_add_f32_e32 v0, v0, v2 2447; GCN-NEXT: v_add_f32_e32 v0, v0, v3 2448; GCN-NEXT: v_add_f32_e32 v0, v0, v4 2449; GCN-NEXT: v_add_f32_e32 v0, v0, v5 2450; GCN-NEXT: v_add_f32_e32 v0, v0, v6 2451; GCN-NEXT: v_add_f32_e32 v0, v0, v7 2452; GCN-NEXT: v_add_f32_e32 v0, v0, v8 2453; GCN-NEXT: v_add_f32_e32 v0, v0, v9 2454; GCN-NEXT: v_add_f32_e32 v0, v0, v10 2455; GCN-NEXT: v_add_f32_e32 v0, v0, v11 2456; GCN-NEXT: ; 
return to shader part epilog 2457 float %f, float %g, float %h, float %i, float %j, float %k, float %l) #4 { 2458main_body: 2459 %s = fadd float %a, %b 2460 %s.1 = fadd float %s, %c 2461 %s.2 = fadd float %s.1, %d 2462 %s.3 = fadd float %s.2, %e 2463 %s.4 = fadd float %s.3, %f 2464 %s.5 = fadd float %s.4, %g 2465 %s.6 = fadd float %s.5, %h 2466 %s.7 = fadd float %s.6, %i 2467 %s.8 = fadd float %s.7, %j 2468 %s.9 = fadd float %s.8, %k 2469 %s.10 = fadd float %s.9, %l 2470 ret float %s.10 2471} 2472 2473define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { 2474; GFX1032-LABEL: icmp64: 2475; GFX1032: ; %bb.0: ; %entry 2476; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x28 2477; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2478; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 2479; GFX1032-NEXT: s_sub_i32 s1, 0, s0 2480; GFX1032-NEXT: v_rcp_iflag_f32_e32 v1, v1 2481; GFX1032-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 2482; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1 2483; GFX1032-NEXT: v_mul_lo_u32 v2, s1, v1 2484; GFX1032-NEXT: s_brev_b32 s1, 1 2485; GFX1032-NEXT: v_mul_hi_u32 v2, v1, v2 2486; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 2487; GFX1032-NEXT: v_mul_hi_u32 v1, v0, v1 2488; GFX1032-NEXT: v_mul_lo_u32 v1, v1, s0 2489; GFX1032-NEXT: v_sub_nc_u32_e32 v0, v0, v1 2490; GFX1032-NEXT: v_subrev_nc_u32_e32 v1, s0, v0 2491; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s0, v0 2492; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo 2493; GFX1032-NEXT: v_subrev_nc_u32_e32 v1, s0, v0 2494; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s0, v0 2495; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo 2496; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2497; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1 2498; GFX1032-NEXT: s_ff1_i32_b64 s0, s[0:1] 2499; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 2500; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 2501; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 2502; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 2503; GFX1032-NEXT: ; %bb.1: ; %if.then 2504; GFX1032-NEXT: ; divergent unreachable 2505; GFX1032-NEXT: ; %bb.2: 
; %UnifiedReturnBlock 2506; GFX1032-NEXT: s_endpgm 2507; 2508; GFX1064-LABEL: icmp64: 2509; GFX1064: ; %bb.0: ; %entry 2510; GFX1064-NEXT: s_load_dword s0, s[4:5], 0x28 2511; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2512; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 2513; GFX1064-NEXT: s_sub_i32 s1, 0, s0 2514; GFX1064-NEXT: v_rcp_iflag_f32_e32 v1, v1 2515; GFX1064-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 2516; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1 2517; GFX1064-NEXT: v_mul_lo_u32 v2, s1, v1 2518; GFX1064-NEXT: v_mul_hi_u32 v2, v1, v2 2519; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 2520; GFX1064-NEXT: v_mul_hi_u32 v1, v0, v1 2521; GFX1064-NEXT: v_mul_lo_u32 v1, v1, s0 2522; GFX1064-NEXT: v_sub_nc_u32_e32 v0, v0, v1 2523; GFX1064-NEXT: v_subrev_nc_u32_e32 v1, s0, v0 2524; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 2525; GFX1064-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 2526; GFX1064-NEXT: v_subrev_nc_u32_e32 v1, s0, v0 2527; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 2528; GFX1064-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 2529; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2530; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1 2531; GFX1064-NEXT: s_bitset1_b32 s1, 31 2532; GFX1064-NEXT: s_ff1_i32_b64 s0, s[0:1] 2533; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 2534; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 2535; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 2536; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] 2537; GFX1064-NEXT: ; %bb.1: ; %if.then 2538; GFX1064-NEXT: ; divergent unreachable 2539; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock 2540; GFX1064-NEXT: s_endpgm 2541entry: 2542 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 2543 %mul4 = mul nsw i32 %s, %n 2544 %cmp = icmp slt i32 0, %mul4 2545 br label %if.end 2546 2547if.end: ; preds = %entry 2548 %rem = urem i32 %id, %s 2549 %icmp = tail call i64 @llvm.amdgcn.icmp.i64.i32(i32 %rem, i32 0, i32 32) 2550 %shr = lshr i64 %icmp, 1 2551 %notmask = shl nsw i64 -1, 0 2552 %and = and i64 %notmask, %shr 2553 %or = or i64 %and, -9223372036854775808 2554 
%cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true) 2555 %cast = trunc i64 %cttz to i32 2556 %cmp3 = icmp ugt i32 10, %cast 2557 %cmp6 = icmp ne i32 %rem, 0 2558 %brmerge = or i1 %cmp6, %cmp3 2559 br i1 %brmerge, label %if.end2, label %if.then 2560 2561if.then: ; preds = %if.end 2562 unreachable 2563 2564if.end2: ; preds = %if.end 2565 ret void 2566} 2567 2568define amdgpu_kernel void @fcmp64(float %n, float %s) { 2569; GFX1032-LABEL: fcmp64: 2570; GFX1032: ; %bb.0: ; %entry 2571; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x28 2572; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 2573; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2574; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 2575; GFX1032-NEXT: v_div_scale_f32 v4, vcc_lo, v0, s0, v0 2576; GFX1032-NEXT: s_brev_b32 s1, 1 2577; GFX1032-NEXT: v_rcp_f32_e32 v2, v1 2578; GFX1032-NEXT: v_fma_f32 v3, -v1, v2, 1.0 2579; GFX1032-NEXT: v_fmac_f32_e32 v2, v3, v2 2580; GFX1032-NEXT: v_mul_f32_e32 v3, v4, v2 2581; GFX1032-NEXT: v_fma_f32 v5, -v1, v3, v4 2582; GFX1032-NEXT: v_fmac_f32_e32 v3, v5, v2 2583; GFX1032-NEXT: v_fma_f32 v1, -v1, v3, v4 2584; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v3 2585; GFX1032-NEXT: v_div_fixup_f32 v1, v1, s0, v0 2586; GFX1032-NEXT: v_trunc_f32_e32 v1, v1 2587; GFX1032-NEXT: v_fma_f32 v0, -v1, s0, v0 2588; GFX1032-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 2589; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1 2590; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0 2591; GFX1032-NEXT: s_ff1_i32_b64 s0, s[0:1] 2592; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 2593; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 2594; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 2595; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 2596; GFX1032-NEXT: ; %bb.1: ; %if.then 2597; GFX1032-NEXT: ; divergent unreachable 2598; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock 2599; GFX1032-NEXT: s_endpgm 2600; 2601; GFX1064-LABEL: fcmp64: 2602; GFX1064: ; %bb.0: ; %entry 2603; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x28 2604; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 2605; GFX1064-NEXT: 
s_waitcnt lgkmcnt(0) 2606; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 2607; GFX1064-NEXT: v_rcp_f32_e32 v2, v1 2608; GFX1064-NEXT: v_fma_f32 v3, -v1, v2, 1.0 2609; GFX1064-NEXT: v_fmac_f32_e32 v2, v3, v2 2610; GFX1064-NEXT: v_div_scale_f32 v3, vcc, v0, s2, v0 2611; GFX1064-NEXT: v_mul_f32_e32 v4, v3, v2 2612; GFX1064-NEXT: v_fma_f32 v5, -v1, v4, v3 2613; GFX1064-NEXT: v_fmac_f32_e32 v4, v5, v2 2614; GFX1064-NEXT: v_fma_f32 v1, -v1, v4, v3 2615; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v4 2616; GFX1064-NEXT: v_div_fixup_f32 v1, v1, s2, v0 2617; GFX1064-NEXT: v_trunc_f32_e32 v1, v1 2618; GFX1064-NEXT: v_fma_f32 v0, -v1, s2, v0 2619; GFX1064-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 2620; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1 2621; GFX1064-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0 2622; GFX1064-NEXT: s_bitset1_b32 s1, 31 2623; GFX1064-NEXT: s_ff1_i32_b64 s0, s[0:1] 2624; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 2625; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 2626; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 2627; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] 2628; GFX1064-NEXT: ; %bb.1: ; %if.then 2629; GFX1064-NEXT: ; divergent unreachable 2630; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock 2631; GFX1064-NEXT: s_endpgm 2632entry: 2633 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 2634 %id.f = uitofp i32 %id to float 2635 %mul4 = fmul float %s, %n 2636 %cmp = fcmp ult float 0.0, %mul4 2637 br label %if.end 2638 2639if.end: ; preds = %entry 2640 %rem.f = frem float %id.f, %s 2641 %fcmp = tail call i64 @llvm.amdgcn.fcmp.i64.f32(float %rem.f, float 0.0, i32 1) 2642 %shr = lshr i64 %fcmp, 1 2643 %notmask = shl nsw i64 -1, 0 2644 %and = and i64 %notmask, %shr 2645 %or = or i64 %and, -9223372036854775808 2646 %cttz = tail call i64 @llvm.cttz.i64(i64 %or, i1 true) 2647 %cast = trunc i64 %cttz to i32 2648 %cmp3 = icmp ugt i32 10, %cast 2649 %cmp6 = fcmp one float %rem.f, 0.0 2650 %brmerge = or i1 %cmp6, %cmp3 2651 br i1 %brmerge, label %if.end2, label %if.then 2652 2653if.then: ; 
preds = %if.end 2654 unreachable 2655 2656if.end2: ; preds = %if.end 2657 ret void 2658} 2659 2660define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { 2661; GFX1032-LABEL: icmp32: 2662; GFX1032: ; %bb.0: ; %entry 2663; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x28 2664; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2665; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 2666; GFX1032-NEXT: s_sub_i32 s1, 0, s0 2667; GFX1032-NEXT: v_rcp_iflag_f32_e32 v1, v1 2668; GFX1032-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 2669; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1 2670; GFX1032-NEXT: v_mul_lo_u32 v2, s1, v1 2671; GFX1032-NEXT: v_mul_hi_u32 v2, v1, v2 2672; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 2673; GFX1032-NEXT: v_mul_hi_u32 v1, v0, v1 2674; GFX1032-NEXT: v_mul_lo_u32 v1, v1, s0 2675; GFX1032-NEXT: v_sub_nc_u32_e32 v0, v0, v1 2676; GFX1032-NEXT: v_subrev_nc_u32_e32 v1, s0, v0 2677; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s0, v0 2678; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo 2679; GFX1032-NEXT: v_subrev_nc_u32_e32 v1, s0, v0 2680; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s0, v0 2681; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo 2682; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 2683; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1 2684; GFX1032-NEXT: s_bitset1_b32 s0, 31 2685; GFX1032-NEXT: s_ff1_i32_b32 s0, s0 2686; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 2687; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 2688; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 2689; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 2690; GFX1032-NEXT: ; %bb.1: ; %if.then 2691; GFX1032-NEXT: ; divergent unreachable 2692; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock 2693; GFX1032-NEXT: s_endpgm 2694; 2695; GFX1064-LABEL: icmp32: 2696; GFX1064: ; %bb.0: ; %entry 2697; GFX1064-NEXT: s_load_dword s0, s[4:5], 0x28 2698; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2699; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 2700; GFX1064-NEXT: s_sub_i32 s1, 0, s0 2701; GFX1064-NEXT: v_rcp_iflag_f32_e32 v1, v1 2702; GFX1064-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 2703; 
GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1 2704; GFX1064-NEXT: v_mul_lo_u32 v2, s1, v1 2705; GFX1064-NEXT: v_mul_hi_u32 v2, v1, v2 2706; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 2707; GFX1064-NEXT: v_mul_hi_u32 v1, v0, v1 2708; GFX1064-NEXT: v_mul_lo_u32 v1, v1, s0 2709; GFX1064-NEXT: v_sub_nc_u32_e32 v0, v0, v1 2710; GFX1064-NEXT: v_subrev_nc_u32_e32 v1, s0, v0 2711; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 2712; GFX1064-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 2713; GFX1064-NEXT: v_subrev_nc_u32_e32 v1, s0, v0 2714; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 2715; GFX1064-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 2716; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 2717; GFX1064-NEXT: s_lshr_b32 s0, vcc_lo, 1 2718; GFX1064-NEXT: s_bitset1_b32 s0, 31 2719; GFX1064-NEXT: s_ff1_i32_b32 s0, s0 2720; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 2721; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 2722; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 2723; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] 2724; GFX1064-NEXT: ; %bb.1: ; %if.then 2725; GFX1064-NEXT: ; divergent unreachable 2726; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock 2727; GFX1064-NEXT: s_endpgm 2728entry: 2729 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 2730 %mul4 = mul nsw i32 %s, %n 2731 %cmp = icmp slt i32 0, %mul4 2732 br label %if.end 2733 2734if.end: ; preds = %entry 2735 %rem = urem i32 %id, %s 2736 %icmp = tail call i32 @llvm.amdgcn.icmp.i32.i32(i32 %rem, i32 0, i32 32) 2737 %shr = lshr i32 %icmp, 1 2738 %notmask = shl nsw i32 -1, 0 2739 %and = and i32 %notmask, %shr 2740 %or = or i32 %and, 2147483648 2741 %cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true) 2742 %cmp3 = icmp ugt i32 10, %cttz 2743 %cmp6 = icmp ne i32 %rem, 0 2744 %brmerge = or i1 %cmp6, %cmp3 2745 br i1 %brmerge, label %if.end2, label %if.then 2746 2747if.then: ; preds = %if.end 2748 unreachable 2749 2750if.end2: ; preds = %if.end 2751 ret void 2752} 2753 2754define amdgpu_kernel void @fcmp32(float %n, float %s) { 2755; GFX1032-LABEL: fcmp32: 
2756; GFX1032: ; %bb.0: ; %entry 2757; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x28 2758; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 2759; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2760; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 2761; GFX1032-NEXT: v_rcp_f32_e32 v2, v1 2762; GFX1032-NEXT: v_fma_f32 v3, -v1, v2, 1.0 2763; GFX1032-NEXT: v_fmac_f32_e32 v2, v3, v2 2764; GFX1032-NEXT: v_div_scale_f32 v3, vcc_lo, v0, s0, v0 2765; GFX1032-NEXT: v_mul_f32_e32 v4, v3, v2 2766; GFX1032-NEXT: v_fma_f32 v5, -v1, v4, v3 2767; GFX1032-NEXT: v_fmac_f32_e32 v4, v5, v2 2768; GFX1032-NEXT: v_fma_f32 v1, -v1, v4, v3 2769; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v4 2770; GFX1032-NEXT: v_div_fixup_f32 v1, v1, s0, v0 2771; GFX1032-NEXT: v_trunc_f32_e32 v1, v1 2772; GFX1032-NEXT: v_fma_f32 v0, -v1, s0, v0 2773; GFX1032-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 2774; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1 2775; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0 2776; GFX1032-NEXT: s_bitset1_b32 s0, 31 2777; GFX1032-NEXT: s_ff1_i32_b32 s0, s0 2778; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 2779; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 2780; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 2781; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 2782; GFX1032-NEXT: ; %bb.1: ; %if.then 2783; GFX1032-NEXT: ; divergent unreachable 2784; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock 2785; GFX1032-NEXT: s_endpgm 2786; 2787; GFX1064-LABEL: fcmp32: 2788; GFX1064: ; %bb.0: ; %entry 2789; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x28 2790; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 2791; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2792; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 2793; GFX1064-NEXT: v_rcp_f32_e32 v2, v1 2794; GFX1064-NEXT: v_fma_f32 v3, -v1, v2, 1.0 2795; GFX1064-NEXT: v_fmac_f32_e32 v2, v3, v2 2796; GFX1064-NEXT: v_div_scale_f32 v3, vcc, v0, s2, v0 2797; GFX1064-NEXT: v_mul_f32_e32 v4, v3, v2 2798; GFX1064-NEXT: v_fma_f32 v5, -v1, v4, v3 2799; GFX1064-NEXT: v_fmac_f32_e32 v4, v5, v2 2800; GFX1064-NEXT: v_fma_f32 v1, -v1, v4, v3 2801; 
GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v4 2802; GFX1064-NEXT: v_div_fixup_f32 v1, v1, s2, v0 2803; GFX1064-NEXT: v_trunc_f32_e32 v1, v1 2804; GFX1064-NEXT: v_fma_f32 v0, -v1, s2, v0 2805; GFX1064-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 2806; GFX1064-NEXT: s_lshr_b32 s0, vcc_lo, 1 2807; GFX1064-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0 2808; GFX1064-NEXT: s_bitset1_b32 s0, 31 2809; GFX1064-NEXT: s_ff1_i32_b32 s0, s0 2810; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 2811; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 2812; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 2813; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] 2814; GFX1064-NEXT: ; %bb.1: ; %if.then 2815; GFX1064-NEXT: ; divergent unreachable 2816; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock 2817; GFX1064-NEXT: s_endpgm 2818entry: 2819 %id = tail call i32 @llvm.amdgcn.workitem.id.x() 2820 %id.f = uitofp i32 %id to float 2821 %mul4 = fmul float %s, %n 2822 %cmp = fcmp ult float 0.0, %mul4 2823 br label %if.end 2824 2825if.end: ; preds = %entry 2826 %rem.f = frem float %id.f, %s 2827 %fcmp = tail call i32 @llvm.amdgcn.fcmp.i32.f32(float %rem.f, float 0.0, i32 1) 2828 %shr = lshr i32 %fcmp, 1 2829 %notmask = shl nsw i32 -1, 0 2830 %and = and i32 %notmask, %shr 2831 %or = or i32 %and, 2147483648 2832 %cttz = tail call i32 @llvm.cttz.i32(i32 %or, i1 true) 2833 %cmp3 = icmp ugt i32 10, %cttz 2834 %cmp6 = fcmp one float %rem.f, 0.0 2835 %brmerge = or i1 %cmp6, %cmp3 2836 br i1 %brmerge, label %if.end2, label %if.then 2837 2838if.then: ; preds = %if.end 2839 unreachable 2840 2841if.end2: ; preds = %if.end 2842 ret void 2843} 2844 2845declare void @external_void_func_void() #1 2846 2847define void @callee_no_stack_with_call() #1 { 2848; GFX1032-LABEL: callee_no_stack_with_call: 2849; GFX1032: ; %bb.0: 2850; GFX1032-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2851; GFX1032-NEXT: s_mov_b32 s16, s33 2852; GFX1032-NEXT: s_mov_b32 s33, s32 2853; GFX1032-NEXT: s_or_saveexec_b32 s17, -1 2854; GFX1032-NEXT: buffer_store_dword v40, off, s[0:3], s33 
; 4-byte Folded Spill 2855; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2856; GFX1032-NEXT: s_mov_b32 exec_lo, s17 2857; GFX1032-NEXT: s_addk_i32 s32, 0x200 2858; GFX1032-NEXT: v_writelane_b32 v40, s16, 2 2859; GFX1032-NEXT: s_getpc_b64 s[16:17] 2860; GFX1032-NEXT: s_add_u32 s16, s16, external_void_func_void@gotpcrel32@lo+4 2861; GFX1032-NEXT: s_addc_u32 s17, s17, external_void_func_void@gotpcrel32@hi+12 2862; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 2863; GFX1032-NEXT: v_writelane_b32 v40, s30, 0 2864; GFX1032-NEXT: v_writelane_b32 v40, s31, 1 2865; GFX1032-NEXT: s_waitcnt lgkmcnt(0) 2866; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] 2867; GFX1032-NEXT: v_readlane_b32 s31, v40, 1 2868; GFX1032-NEXT: v_readlane_b32 s30, v40, 0 2869; GFX1032-NEXT: s_mov_b32 s32, s33 2870; GFX1032-NEXT: v_readlane_b32 s4, v40, 2 2871; GFX1032-NEXT: s_or_saveexec_b32 s5, -1 2872; GFX1032-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload 2873; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 2874; GFX1032-NEXT: s_mov_b32 exec_lo, s5 2875; GFX1032-NEXT: s_mov_b32 s33, s4 2876; GFX1032-NEXT: s_waitcnt vmcnt(0) 2877; GFX1032-NEXT: s_setpc_b64 s[30:31] 2878; 2879; GFX1064-LABEL: callee_no_stack_with_call: 2880; GFX1064: ; %bb.0: 2881; GFX1064-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2882; GFX1064-NEXT: s_mov_b32 s16, s33 2883; GFX1064-NEXT: s_mov_b32 s33, s32 2884; GFX1064-NEXT: s_or_saveexec_b64 s[18:19], -1 2885; GFX1064-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill 2886; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2887; GFX1064-NEXT: s_mov_b64 exec, s[18:19] 2888; GFX1064-NEXT: s_addk_i32 s32, 0x400 2889; GFX1064-NEXT: v_writelane_b32 v40, s16, 2 2890; GFX1064-NEXT: s_getpc_b64 s[16:17] 2891; GFX1064-NEXT: s_add_u32 s16, s16, external_void_func_void@gotpcrel32@lo+4 2892; GFX1064-NEXT: s_addc_u32 s17, s17, external_void_func_void@gotpcrel32@hi+12 2893; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 2894; GFX1064-NEXT: v_writelane_b32 v40, s30, 0 
2895; GFX1064-NEXT: v_writelane_b32 v40, s31, 1 2896; GFX1064-NEXT: s_waitcnt lgkmcnt(0) 2897; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] 2898; GFX1064-NEXT: v_readlane_b32 s31, v40, 1 2899; GFX1064-NEXT: v_readlane_b32 s30, v40, 0 2900; GFX1064-NEXT: s_mov_b32 s32, s33 2901; GFX1064-NEXT: v_readlane_b32 s4, v40, 2 2902; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 2903; GFX1064-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload 2904; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 2905; GFX1064-NEXT: s_mov_b64 exec, s[6:7] 2906; GFX1064-NEXT: s_mov_b32 s33, s4 2907; GFX1064-NEXT: s_waitcnt vmcnt(0) 2908; GFX1064-NEXT: s_setpc_b64 s[30:31] 2909 call void @external_void_func_void() 2910 ret void 2911} 2912 2913 2914declare i32 @llvm.amdgcn.workitem.id.x() 2915declare float @llvm.fabs.f32(float) 2916declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) 2917declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) 2918declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) 2919declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) 2920declare i1 @llvm.amdgcn.class.f32(float, i32) 2921declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) 2922declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) 2923declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) 2924declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) 2925declare float @llvm.amdgcn.strict.wwm.f32(float) 2926declare i32 @llvm.amdgcn.strict.wwm.i32(i32) 2927declare i64 @llvm.amdgcn.strict.wwm.i64(i64) 2928declare float @llvm.amdgcn.wwm.f32(float) 2929declare i32 @llvm.amdgcn.wqm.i32(i32) 2930declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) 2931declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) 2932declare float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32 immarg) 2933declare i32 
@llvm.amdgcn.mbcnt.lo(i32, i32) 2934declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) 2935declare i64 @llvm.amdgcn.fcmp.i64.f32(float, float, i32) 2936declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32) 2937declare i32 @llvm.amdgcn.fcmp.i32.f32(float, float, i32) 2938declare i32 @llvm.amdgcn.icmp.i32.i32(i32, i32, i32) 2939declare void @llvm.amdgcn.kill(i1) 2940declare i1 @llvm.amdgcn.wqm.vote(i1) 2941declare i1 @llvm.amdgcn.ps.live() 2942declare i64 @llvm.cttz.i64(i64, i1) 2943declare i32 @llvm.cttz.i32(i32, i1) 2944declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #5 2945 2946attributes #0 = { nounwind readnone speculatable } 2947attributes #1 = { nounwind } 2948attributes #2 = { nounwind readnone optnone noinline } 2949attributes #3 = { "target-features"="+wavefrontsize32" } 2950attributes #4 = { "target-features"="+wavefrontsize64" } 2951attributes #5 = { inaccessiblememonly nounwind } 2952;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: 2953; GFX10DEFWAVE: {{.*}} 2954