1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 2; RUN: llc -O0 -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-O0 %s 3; RUN: llc -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-O3 %s 4 5; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead. 6 7define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) { 8; GFX9-O0-LABEL: no_cfg: 9; GFX9-O0: ; %bb.0: 10; GFX9-O0-NEXT: s_mov_b32 s6, s2 11; GFX9-O0-NEXT: s_mov_b32 s4, s0 12; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7 13; GFX9-O0-NEXT: s_mov_b32 s7, s3 14; GFX9-O0-NEXT: s_mov_b32 s8, s7 15; GFX9-O0-NEXT: s_mov_b32 s9, s6 16; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 17; GFX9-O0-NEXT: s_mov_b32 s5, s1 18; GFX9-O0-NEXT: s_mov_b32 s10, s5 19; GFX9-O0-NEXT: s_mov_b32 s0, s4 20; GFX9-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 21; GFX9-O0-NEXT: s_mov_b32 s1, s10 22; GFX9-O0-NEXT: s_mov_b32 s2, s9 23; GFX9-O0-NEXT: s_mov_b32 s3, s8 24; GFX9-O0-NEXT: s_mov_b32 s4, 0 25; GFX9-O0-NEXT: buffer_load_dwordx2 v[5:6], off, s[0:3], s4 26; GFX9-O0-NEXT: s_waitcnt vmcnt(0) 27; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5 28; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 29; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 30; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 31; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 32; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[6:7] 33; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] 34; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 35; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 36; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4 37; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[6:7] 38; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4 39; GFX9-O0-NEXT: s_nop 1 40; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf 41; GFX9-O0-NEXT: v_add_u32_e64 v0, v0, v2 42; GFX9-O0-NEXT: 
s_mov_b64 exec, s[6:7] 43; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 44; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 45; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 46; GFX9-O0-NEXT: s_nop 1 47; GFX9-O0-NEXT: v_mov_b32_dpp v0, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 48; GFX9-O0-NEXT: v_add_u32_e64 v0, v1, v0 49; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] 50; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 51; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v3, v4 52; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[6:7] 53; GFX9-O0-NEXT: s_mov_b32 s5, 1 54; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s5, v3 55; GFX9-O0-NEXT: s_mov_b32 s5, 2 56; GFX9-O0-NEXT: v_and_b32_e64 v3, v3, s5 57; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s4 offset:4 58; GFX9-O0-NEXT: s_endpgm 59; 60; GFX9-O3-LABEL: no_cfg: 61; GFX9-O3: ; %bb.0: 62; GFX9-O3-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 63; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 64; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0 65; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 66; GFX9-O3-NEXT: s_waitcnt vmcnt(0) 67; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[4:5] 68; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[4:5] 69; GFX9-O3-NEXT: s_nop 0 70; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 71; GFX9-O3-NEXT: v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf 72; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 73; GFX9-O3-NEXT: v_add_u32_e32 v0, v3, v0 74; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] 75; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 76; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 77; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 78; GFX9-O3-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc 79; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 1, v4 80; GFX9-O3-NEXT: v_and_b32_e32 v4, 2, v4 81; GFX9-O3-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:4 82; GFX9-O3-NEXT: s_endpgm 83 %tmp100 = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %tmp14, i32 0, i32 0, i32 0) 84 %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32> 85 %tmp102 = extractelement <2 x i32> %tmp101, i32 0 
86 %tmp103 = extractelement <2 x i32> %tmp101, i32 1 87 %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0) 88 %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0) 89 90 91 %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false) 92 %tmp121 = add i32 %tmp105, %tmp120 93 %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121) 94 95 %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false) 96 %tmp136 = add i32 %tmp107, %tmp135 97 %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) 98 99 %tmp138 = icmp eq i32 %tmp122, %tmp137 100 %tmp139 = sext i1 %tmp138 to i32 101 %tmp140 = shl nsw i32 %tmp139, 1 102 %tmp141 = and i32 %tmp140, 2 103 %tmp145 = bitcast i32 %tmp141 to float 104 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %tmp145, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) 105 ret void 106} 107 108define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) { 109; GFX9-O0-LABEL: cfg: 110; GFX9-O0: ; %bb.0: ; %entry 111; GFX9-O0-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0 112; GFX9-O0-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1 113; GFX9-O0-NEXT: s_mov_b32 s18, -1 114; GFX9-O0-NEXT: s_mov_b32 s19, 0xe00000 115; GFX9-O0-NEXT: s_add_u32 s16, s16, s4 116; GFX9-O0-NEXT: s_addc_u32 s17, s17, 0 117; GFX9-O0-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane 118; GFX9-O0-NEXT: v_writelane_b32 v5, s3, 0 119; GFX9-O0-NEXT: s_mov_b32 s4, s1 120; GFX9-O0-NEXT: v_readlane_b32 s1, v5, 0 121; GFX9-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 122; GFX9-O0-NEXT: s_mov_b32 s3, s1 123; GFX9-O0-NEXT: s_mov_b32 s8, s3 124; GFX9-O0-NEXT: s_mov_b32 s9, s2 125; GFX9-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 126; GFX9-O0-NEXT: s_mov_b32 s1, s4 127; GFX9-O0-NEXT: s_mov_b32 s10, s1 128; GFX9-O0-NEXT: s_mov_b32 s4, s0 129; GFX9-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 130; 
GFX9-O0-NEXT: s_mov_b32 s5, s10 131; GFX9-O0-NEXT: s_mov_b32 s6, s9 132; GFX9-O0-NEXT: s_mov_b32 s7, s8 133; GFX9-O0-NEXT: v_writelane_b32 v5, s2, 1 134; GFX9-O0-NEXT: v_writelane_b32 v5, s3, 2 135; GFX9-O0-NEXT: v_writelane_b32 v5, s0, 3 136; GFX9-O0-NEXT: v_writelane_b32 v5, s1, 4 137; GFX9-O0-NEXT: s_mov_b32 s0, 0 138; GFX9-O0-NEXT: s_nop 2 139; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], s0 140; GFX9-O0-NEXT: s_waitcnt vmcnt(0) 141; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill 142; GFX9-O0-NEXT: s_nop 0 143; GFX9-O0-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill 144; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 145; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3 146; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 147; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0 148; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3] 149; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0 150; GFX9-O0-NEXT: s_nop 1 151; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf 152; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 153; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] 154; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 155; GFX9-O0-NEXT: buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill 156; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0 157; GFX9-O0-NEXT: v_mov_b32_e32 v0, s0 158; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill 159; GFX9-O0-NEXT: s_mov_b64 s[0:1], exec 160; GFX9-O0-NEXT: v_writelane_b32 v5, s0, 5 161; GFX9-O0-NEXT: v_writelane_b32 v5, s1, 6 162; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 163; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 ; 4-byte Folded Spill 164; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] 165; GFX9-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] 166; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] 167; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 168; GFX9-O0-NEXT: ; %bb.1: ; %if 169; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte 
Folded Reload 170; GFX9-O0-NEXT: buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload 171; GFX9-O0-NEXT: s_waitcnt vmcnt(0) 172; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 173; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 174; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 175; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] 176; GFX9-O0-NEXT: ; implicit-def: $sgpr0_sgpr1 177; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1 178; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1] 179; GFX9-O0-NEXT: s_nop 1 180; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 181; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 182; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1] 183; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 184; GFX9-O0-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill 185; GFX9-O0-NEXT: .LBB1_2: ; %merge 186; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 187; GFX9-O0-NEXT: buffer_load_dword v5, off, s[16:19], 0 ; 4-byte Folded Reload 188; GFX9-O0-NEXT: s_mov_b64 exec, s[12:13] 189; GFX9-O0-NEXT: s_waitcnt vmcnt(0) 190; GFX9-O0-NEXT: v_readlane_b32 s4, v5, 5 191; GFX9-O0-NEXT: v_readlane_b32 s5, v5, 6 192; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] 193; GFX9-O0-NEXT: v_readlane_b32 s2, v5, 1 194; GFX9-O0-NEXT: v_readlane_b32 s3, v5, 2 195; GFX9-O0-NEXT: v_readlane_b32 s0, v5, 3 196; GFX9-O0-NEXT: v_readlane_b32 s1, v5, 4 197; GFX9-O0-NEXT: buffer_load_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload 198; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload 199; GFX9-O0-NEXT: s_waitcnt vmcnt(0) 200; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v3 201; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] 202; GFX9-O0-NEXT: s_mov_b32 s4, 1 203; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0 204; GFX9-O0-NEXT: s_mov_b32 s4, 2 205; GFX9-O0-NEXT: v_and_b32_e64 v0, v0, s4 206; GFX9-O0-NEXT: s_mov_b32 s6, s1 207; GFX9-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 208; GFX9-O0-NEXT: s_mov_b32 s4, s3 209; GFX9-O0-NEXT: 
s_mov_b32 s5, s2 210; GFX9-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 211; GFX9-O0-NEXT: s_mov_b32 s1, s6 212; GFX9-O0-NEXT: s_mov_b32 s2, s5 213; GFX9-O0-NEXT: s_mov_b32 s3, s4 214; GFX9-O0-NEXT: s_mov_b32 s4, 0 215; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s4 offset:4 216; GFX9-O0-NEXT: s_endpgm 217; 218; GFX9-O3-LABEL: cfg: 219; GFX9-O3: ; %bb.0: ; %entry 220; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[0:3], 0 221; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0 222; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1 223; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 224; GFX9-O3-NEXT: s_waitcnt vmcnt(0) 225; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[4:5] 226; GFX9-O3-NEXT: s_nop 1 227; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 228; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 229; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5] 230; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 231; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 232; GFX9-O3-NEXT: s_and_saveexec_b64 s[4:5], vcc 233; GFX9-O3-NEXT: ; %bb.1: ; %if 234; GFX9-O3-NEXT: s_or_saveexec_b64 s[6:7], -1 235; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0 236; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[6:7] 237; GFX9-O3-NEXT: s_nop 1 238; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf 239; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1 240; GFX9-O3-NEXT: s_mov_b64 exec, s[6:7] 241; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1 242; GFX9-O3-NEXT: ; %bb.2: ; %merge 243; GFX9-O3-NEXT: s_or_b64 exec, exec, s[4:5] 244; GFX9-O3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 245; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 246; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0 247; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0 248; GFX9-O3-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 249; GFX9-O3-NEXT: s_endpgm 250entry: 251 %tmp100 = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %tmp14, i32 0, i32 0, i32 0) 252 %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32> 253 %tmp102 = extractelement <2 x 
i32> %tmp101, i32 0 254 %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0) 255 256 %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false) 257 %tmp121 = add i32 %tmp105, %tmp120 258 %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121) 259 260 %cond = icmp eq i32 %arg, 0 261 br i1 %cond, label %if, label %merge 262if: 263 %tmp103 = extractelement <2 x i32> %tmp101, i32 1 264 %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0) 265 266 %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false) 267 %tmp136 = add i32 %tmp107, %tmp135 268 %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) 269 br label %merge 270 271merge: 272 %merge_value = phi i32 [ 0, %entry ], [%tmp137, %if ] 273 %tmp138 = icmp eq i32 %tmp122, %merge_value 274 %tmp139 = sext i1 %tmp138 to i32 275 %tmp140 = shl nsw i32 %tmp139, 1 276 %tmp141 = and i32 %tmp140, 2 277 %tmp145 = bitcast i32 %tmp141 to float 278 call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %tmp145, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) 279 ret void 280} 281 282define hidden i32 @called(i32 %a) noinline { 283; GFX9-O0-LABEL: called: 284; GFX9-O0: ; %bb.0: 285; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 286; GFX9-O0-NEXT: v_add_u32_e64 v1, v0, v0 287; GFX9-O0-NEXT: v_mul_lo_u32 v0, v1, v0 288; GFX9-O0-NEXT: v_sub_u32_e64 v0, v0, v1 289; GFX9-O0-NEXT: s_setpc_b64 s[30:31] 290; 291; GFX9-O3-LABEL: called: 292; GFX9-O3: ; %bb.0: 293; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 294; GFX9-O3-NEXT: v_add_u32_e32 v1, v0, v0 295; GFX9-O3-NEXT: v_mul_lo_u32 v0, v1, v0 296; GFX9-O3-NEXT: v_sub_u32_e32 v0, v0, v1 297; GFX9-O3-NEXT: s_setpc_b64 s[30:31] 298 %add = add i32 %a, %a 299 %mul = mul i32 %add, %a 300 %sub = sub i32 %mul, %add 301 ret i32 %sub 302} 303 304define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { 305; GFX9-O0-LABEL: 
call: 306; GFX9-O0: ; %bb.0: 307; GFX9-O0-NEXT: s_mov_b32 s32, 0 308; GFX9-O0-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 309; GFX9-O0-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 310; GFX9-O0-NEXT: s_mov_b32 s26, -1 311; GFX9-O0-NEXT: s_mov_b32 s27, 0xe00000 312; GFX9-O0-NEXT: s_add_u32 s24, s24, s11 313; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0 314; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 315; GFX9-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane 316; GFX9-O0-NEXT: v_writelane_b32 v3, s12, 0 317; GFX9-O0-NEXT: v_writelane_b32 v3, s13, 1 318; GFX9-O0-NEXT: s_mov_b32 s14, s10 319; GFX9-O0-NEXT: s_mov_b32 s13, s9 320; GFX9-O0-NEXT: s_mov_b32 s12, s8 321; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[6:7] 322; GFX9-O0-NEXT: v_writelane_b32 v3, s4, 2 323; GFX9-O0-NEXT: v_writelane_b32 v3, s5, 3 324; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[2:3] 325; GFX9-O0-NEXT: v_readlane_b32 s2, v3, 0 326; GFX9-O0-NEXT: v_readlane_b32 s3, v3, 1 327; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] 328; GFX9-O0-NEXT: v_readlane_b32 s0, v3, 2 329; GFX9-O0-NEXT: v_readlane_b32 s1, v3, 3 330; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 331; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 332; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0 333; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] 334; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 335; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c 336; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34 337; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) 338; GFX9-O0-NEXT: s_mov_b32 s3, s9 339; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 340; GFX9-O0-NEXT: s_mov_b32 s9, s17 341; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 342; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 343; GFX9-O0-NEXT: s_mov_b32 s17, s9 344; GFX9-O0-NEXT: s_mov_b32 s18, s8 345; GFX9-O0-NEXT: s_mov_b32 s19, s3 346; GFX9-O0-NEXT: v_writelane_b32 v3, s16, 4 347; GFX9-O0-NEXT: v_writelane_b32 v3, s17, 5 348; GFX9-O0-NEXT: v_writelane_b32 v3, s18, 6 349; GFX9-O0-NEXT: 
v_writelane_b32 v3, s19, 7 350; GFX9-O0-NEXT: s_mov_b32 s8, 0 351; GFX9-O0-NEXT: v_writelane_b32 v3, s8, 8 352; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17 353; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 354; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 355; GFX9-O0-NEXT: v_writelane_b32 v3, s2, 9 356; GFX9-O0-NEXT: v_writelane_b32 v3, s3, 10 357; GFX9-O0-NEXT: v_mov_b32_e32 v7, s8 358; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] 359; GFX9-O0-NEXT: s_mov_b64 s[8:9], 56 360; GFX9-O0-NEXT: s_mov_b32 s2, s0 361; GFX9-O0-NEXT: s_mov_b32 s0, s1 362; GFX9-O0-NEXT: s_mov_b32 s3, s8 363; GFX9-O0-NEXT: s_mov_b32 s1, s9 364; GFX9-O0-NEXT: s_add_u32 s8, s2, s3 365; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1 366; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 367; GFX9-O0-NEXT: s_mov_b32 s9, s0 368; GFX9-O0-NEXT: s_getpc_b64 s[16:17] 369; GFX9-O0-NEXT: s_add_u32 s16, s16, called@rel32@lo+4 370; GFX9-O0-NEXT: s_addc_u32 s17, s17, called@rel32@hi+12 371; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[24:25] 372; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[26:27] 373; GFX9-O0-NEXT: s_mov_b32 s15, 20 374; GFX9-O0-NEXT: v_lshlrev_b32_e64 v4, s15, v4 375; GFX9-O0-NEXT: s_mov_b32 s15, 10 376; GFX9-O0-NEXT: v_lshlrev_b32_e64 v5, s15, v5 377; GFX9-O0-NEXT: v_or3_b32 v4, v6, v5, v4 378; GFX9-O0-NEXT: ; implicit-def: $sgpr15 379; GFX9-O0-NEXT: v_mov_b32_e32 v31, v4 380; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 381; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] 382; GFX9-O0-NEXT: v_readlane_b32 s0, v3, 4 383; GFX9-O0-NEXT: v_readlane_b32 s1, v3, 5 384; GFX9-O0-NEXT: v_readlane_b32 s2, v3, 6 385; GFX9-O0-NEXT: v_readlane_b32 s3, v3, 7 386; GFX9-O0-NEXT: v_readlane_b32 s6, v3, 9 387; GFX9-O0-NEXT: v_readlane_b32 s7, v3, 10 388; GFX9-O0-NEXT: v_readlane_b32 s4, v3, 8 389; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 390; GFX9-O0-NEXT: v_add_u32_e64 v3, v3, v7 391; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7] 392; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 393; GFX9-O0-NEXT: s_nop 0 394; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], 
s4 offset:4 395; GFX9-O0-NEXT: s_endpgm 396; 397; GFX9-O3-LABEL: call: 398; GFX9-O3: ; %bb.0: 399; GFX9-O3-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 400; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 401; GFX9-O3-NEXT: s_mov_b32 s26, -1 402; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 403; GFX9-O3-NEXT: s_add_u32 s24, s24, s11 404; GFX9-O3-NEXT: s_mov_b32 s32, 0 405; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 406; GFX9-O3-NEXT: s_or_saveexec_b64 s[16:17], -1 407; GFX9-O3-NEXT: s_mov_b32 s14, s10 408; GFX9-O3-NEXT: s_mov_b32 s13, s9 409; GFX9-O3-NEXT: s_mov_b32 s12, s8 410; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[6:7] 411; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 412; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 413; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 414; GFX9-O3-NEXT: s_mov_b64 exec, s[16:17] 415; GFX9-O3-NEXT: s_load_dword s6, s[4:5], 0x34 416; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 417; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) 418; GFX9-O3-NEXT: v_mov_b32_e32 v0, s6 419; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 420; GFX9-O3-NEXT: s_add_u32 s8, s4, 56 421; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 422; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 423; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21] 424; GFX9-O3-NEXT: s_addc_u32 s9, s5, 0 425; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 426; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] 427; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] 428; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] 429; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 430; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] 431; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 432; GFX9-O3-NEXT: s_getpc_b64 s[22:23] 433; GFX9-O3-NEXT: s_add_u32 s22, s22, called@rel32@lo+4 434; GFX9-O3-NEXT: s_addc_u32 s23, s23, called@rel32@hi+12 435; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] 436; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 437; GFX9-O3-NEXT: v_add_u32_e32 v3, v3, v6 438; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] 439; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 440; GFX9-O3-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4 441; 
GFX9-O3-NEXT: s_endpgm 442 443 444 %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0) 445 %tmp134 = call i32 @called(i32 %tmp107) 446 %tmp136 = add i32 %tmp134, %tmp107 447 %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) 448 call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %tmp137, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) 449 ret void 450} 451 452define i64 @called_i64(i64 %a) noinline { 453; GFX9-O0-LABEL: called_i64: 454; GFX9-O0: ; %bb.0: 455; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 456; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 457; GFX9-O0-NEXT: ; implicit-def: $sgpr4 458; GFX9-O0-NEXT: ; implicit-def: $sgpr4 459; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec 460; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 461; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 462; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2 463; GFX9-O0-NEXT: v_mov_b32_e32 v5, v2 464; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 465; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 466; GFX9-O0-NEXT: v_add_co_u32_e64 v4, s[4:5], v4, v5 467; GFX9-O0-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v1, s[4:5] 468; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec 469; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 470; GFX9-O0-NEXT: s_mov_b32 s4, 32 471; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 472; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 473; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] 474; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 475; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 476; GFX9-O0-NEXT: v_mul_lo_u32 v1, v0, v1 477; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 478; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s4, v[4:5] 479; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6 480; GFX9-O0-NEXT: v_mul_lo_u32 v2, v2, v3 481; GFX9-O0-NEXT: v_mad_u64_u32 v[6:7], s[6:7], v0, v3, 0 482; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 483; GFX9-O0-NEXT: v_add3_u32 v0, v0, v1, v2 484; GFX9-O0-NEXT: ; implicit-def: $sgpr5 485; GFX9-O0-NEXT: ; implicit-def: $sgpr6 486; GFX9-O0-NEXT: ; implicit-def: $sgpr6 487; GFX9-O0-NEXT: 
v_mov_b32_e32 v2, s5 488; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec 489; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 490; GFX9-O0-NEXT: v_lshlrev_b64 v[1:2], s4, v[0:1] 491; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 492; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec 493; GFX9-O0-NEXT: s_mov_b32 s5, 0 494; GFX9-O0-NEXT: ; implicit-def: $sgpr5 495; GFX9-O0-NEXT: v_mov_b32_e32 v0, 0 496; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec 497; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 498; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 499; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v3 500; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 501; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 502; GFX9-O0-NEXT: v_or_b32_e64 v6, v1, v2 503; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec 504; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 505; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 506; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 507; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 508; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 509; GFX9-O0-NEXT: v_sub_co_u32_e64 v1, s[6:7], v1, v3 510; GFX9-O0-NEXT: v_subb_co_u32_e64 v0, s[6:7], v0, v2, s[6:7] 511; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec 512; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 513; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 514; GFX9-O0-NEXT: v_lshrrev_b64 v[1:2], s4, v[1:2] 515; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec 516; GFX9-O0-NEXT: s_setpc_b64 s[30:31] 517; 518; GFX9-O3-LABEL: called_i64: 519; GFX9-O3: ; %bb.0: 520; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 521; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v0, v0 522; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v1, vcc 523; GFX9-O3-NEXT: v_mul_lo_u32 v4, v3, v0 524; GFX9-O3-NEXT: v_mul_lo_u32 v5, v2, v1 525; GFX9-O3-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 526; GFX9-O3-NEXT: v_add3_u32 v1, v1, v5, v4 527; GFX9-O3-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 528; GFX9-O3-NEXT: 
v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 529; GFX9-O3-NEXT: s_setpc_b64 s[30:31] 530 %add = add i64 %a, %a 531 %mul = mul i64 %add, %a 532 %sub = sub i64 %mul, %add 533 ret i64 %sub 534} 535 536define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) { 537; GFX9-O0-LABEL: call_i64: 538; GFX9-O0: ; %bb.0: 539; GFX9-O0-NEXT: s_mov_b32 s32, 0 540; GFX9-O0-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 541; GFX9-O0-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 542; GFX9-O0-NEXT: s_mov_b32 s26, -1 543; GFX9-O0-NEXT: s_mov_b32 s27, 0xe00000 544; GFX9-O0-NEXT: s_add_u32 s24, s24, s11 545; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0 546; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 547; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane 548; GFX9-O0-NEXT: v_writelane_b32 v8, s12, 0 549; GFX9-O0-NEXT: v_writelane_b32 v8, s13, 1 550; GFX9-O0-NEXT: s_mov_b32 s14, s10 551; GFX9-O0-NEXT: s_mov_b32 s13, s9 552; GFX9-O0-NEXT: s_mov_b32 s12, s8 553; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[6:7] 554; GFX9-O0-NEXT: v_writelane_b32 v8, s4, 2 555; GFX9-O0-NEXT: v_writelane_b32 v8, s5, 3 556; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[2:3] 557; GFX9-O0-NEXT: v_readlane_b32 s2, v8, 0 558; GFX9-O0-NEXT: v_readlane_b32 s3, v8, 1 559; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] 560; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 2 561; GFX9-O0-NEXT: v_readlane_b32 s1, v8, 3 562; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 563; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 564; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 565; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] 566; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 567; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c 568; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 569; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) 570; GFX9-O0-NEXT: s_mov_b32 s8, s19 571; GFX9-O0-NEXT: s_mov_b32 s9, s18 572; GFX9-O0-NEXT: s_mov_b32 s15, s17 573; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 574; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def 
$sgpr16_sgpr17_sgpr18_sgpr19 575; GFX9-O0-NEXT: s_mov_b32 s17, s15 576; GFX9-O0-NEXT: s_mov_b32 s18, s9 577; GFX9-O0-NEXT: s_mov_b32 s19, s8 578; GFX9-O0-NEXT: v_writelane_b32 v8, s16, 4 579; GFX9-O0-NEXT: v_writelane_b32 v8, s17, 5 580; GFX9-O0-NEXT: v_writelane_b32 v8, s18, 6 581; GFX9-O0-NEXT: v_writelane_b32 v8, s19, 7 582; GFX9-O0-NEXT: s_mov_b64 s[8:9], 0 583; GFX9-O0-NEXT: s_mov_b32 s15, s9 584; GFX9-O0-NEXT: s_mov_b32 s16, s3 585; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 586; GFX9-O0-NEXT: v_mov_b32_e32 v0, s16 587; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 588; GFX9-O0-NEXT: v_mov_b32_e32 v6, s15 589; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[16:17] 590; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] 591; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 592; GFX9-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3 593; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17 594; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 595; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 596; GFX9-O0-NEXT: v_writelane_b32 v8, s2, 8 597; GFX9-O0-NEXT: v_writelane_b32 v8, s3, 9 598; GFX9-O0-NEXT: v_mov_b32_e32 v7, s8 599; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] 600; GFX9-O0-NEXT: ; implicit-def: $sgpr2 601; GFX9-O0-NEXT: ; implicit-def: $sgpr2 602; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 603; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 604; GFX9-O0-NEXT: s_mov_b32 s2, 32 605; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s2, v[9:10] 606; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 607; GFX9-O0-NEXT: s_mov_b64 s[8:9], 60 608; GFX9-O0-NEXT: s_mov_b32 s2, s0 609; GFX9-O0-NEXT: s_mov_b32 s0, s1 610; GFX9-O0-NEXT: s_mov_b32 s3, s8 611; GFX9-O0-NEXT: s_mov_b32 s1, s9 612; GFX9-O0-NEXT: s_add_u32 s8, s2, s3 613; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1 614; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 615; GFX9-O0-NEXT: s_mov_b32 s9, s0 616; GFX9-O0-NEXT: s_getpc_b64 s[0:1] 617; GFX9-O0-NEXT: s_add_u32 s0, s0, called_i64@gotpcrel32@lo+4 618; GFX9-O0-NEXT: s_addc_u32 s1, s1, 
called_i64@gotpcrel32@hi+12 619; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 620; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[24:25] 621; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[26:27] 622; GFX9-O0-NEXT: s_mov_b32 s15, 20 623; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s15, v3 624; GFX9-O0-NEXT: s_mov_b32 s15, 10 625; GFX9-O0-NEXT: v_lshlrev_b32_e64 v4, s15, v4 626; GFX9-O0-NEXT: v_or3_b32 v3, v5, v4, v3 627; GFX9-O0-NEXT: ; implicit-def: $sgpr15 628; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 629; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 630; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 631; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) 632; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] 633; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 4 634; GFX9-O0-NEXT: v_readlane_b32 s1, v8, 5 635; GFX9-O0-NEXT: v_readlane_b32 s2, v8, 6 636; GFX9-O0-NEXT: v_readlane_b32 s3, v8, 7 637; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 8 638; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 9 639; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 640; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 641; GFX9-O0-NEXT: ; implicit-def: $sgpr6 642; GFX9-O0-NEXT: ; implicit-def: $sgpr6 643; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 644; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 645; GFX9-O0-NEXT: v_add_co_u32_e64 v3, s[6:7], v3, v5 646; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v4, v6, s[6:7] 647; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] 648; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 649; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 650; GFX9-O0-NEXT: s_mov_b32 s4, 0 651; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], s4 offset:4 652; GFX9-O0-NEXT: s_endpgm 653; 654; GFX9-O3-LABEL: call_i64: 655; GFX9-O3: ; %bb.0: 656; GFX9-O3-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 657; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 658; GFX9-O3-NEXT: s_mov_b32 s26, -1 659; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 660; GFX9-O3-NEXT: s_add_u32 s24, s24, s11 661; GFX9-O3-NEXT: s_mov_b32 s32, 0 662; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 663; GFX9-O3-NEXT: s_or_saveexec_b64 s[16:17], -1 664; GFX9-O3-NEXT: s_mov_b32 s14, s10 665; 
GFX9-O3-NEXT: s_mov_b32 s13, s9 666; GFX9-O3-NEXT: s_mov_b32 s12, s8 667; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[6:7] 668; GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 669; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 670; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 671; GFX9-O3-NEXT: s_mov_b64 exec, s[16:17] 672; GFX9-O3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 673; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 674; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) 675; GFX9-O3-NEXT: v_mov_b32_e32 v0, s7 676; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 677; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[8:9] 678; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] 679; GFX9-O3-NEXT: v_mov_b32_e32 v0, s6 680; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 681; GFX9-O3-NEXT: s_add_u32 s8, s4, 60 682; GFX9-O3-NEXT: s_addc_u32 s9, s5, 0 683; GFX9-O3-NEXT: s_getpc_b64 s[4:5] 684; GFX9-O3-NEXT: s_add_u32 s4, s4, called_i64@gotpcrel32@lo+4 685; GFX9-O3-NEXT: s_addc_u32 s5, s5, called_i64@gotpcrel32@hi+12 686; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x0 687; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 688; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 689; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[20:21] 690; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 691; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] 692; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] 693; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] 694; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 695; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] 696; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 697; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6 698; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) 699; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] 700; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 701; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 702; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 703; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc 704; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] 705; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 706; GFX9-O3-NEXT: v_mov_b32_e32 v1, v4 707; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0 offset:4 708; 
GFX9-O3-NEXT: s_endpgm



  %tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)
  %tmp134 = call i64 @called_i64(i64 %tmp107)
  %tmp136 = add i64 %tmp134, %tmp107
  %tmp137 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp136)
  %tmp138 = bitcast i64 %tmp137 to <2 x i32>
  call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %tmp138, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0)
  ret void
}

; 64-bit set.inactive/wwm without any DPP or call in between.
; %index is shifted left by 5 to form an offset; a <4 x i32> is loaded at that
; offset and a <2 x i32> at (offset | 16) through s.buffer.load. Each of the
; three 64-bit pieces goes through set.inactive.i64 with the inactive value
; 9223372036854775807 (0x7fffffffffffffff) and then wwm.i64, and the results
; are stored back at the same two offsets.
; The -O3 assertions expect a single s_or_saveexec_b64/s_mov_b64 exec region
; holding all six v_cndmask_b32 selects; the -O0 assertions expect a separate
; region per select group.
define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
; GFX9-O0-LABEL: _amdgpu_cs_main:
; GFX9-O0:       ; %bb.0:
; GFX9-O0-NEXT:    s_mov_b32 s4, s3
; GFX9-O0-NEXT:    s_mov_b32 s5, s2
; GFX9-O0-NEXT:    s_mov_b32 s6, s1
; GFX9-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-O0-NEXT:    s_mov_b32 s1, s6
; GFX9-O0-NEXT:    s_mov_b32 s2, s5
; GFX9-O0-NEXT:    s_mov_b32 s3, s4
; GFX9-O0-NEXT:    ; kill: def $sgpr4_sgpr5_sgpr6_sgpr7 killed $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-O0-NEXT:    s_mov_b32 s4, 5
; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s4, v0
; GFX9-O0-NEXT:    s_mov_b32 s4, 0
; GFX9-O0-NEXT:    buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen
; GFX9-O0-NEXT:    buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT:    s_mov_b32 s5, 0x7fffffff
; GFX9-O0-NEXT:    s_mov_b32 s10, -1
; GFX9-O0-NEXT:    ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
; GFX9-O0-NEXT:    s_mov_b32 s11, s5
; GFX9-O0-NEXT:    s_mov_b32 s8, s11
; GFX9-O0-NEXT:    ; implicit-def: $sgpr12_sgpr13
; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
; GFX9-O0-NEXT:    s_mov_b32 s5, s10
; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[6:7]
; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v2
; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v3
; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v13
; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[6:7]
; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v2
; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v3
; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT:    ; implicit-def: $sgpr8_sgpr9
; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[6:7]
; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v3
; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v10
; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v7
; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v11
; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v10
; GFX9-O0-NEXT:    buffer_store_dwordx4 v[6:9], v0, s[0:3], s4 offen
; GFX9-O0-NEXT:    buffer_store_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
; GFX9-O0-NEXT:    s_endpgm
;
; GFX9-O3-LABEL: _amdgpu_cs_main:
; GFX9-O3:       ; %bb.0:
; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
; GFX9-O3-NEXT:    buffer_load_dwordx4 v[8:11], v0, s[0:3], 0 offen
; GFX9-O3-NEXT:    buffer_load_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-O3-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX9-O3-NEXT:    s_waitcnt vmcnt(1)
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v3, v1, v9, s[4:5]
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, -1, v8, s[4:5]
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v5, v1, v11, s[4:5]
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v4, -1, v10, s[4:5]
; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v7, v1, v13, s[4:5]
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, -1, v12, s[4:5]
; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v2
; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v4
; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT:    v_mov_b32_e32 v11, v5
; GFX9-O3-NEXT:    v_mov_b32_e32 v12, v6
; GFX9-O3-NEXT:    v_mov_b32_e32 v13, v7
; GFX9-O3-NEXT:    buffer_store_dwordx4 v[8:11], v0, s[0:3], 0 offen
; GFX9-O3-NEXT:    buffer_store_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
; GFX9-O3-NEXT:    s_endpgm
  %tmp17 = shl i32 %index, 5
  %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0)
  %.i0.upto1.bc = bitcast <4 x i32> %tmp18 to <2 x i64>
  %tmp19 = or i32 %tmp17, 16
  %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %tmp19, i32 0)
  %.i0.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 0
  %tmp22 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i0.upto1.extract, i64 9223372036854775807)
  %tmp97 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp22)
  %.i1.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 1
  %tmp99 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i1.upto1.extract, i64 9223372036854775807)
  %tmp174 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp99)
  %.i25 = bitcast <2 x i32> %tmp20 to i64
  %tmp176 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i25, i64 9223372036854775807)
  %tmp251 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp176)
  %.cast = bitcast i64 %tmp97 to <2 x float>
  %.cast6 = bitcast i64 %tmp174 to <2 x float>
  %.cast7 = bitcast i64 %tmp251 to <2 x float>
  %tmp254 = shufflevector <2 x float> %.cast, <2 x float> %.cast6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %desc.int = bitcast <4 x i32> %desc to i128
  %desc.ptr = inttoptr i128 %desc.int to ptr addrspace(8)
  tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %tmp254, ptr addrspace(8) %desc.ptr, i32 %tmp17, i32 0, i32 0)
  tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %.cast7, ptr addrspace(8) %desc.ptr, i32 %tmp19, i32 0, i32 0)
  ret void
}


; Straight-line strict.wwm case (no control flow).
; Both elements of the <2 x float> loaded from %tmp14 are bitcast to i32 and go
; through set.inactive (inactive value 0), an update.dpp with dpp_ctrl 323 /
; row_mask 12 / bank_mask 15 (asserted below as row_bcast:31 row_mask:0xc
; bank_mask:0xf) and an add with the pre-DPP value. The two strict.wwm results
; are compared; the sign-extended compare is shifted left by 1, masked with 2
; and stored at offset 4.
define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O0-LABEL: strict_wwm_no_cfg:
; GFX9-O0:       ; %bb.0:
; GFX9-O0-NEXT:    s_mov_b32 s6, s2
; GFX9-O0-NEXT:    s_mov_b32 s4, s0
; GFX9-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX9-O0-NEXT:    s_mov_b32 s7, s3
; GFX9-O0-NEXT:    s_mov_b32 s8, s7
; GFX9-O0-NEXT:    s_mov_b32 s9, s6
; GFX9-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX9-O0-NEXT:    s_mov_b32 s5, s1
; GFX9-O0-NEXT:    s_mov_b32 s10, s5
; GFX9-O0-NEXT:    s_mov_b32 s0, s4
; GFX9-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-O0-NEXT:    s_mov_b32 s1, s10
; GFX9-O0-NEXT:    s_mov_b32 s2, s9
; GFX9-O0-NEXT:    s_mov_b32 s3, s8
; GFX9-O0-NEXT:    s_mov_b32 s4, 0
; GFX9-O0-NEXT:    buffer_load_dwordx2 v[5:6], off, s[0:3], s4
; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
; GFX9-O0-NEXT:    ; implicit-def: $sgpr6_sgpr7
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[6:7]
; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT:    ; implicit-def: $sgpr6_sgpr7
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[6:7]
; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s4
; GFX9-O0-NEXT:    s_nop 1
; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O0-NEXT:    v_add_u32_e64 v0, v0, v2
; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-O0-NEXT:    s_nop 1
; GFX9-O0-NEXT:    v_mov_b32_dpp v0, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O0-NEXT:    v_add_u32_e64 v0, v1, v0
; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v3, v4
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[6:7]
; GFX9-O0-NEXT:    s_mov_b32 s5, 1
; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s5, v3
; GFX9-O0-NEXT:    s_mov_b32 s5, 2
; GFX9-O0-NEXT:    v_and_b32_e64 v3, v3, s5
; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s4 offset:4
; GFX9-O0-NEXT:    s_endpgm
;
; GFX9-O3-LABEL: strict_wwm_no_cfg:
; GFX9-O3:       ; %bb.0:
; GFX9-O3-NEXT:    buffer_load_dwordx2 v[4:5], off, s[0:3], 0
; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-O3-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[4:5]
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v3, 0, v5, s[4:5]
; GFX9-O3-NEXT:    s_nop 0
; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT:    v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT:    v_add_u32_e32 v0, v3, v0
; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
; GFX9-O3-NEXT:    v_and_b32_e32 v4, 2, v4
; GFX9-O3-NEXT:    buffer_store_dword v4, off, s[0:3], 0 offset:4
; GFX9-O3-NEXT:    s_endpgm
  %tmp100 = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %tmp14, i32 0, i32 0, i32 0)
  %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
  %tmp102 = extractelement <2 x i32> %tmp101, i32 0
  %tmp103 = extractelement <2 x i32> %tmp101, i32 1
  %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0)
  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0)


  %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false)
  %tmp121 = add i32 %tmp105, %tmp120
  %tmp122 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp121)

  %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false)
  %tmp136 = add i32 %tmp107, %tmp135
  %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136)

  %tmp138 = icmp eq i32 %tmp122, %tmp137
  %tmp139 = sext i1 %tmp138 to i32
  %tmp140 = shl nsw i32 %tmp139, 1
  %tmp141 = and i32 %tmp140, 2
  %tmp145 = bitcast i32 %tmp141 to float
  call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %tmp145, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0)
  ret void
}

define
amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-LABEL: strict_wwm_cfg:
; GFX9-O0:       ; %bb.0: ; %entry
; GFX9-O0-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
; GFX9-O0-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
; GFX9-O0-NEXT:    s_mov_b32 s18, -1
; GFX9-O0-NEXT:    s_mov_b32 s19, 0xe00000
; GFX9-O0-NEXT:    s_add_u32 s16, s16, s4
; GFX9-O0-NEXT:    s_addc_u32 s17, s17, 0
; GFX9-O0-NEXT:    ; implicit-def: $vgpr5 : SGPR spill to VGPR lane
; GFX9-O0-NEXT:    v_writelane_b32 v5, s3, 0
; GFX9-O0-NEXT:    s_mov_b32 s4, s1
; GFX9-O0-NEXT:    v_readlane_b32 s1, v5, 0
; GFX9-O0-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; GFX9-O0-NEXT:    s_mov_b32 s3, s1
; GFX9-O0-NEXT:    s_mov_b32 s8, s3
; GFX9-O0-NEXT:    s_mov_b32 s9, s2
; GFX9-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; GFX9-O0-NEXT:    s_mov_b32 s1, s4
; GFX9-O0-NEXT:    s_mov_b32 s10, s1
; GFX9-O0-NEXT:    s_mov_b32 s4, s0
; GFX9-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX9-O0-NEXT:    s_mov_b32 s5, s10
; GFX9-O0-NEXT:    s_mov_b32 s6, s9
; GFX9-O0-NEXT:    s_mov_b32 s7, s8
; GFX9-O0-NEXT:    v_writelane_b32 v5, s2, 1
; GFX9-O0-NEXT:    v_writelane_b32 v5, s3, 2
; GFX9-O0-NEXT:    v_writelane_b32 v5, s0, 3
; GFX9-O0-NEXT:    v_writelane_b32 v5, s1, 4
; GFX9-O0-NEXT:    s_mov_b32 s0, 0
; GFX9-O0-NEXT:    s_nop 2
; GFX9-O0-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], s0
; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
; GFX9-O0-NEXT:    s_nop 0
; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[2:3]
; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s0
; GFX9-O0-NEXT:    s_nop 1
; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O0-NEXT:    v_add_u32_e64 v1, v1, v2
; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s0
; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT:    s_mov_b64 s[0:1], exec
; GFX9-O0-NEXT:    v_writelane_b32 v5, s0, 5
; GFX9-O0-NEXT:    v_writelane_b32 v5, s1, 6
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[16:19], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT:    s_mov_b64 exec, s[12:13]
; GFX9-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-O0-NEXT:    s_cbranch_execz .LBB8_2
; GFX9-O0-NEXT:  ; %bb.1: ; %if
; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload
; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX9-O0-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
; GFX9-O0-NEXT:    s_nop 1
; GFX9-O0-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O0-NEXT:    v_add_u32_e64 v1, v2, v1
; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT:  .LBB8_2: ; %merge
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[16:19], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT:    s_mov_b64 exec, s[12:13]
; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
; GFX9-O0-NEXT:    v_readlane_b32 s4, v5, 5
; GFX9-O0-NEXT:    v_readlane_b32 s5, v5, 6
; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-O0-NEXT:    v_readlane_b32 s2, v5, 1
; GFX9-O0-NEXT:    v_readlane_b32 s3, v5, 2
; GFX9-O0-NEXT:    v_readlane_b32 s0, v5, 3
; GFX9-O0-NEXT:    v_readlane_b32 s1, v5, 4
; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v3
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-O0-NEXT:    s_mov_b32 s4, 1
; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s4, v0
; GFX9-O0-NEXT:    s_mov_b32 s4, 2
; GFX9-O0-NEXT:    v_and_b32_e64 v0, v0, s4
; GFX9-O0-NEXT:    s_mov_b32 s6, s1
; GFX9-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; GFX9-O0-NEXT:    s_mov_b32 s4, s3
; GFX9-O0-NEXT:    s_mov_b32 s5, s2
; GFX9-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-O0-NEXT:    s_mov_b32 s1, s6
; GFX9-O0-NEXT:    s_mov_b32 s2, s5
; GFX9-O0-NEXT:    s_mov_b32 s3, s4
; GFX9-O0-NEXT:    s_mov_b32 s4, 0
; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s4 offset:4
; GFX9-O0-NEXT:    s_endpgm
;
; GFX9-O3-LABEL: strict_wwm_cfg:
; GFX9-O3:       ; %bb.0: ; %entry
; GFX9-O3-NEXT:    buffer_load_dwordx2 v[3:4], off, s[0:3], 0
; GFX9-O3-NEXT:    v_mov_b32_e32 v5, 0
; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[4:5]
; GFX9-O3-NEXT:    s_nop 1
; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-O3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX9-O3-NEXT:  ; %bb.1: ; %if
; GFX9-O3-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[6:7]
; GFX9-O3-NEXT:    s_nop 1
; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
; GFX9-O3-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v1
; GFX9-O3-NEXT:  ; %bb.2: ; %merge
; GFX9-O3-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-O3-NEXT:    v_and_b32_e32 v0, 2, v0
; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX9-O3-NEXT:    s_endpgm
; Control-flow strict.wwm case: the first loaded element is reduced in %entry;
; the second one is reduced only in %if, which is entered when %arg == 0.
entry:
  %tmp100 = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %tmp14, i32 0, i32 0, i32 0)
  %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
  %tmp102 = extractelement <2 x i32> %tmp101, i32 0
  %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0)

  %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false)
  %tmp121 = add i32 %tmp105, %tmp120
  %tmp122 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp121)

  %cond = icmp eq i32 %arg, 0
  br i1 %cond, label %if, label %merge
if:
  %tmp103 = extractelement <2 x i32> %tmp101, i32 1
  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0)

  %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false)
  %tmp136 = add i32 %tmp107, %tmp135
  %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136)
  br label %merge

; %merge_value is 0 when %if was skipped, otherwise the strict.wwm result from %if.
merge:
  %merge_value = phi i32 [ 0, %entry ], [%tmp137, %if ]
  %tmp138 = icmp eq i32 %tmp122, %merge_value
  %tmp139 = sext i1 %tmp138 to i32
  %tmp140 = shl nsw i32 %tmp139, 1
  %tmp141 = and i32 %tmp140, 2
  %tmp145 = bitcast i32 %tmp141 to float
  call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %tmp145, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0)
  ret void
}

; noinline callee used by @strict_wwm_call: returns (a + a) * a - (a + a).
define hidden i32 @strict_wwm_called(i32 %a) noinline {
; GFX9-O0-LABEL: strict_wwm_called:
; GFX9-O0:       ; %bb.0:
; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT:    v_add_u32_e64 v1, v0, v0
; GFX9-O0-NEXT:    v_mul_lo_u32 v0, v1, v0
; GFX9-O0-NEXT:    v_sub_u32_e64 v0, v0, v1
; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-O3-LABEL: strict_wwm_called:
; GFX9-O3:       ; %bb.0:
; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O3-NEXT:    v_add_u32_e32 v1, v0, v0
; GFX9-O3-NEXT:    v_mul_lo_u32 v0, v1, v0
; GFX9-O3-NEXT:    v_sub_u32_e32 v0, v0, v1
; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
  %add = add i32 %a, %a
  %mul = mul i32 %add, %a
  %sub = sub i32 %mul, %add
  ret i32 %sub
}

; Kernel with a call between set.inactive and strict.wwm.
; set.inactive(%arg, 0) is passed to @strict_wwm_called, the call result is
; added to that same value, and the strict.wwm of the sum is stored at offset 4.
; In both runs the assertions place the s_swappc_b64 call after an
; s_or_saveexec_b64 ..., -1 and before the matching s_mov_b64 exec restore.
define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O0-LABEL: strict_wwm_call:
; GFX9-O0:       ; %bb.0:
; GFX9-O0-NEXT:    s_mov_b32 s32, 0
; GFX9-O0-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; GFX9-O0-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
; GFX9-O0-NEXT:    s_mov_b32 s26, -1
; GFX9-O0-NEXT:    s_mov_b32 s27, 0xe00000
; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
; GFX9-O0-NEXT:    ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
; GFX9-O0-NEXT:    v_writelane_b32 v3, s12, 0
; GFX9-O0-NEXT:    v_writelane_b32 v3, s13, 1
; GFX9-O0-NEXT:    s_mov_b32 s14, s10
; GFX9-O0-NEXT:    s_mov_b32 s13, s9
; GFX9-O0-NEXT:    s_mov_b32 s12, s8
; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX9-O0-NEXT:    v_writelane_b32 v3, s4, 2
; GFX9-O0-NEXT:    v_writelane_b32 v3, s5, 3
; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX9-O0-NEXT:    v_readlane_b32 s2, v3, 0
; GFX9-O0-NEXT:    v_readlane_b32 s3, v3, 1
; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX9-O0-NEXT:    v_readlane_b32 s0, v3, 2
; GFX9-O0-NEXT:    v_readlane_b32 s1, v3, 3
; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v0
; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
; GFX9-O0-NEXT:    s_load_dword s2, s[0:1], 0x34
; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-O0-NEXT:    s_mov_b32 s3, s9
; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
; GFX9-O0-NEXT:    s_mov_b32 s9, s17
; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
; GFX9-O0-NEXT:    s_mov_b32 s17, s9
; GFX9-O0-NEXT:    s_mov_b32 s18, s8
; GFX9-O0-NEXT:    s_mov_b32 s19, s3
; GFX9-O0-NEXT:    v_writelane_b32 v3, s16, 4
; GFX9-O0-NEXT:    v_writelane_b32 v3, s17, 5
; GFX9-O0-NEXT:    v_writelane_b32 v3, s18, 6
; GFX9-O0-NEXT:    v_writelane_b32 v3, s19, 7
; GFX9-O0-NEXT:    s_mov_b32 s8, 0
; GFX9-O0-NEXT:    v_writelane_b32 v3, s8, 8
; GFX9-O0-NEXT:    ; implicit-def: $sgpr16_sgpr17
; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GFX9-O0-NEXT:    v_writelane_b32 v3, s2, 9
; GFX9-O0-NEXT:    v_writelane_b32 v3, s3, 10
; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s8
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v0, s[2:3]
; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 56
; GFX9-O0-NEXT:    s_mov_b32 s2, s0
; GFX9-O0-NEXT:    s_mov_b32 s0, s1
; GFX9-O0-NEXT:    s_mov_b32 s3, s8
; GFX9-O0-NEXT:    s_mov_b32 s1, s9
; GFX9-O0-NEXT:    s_add_u32 s8, s2, s3
; GFX9-O0-NEXT:    s_addc_u32 s0, s0, s1
; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GFX9-O0-NEXT:    s_mov_b32 s9, s0
; GFX9-O0-NEXT:    s_getpc_b64 s[16:17]
; GFX9-O0-NEXT:    s_add_u32 s16, s16, strict_wwm_called@rel32@lo+4
; GFX9-O0-NEXT:    s_addc_u32 s17, s17, strict_wwm_called@rel32@hi+12
; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[24:25]
; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[26:27]
; GFX9-O0-NEXT:    s_mov_b32 s15, 20
; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v4, s15, v4
; GFX9-O0-NEXT:    s_mov_b32 s15, 10
; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v5, s15, v5
; GFX9-O0-NEXT:    v_or3_b32 v4, v6, v5, v4
; GFX9-O0-NEXT:    ; implicit-def: $sgpr15
; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v4
; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-O0-NEXT:    v_readlane_b32 s0, v3, 4
; GFX9-O0-NEXT:    v_readlane_b32 s1, v3, 5
; GFX9-O0-NEXT:    v_readlane_b32 s2, v3, 6
; GFX9-O0-NEXT:    v_readlane_b32 s3, v3, 7
; GFX9-O0-NEXT:    v_readlane_b32 s6, v3, 9
; GFX9-O0-NEXT:    v_readlane_b32 s7, v3, 10
; GFX9-O0-NEXT:    v_readlane_b32 s4, v3, 8
; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT:    v_add_u32_e64 v3, v3, v7
; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-O0-NEXT:    s_nop 0
; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s4 offset:4
; GFX9-O0-NEXT:    s_endpgm
;
; GFX9-O3-LABEL: strict_wwm_call:
; GFX9-O3:       ; %bb.0:
; GFX9-O3-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
; GFX9-O3-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
; GFX9-O3-NEXT:    s_mov_b32 s26, -1
; GFX9-O3-NEXT:    s_mov_b32 s27, 0xe00000
; GFX9-O3-NEXT:    s_add_u32 s24, s24, s11
; GFX9-O3-NEXT:    s_mov_b32 s32, 0
; GFX9-O3-NEXT:    s_addc_u32 s25, s25, 0
; GFX9-O3-NEXT:    s_or_saveexec_b64 s[16:17], -1
; GFX9-O3-NEXT:    s_mov_b32 s14, s10
; GFX9-O3-NEXT:    s_mov_b32 s13, s9
; GFX9-O3-NEXT:    s_mov_b32 s12, s8
; GFX9-O3-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v2
; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-O3-NEXT:    s_mov_b64 exec, s[16:17]
; GFX9-O3-NEXT:    s_load_dword s6, s[4:5], 0x34
; GFX9-O3-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x24
; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s6
; GFX9-O3-NEXT:    s_or_saveexec_b64 s[20:21], -1
; GFX9-O3-NEXT:    s_add_u32 s8, s4, 56
; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v3, 20, v3
; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v4, 10, v4
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s[20:21]
; GFX9-O3-NEXT:    s_addc_u32 s9, s5, 0
; GFX9-O3-NEXT:    v_or3_b32 v3, v5, v4, v3
; GFX9-O3-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX9-O3-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX9-O3-NEXT:    s_mov_b64 s[0:1], s[24:25]
; GFX9-O3-NEXT:    v_mov_b32_e32 v31, v3
; GFX9-O3-NEXT:    s_mov_b64 s[2:3], s[26:27]
; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
; GFX9-O3-NEXT:    s_getpc_b64 s[22:23]
; GFX9-O3-NEXT:    s_add_u32 s22, s22, strict_wwm_called@rel32@lo+4
; GFX9-O3-NEXT:    s_addc_u32 s23, s23, strict_wwm_called@rel32@hi+12
; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[22:23]
; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v0
; GFX9-O3-NEXT:    v_add_u32_e32 v3, v3, v6
; GFX9-O3-NEXT:    s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:4
; GFX9-O3-NEXT:    s_endpgm


  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
  %tmp134 = call i32 @strict_wwm_called(i32 %tmp107)
  %tmp136 = add i32 %tmp134, %tmp107
  %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136)
  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %tmp137, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0)
  ret void
}

; i64 noinline callee: returns (a + a) * a - (a + a).
; Unlike @strict_wwm_called, this function is not declared hidden.
define i64 @strict_wwm_called_i64(i64 %a) noinline {
; GFX9-O0-LABEL: strict_wwm_called_i64:
; GFX9-O0:       ; %bb.0:
; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v2
; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v5
; GFX9-O0-NEXT:    v_addc_co_u32_e64 v0, s[4:5], v0, v1, s[4:5]
; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT:    s_mov_b32 s4, 32
; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT:    v_mul_lo_u32 v1, v0, v1
; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], s4, v[4:5]
; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v6
; GFX9-O0-NEXT:    v_mul_lo_u32 v2, v2, v3
; GFX9-O0-NEXT:    v_mad_u64_u32 v[6:7], s[6:7], v0, v3, 0
; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
; GFX9-O0-NEXT:    v_add3_u32 v0, v0, v1, v2
; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
; GFX9-O0-NEXT:    v_lshlrev_b64 v[1:2], s4, v[0:1]
; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT:    s_mov_b32 s5, 0
; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
; GFX9-O0-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v0
; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
; GFX9-O0-NEXT:    v_or_b32_e64 v0, v0, v3
; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v1
; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
; GFX9-O0-NEXT:    v_or_b32_e64 v6, v1, v2
; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v0
; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
; GFX9-O0-NEXT:    v_sub_co_u32_e64 v1, s[6:7], v1, v3
; GFX9-O0-NEXT:    v_subb_co_u32_e64 v0, s[6:7], v0, v2, s[6:7]
; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
; GFX9-O0-NEXT:    v_lshrrev_b64 v[1:2], s4, v[1:2]
; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-O3-LABEL: strict_wwm_called_i64:
; GFX9-O3:       ; %bb.0:
; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v0
; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v1, vcc
; GFX9-O3-NEXT:    v_mul_lo_u32 v4, v3, v0
; GFX9-O3-NEXT:    v_mul_lo_u32 v5, v2, v1
; GFX9-O3-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0
; GFX9-O3-NEXT:    v_add3_u32 v1, v1, v5, v4
; GFX9-O3-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
; GFX9-O3-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
  %add = add i64 %a, %a
  %mul = mul i64 %add, %a
  %sub = sub i64 %mul, %add
  ret i64 %sub
}

define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) {
; GFX9-O0-LABEL:
strict_wwm_call_i64: 1390; GFX9-O0: ; %bb.0: 1391; GFX9-O0-NEXT: s_mov_b32 s32, 0 1392; GFX9-O0-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 1393; GFX9-O0-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 1394; GFX9-O0-NEXT: s_mov_b32 s26, -1 1395; GFX9-O0-NEXT: s_mov_b32 s27, 0xe00000 1396; GFX9-O0-NEXT: s_add_u32 s24, s24, s11 1397; GFX9-O0-NEXT: s_addc_u32 s25, s25, 0 1398; GFX9-O0-NEXT: s_or_saveexec_b64 s[12:13], -1 1399; GFX9-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane 1400; GFX9-O0-NEXT: v_writelane_b32 v8, s12, 0 1401; GFX9-O0-NEXT: v_writelane_b32 v8, s13, 1 1402; GFX9-O0-NEXT: s_mov_b32 s14, s10 1403; GFX9-O0-NEXT: s_mov_b32 s13, s9 1404; GFX9-O0-NEXT: s_mov_b32 s12, s8 1405; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[6:7] 1406; GFX9-O0-NEXT: v_writelane_b32 v8, s4, 2 1407; GFX9-O0-NEXT: v_writelane_b32 v8, s5, 3 1408; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[2:3] 1409; GFX9-O0-NEXT: v_readlane_b32 s2, v8, 0 1410; GFX9-O0-NEXT: v_readlane_b32 s3, v8, 1 1411; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[0:1] 1412; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 2 1413; GFX9-O0-NEXT: v_readlane_b32 s1, v8, 3 1414; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2 1415; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 1416; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 1417; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3] 1418; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 1419; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c 1420; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1421; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) 1422; GFX9-O0-NEXT: s_mov_b32 s8, s19 1423; GFX9-O0-NEXT: s_mov_b32 s9, s18 1424; GFX9-O0-NEXT: s_mov_b32 s15, s17 1425; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17 1426; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19 1427; GFX9-O0-NEXT: s_mov_b32 s17, s15 1428; GFX9-O0-NEXT: s_mov_b32 s18, s9 1429; GFX9-O0-NEXT: s_mov_b32 s19, s8 1430; GFX9-O0-NEXT: v_writelane_b32 v8, s16, 4 1431; GFX9-O0-NEXT: v_writelane_b32 v8, s17, 5 1432; GFX9-O0-NEXT: 
v_writelane_b32 v8, s18, 6 1433; GFX9-O0-NEXT: v_writelane_b32 v8, s19, 7 1434; GFX9-O0-NEXT: s_mov_b64 s[8:9], 0 1435; GFX9-O0-NEXT: s_mov_b32 s15, s9 1436; GFX9-O0-NEXT: s_mov_b32 s16, s3 1437; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 1438; GFX9-O0-NEXT: v_mov_b32_e32 v0, s16 1439; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 1440; GFX9-O0-NEXT: v_mov_b32_e32 v6, s15 1441; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[16:17] 1442; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] 1443; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9 1444; GFX9-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3 1445; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17 1446; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2 1447; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1 1448; GFX9-O0-NEXT: v_writelane_b32 v8, s2, 8 1449; GFX9-O0-NEXT: v_writelane_b32 v8, s3, 9 1450; GFX9-O0-NEXT: v_mov_b32_e32 v7, s8 1451; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3] 1452; GFX9-O0-NEXT: ; implicit-def: $sgpr2 1453; GFX9-O0-NEXT: ; implicit-def: $sgpr2 1454; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 1455; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 1456; GFX9-O0-NEXT: s_mov_b32 s2, 32 1457; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s2, v[9:10] 1458; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 1459; GFX9-O0-NEXT: s_mov_b64 s[8:9], 60 1460; GFX9-O0-NEXT: s_mov_b32 s2, s0 1461; GFX9-O0-NEXT: s_mov_b32 s0, s1 1462; GFX9-O0-NEXT: s_mov_b32 s3, s8 1463; GFX9-O0-NEXT: s_mov_b32 s1, s9 1464; GFX9-O0-NEXT: s_add_u32 s8, s2, s3 1465; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1 1466; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 1467; GFX9-O0-NEXT: s_mov_b32 s9, s0 1468; GFX9-O0-NEXT: s_getpc_b64 s[0:1] 1469; GFX9-O0-NEXT: s_add_u32 s0, s0, strict_wwm_called_i64@gotpcrel32@lo+4 1470; GFX9-O0-NEXT: s_addc_u32 s1, s1, strict_wwm_called_i64@gotpcrel32@hi+12 1471; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 1472; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[24:25] 1473; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[26:27] 1474; 
GFX9-O0-NEXT: s_mov_b32 s15, 20 1475; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s15, v3 1476; GFX9-O0-NEXT: s_mov_b32 s15, 10 1477; GFX9-O0-NEXT: v_lshlrev_b32_e64 v4, s15, v4 1478; GFX9-O0-NEXT: v_or3_b32 v3, v5, v4, v3 1479; GFX9-O0-NEXT: ; implicit-def: $sgpr15 1480; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 1481; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 1482; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 1483; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) 1484; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] 1485; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 4 1486; GFX9-O0-NEXT: v_readlane_b32 s1, v8, 5 1487; GFX9-O0-NEXT: v_readlane_b32 s2, v8, 6 1488; GFX9-O0-NEXT: v_readlane_b32 s3, v8, 7 1489; GFX9-O0-NEXT: v_readlane_b32 s4, v8, 8 1490; GFX9-O0-NEXT: v_readlane_b32 s5, v8, 9 1491; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 1492; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 1493; GFX9-O0-NEXT: ; implicit-def: $sgpr6 1494; GFX9-O0-NEXT: ; implicit-def: $sgpr6 1495; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 1496; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 1497; GFX9-O0-NEXT: v_add_co_u32_e64 v3, s[6:7], v3, v5 1498; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v4, v6, s[6:7] 1499; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] 1500; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 1501; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 1502; GFX9-O0-NEXT: s_mov_b32 s4, 0 1503; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], s4 offset:4 1504; GFX9-O0-NEXT: s_endpgm 1505; 1506; GFX9-O3-LABEL: strict_wwm_call_i64: 1507; GFX9-O3: ; %bb.0: 1508; GFX9-O3-NEXT: s_mov_b32 s24, SCRATCH_RSRC_DWORD0 1509; GFX9-O3-NEXT: s_mov_b32 s25, SCRATCH_RSRC_DWORD1 1510; GFX9-O3-NEXT: s_mov_b32 s26, -1 1511; GFX9-O3-NEXT: s_mov_b32 s27, 0xe00000 1512; GFX9-O3-NEXT: s_add_u32 s24, s24, s11 1513; GFX9-O3-NEXT: s_mov_b32 s32, 0 1514; GFX9-O3-NEXT: s_addc_u32 s25, s25, 0 1515; GFX9-O3-NEXT: s_or_saveexec_b64 s[16:17], -1 1516; GFX9-O3-NEXT: s_mov_b32 s14, s10 1517; GFX9-O3-NEXT: s_mov_b32 s13, s9 1518; GFX9-O3-NEXT: s_mov_b32 s12, s8 1519; GFX9-O3-NEXT: s_mov_b64 s[10:11], s[6:7] 1520; 
GFX9-O3-NEXT: v_mov_b32_e32 v3, v2 1521; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 1522; GFX9-O3-NEXT: v_mov_b32_e32 v5, v0 1523; GFX9-O3-NEXT: s_mov_b64 exec, s[16:17] 1524; GFX9-O3-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1525; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 1526; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) 1527; GFX9-O3-NEXT: v_mov_b32_e32 v0, s7 1528; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1 1529; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[8:9] 1530; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9] 1531; GFX9-O3-NEXT: v_mov_b32_e32 v0, s6 1532; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1 1533; GFX9-O3-NEXT: s_add_u32 s8, s4, 60 1534; GFX9-O3-NEXT: s_addc_u32 s9, s5, 0 1535; GFX9-O3-NEXT: s_getpc_b64 s[4:5] 1536; GFX9-O3-NEXT: s_add_u32 s4, s4, strict_wwm_called_i64@gotpcrel32@lo+4 1537; GFX9-O3-NEXT: s_addc_u32 s5, s5, strict_wwm_called_i64@gotpcrel32@hi+12 1538; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x0 1539; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3 1540; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4 1541; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[20:21] 1542; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3 1543; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] 1544; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] 1545; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] 1546; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 1547; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] 1548; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 1549; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6 1550; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) 1551; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] 1552; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 1553; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1 1554; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 1555; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc 1556; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21] 1557; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 1558; GFX9-O3-NEXT: v_mov_b32_e32 v1, v4 1559; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0 offset:4 1560; GFX9-O3-NEXT: s_endpgm 1561 1562 1563 1564 %tmp107 = tail call 
i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)
  %tmp134 = call i64 @strict_wwm_called_i64(i64 %tmp107)
  %tmp136 = add i64 %tmp134, %tmp107
  %tmp137 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp136)
  %tmp138 = bitcast i64 %tmp137 to <2 x i32>
  call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %tmp138, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0)
  ret void
}

; Compute-shader entry point exercising 64-bit set.inactive + strict.wwm.
; It loads a <4 x i32> at offset (%index << 5) and a <2 x i32> at that offset
; plus 16 from %desc, views the six dwords as three i64 values, passes each one
; through llvm.amdgcn.set.inactive.i64 (second operand 0x7fffffffffffffff) and
; llvm.amdgcn.strict.wwm.i64, and stores the results back at the same offsets.
; The check lines below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them.
define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
; GFX9-O0-LABEL: strict_wwm_amdgpu_cs_main:
; GFX9-O0:       ; %bb.0:
; GFX9-O0-NEXT:    s_mov_b32 s4, s3
; GFX9-O0-NEXT:    s_mov_b32 s5, s2
; GFX9-O0-NEXT:    s_mov_b32 s6, s1
; GFX9-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-O0-NEXT:    s_mov_b32 s1, s6
; GFX9-O0-NEXT:    s_mov_b32 s2, s5
; GFX9-O0-NEXT:    s_mov_b32 s3, s4
; GFX9-O0-NEXT:    ; kill: def $sgpr4_sgpr5_sgpr6_sgpr7 killed $sgpr0_sgpr1_sgpr2_sgpr3
; GFX9-O0-NEXT:    s_mov_b32 s4, 5
; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s4, v0
; GFX9-O0-NEXT:    s_mov_b32 s4, 0
; GFX9-O0-NEXT:    buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen
; GFX9-O0-NEXT:    buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT:    s_mov_b32 s5, 0x7fffffff
; GFX9-O0-NEXT:    s_mov_b32 s10, -1
; GFX9-O0-NEXT:    ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
; GFX9-O0-NEXT:    s_mov_b32 s11, s5
; GFX9-O0-NEXT:    s_mov_b32 s8, s11
; GFX9-O0-NEXT:    ; implicit-def: $sgpr12_sgpr13
; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
; GFX9-O0-NEXT:    s_mov_b32 s5, s10
; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[6:7]
; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v2
; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v3
; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v13
; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[6:7]
; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v2
; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v3
; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT:    ; implicit-def: $sgpr8_sgpr9
; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[6:7]
; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v3
; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v10
; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v7
; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v11
; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v10
; GFX9-O0-NEXT:    buffer_store_dwordx4 v[6:9], v0, s[0:3], s4 offen
; GFX9-O0-NEXT:    buffer_store_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
; GFX9-O0-NEXT:    s_endpgm
;
; GFX9-O3-LABEL: strict_wwm_amdgpu_cs_main:
; GFX9-O3:       ; %bb.0:
; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
; GFX9-O3-NEXT:    buffer_load_dwordx4 v[8:11], v0, s[0:3], 0 offen
; GFX9-O3-NEXT:    buffer_load_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-O3-NEXT:    v_bfrev_b32_e32 v1, -2
; GFX9-O3-NEXT:    s_waitcnt vmcnt(1)
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v3, v1, v9, s[4:5]
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, -1, v8, s[4:5]
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v5, v1, v11, s[4:5]
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v4, -1, v10, s[4:5]
; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v7, v1, v13, s[4:5]
; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, -1, v12, s[4:5]
; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v2
; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v4
; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT:    v_mov_b32_e32 v11, v5
; GFX9-O3-NEXT:    v_mov_b32_e32 v12, v6
; GFX9-O3-NEXT:    v_mov_b32_e32 v13, v7
; GFX9-O3-NEXT:    buffer_store_dwordx4 v[8:11], v0, s[0:3], 0 offen
; GFX9-O3-NEXT:    buffer_store_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
; GFX9-O3-NEXT:    s_endpgm
  ; Offset for the first load/store: %index * 32. %index is not `inreg`, and
  ; the checks show both s.buffer.load calls emitted as buffer_load_* ... offen.
  %tmp17 = shl i32 %index, 5
  %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0)
  ; View the four loaded dwords as two i64 values.
  %.i0.upto1.bc = bitcast <4 x i32> %tmp18 to <2 x i64>
  ; %tmp17 has its low five bits clear, so this `or` equals %tmp17 + 16; the
  ; checks show it folded into `offset:16`.
  %tmp19 = or i32 %tmp17, 16
  %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %tmp19, i32 0)
  ; Each of the three i64 values is passed to set.inactive with
  ; 9223372036854775807 (0x7fffffffffffffff) as the second operand and then to
  ; strict.wwm. The checks materialize that constant as a 0x7fffffff high half
  ; and a -1 low half.
  %.i0.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 0
  %tmp22 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i0.upto1.extract, i64 9223372036854775807)
  %tmp97 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp22)
  %.i1.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 1
  %tmp99 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i1.upto1.extract, i64 9223372036854775807)
  %tmp174 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp99)
  %.i25 = bitcast <2 x i32> %tmp20 to i64
  %tmp176 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i25, i64 9223372036854775807)
  %tmp251 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp176)
  ; Reinterpret the three results as <2 x float> for the float store intrinsics.
  %.cast = bitcast i64 %tmp97 to <2 x float>
  %.cast6 = bitcast i64 %tmp174 to <2 x float>
  %.cast7 = bitcast i64 %tmp251 to <2 x float>
  ; Mask <0, 1, 2, 3> concatenates the first two results into one <4 x float>.
  %tmp254 = shufflevector <2 x float> %.cast, <2 x float> %.cast6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Turn the <4 x i32> descriptor into the ptr addrspace(8) operand that the
  ; raw.ptr.buffer.store intrinsics take.
  %desc.int = bitcast <4 x i32> %desc to i128
  %desc.ptr = inttoptr i128 %desc.int to ptr addrspace(8)
  ; Write the results back at the same two offsets they were loaded from.
  tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %tmp254, ptr addrspace(8) %desc.ptr, i32 %tmp17, i32 0, i32 0)
  tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %.cast7, ptr addrspace(8)%desc.ptr, i32 %tmp19, i32 0, i32 0)
  ret void
}

; Intrinsic declarations for this test file. The llvm.amdgcn.wwm.* variants are
; the deprecated spelling mentioned in the NOTE at the top of the file.
declare i32 @llvm.amdgcn.strict.wwm.i32(i32)
declare i64 @llvm.amdgcn.strict.wwm.i64(i64)
declare i32 @llvm.amdgcn.wwm.i32(i32)
declare i64 @llvm.amdgcn.wwm.i64(i64)
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64)
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1)
declare <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8), i32, i32, i32)
declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32)
declare void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32, ptr addrspace(8), i32, i32, i32)
declare void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32>, ptr addrspace(8), i32, i32, i32)
declare void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32)
declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32)
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)

; Module flag "amdhsa_code_object_version" with value 500.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}