1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s 3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s 4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s 5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s 6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s 7; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s 8; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s 9; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s 10; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s 11 12; -------------------------------------------------------------------- 13; float 14; -------------------------------------------------------------------- 15 16define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { 17; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: 18; GFX12: ; %bb.0: 19; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 20; GFX12-NEXT: s_wait_expcnt 0x0 21; GFX12-NEXT: s_wait_samplecnt 0x0 22; GFX12-NEXT: s_wait_bvhcnt 0x0 23; GFX12-NEXT: s_wait_kmcnt 0x0 24; GFX12-NEXT: v_mov_b32_e32 v1, s16 25; GFX12-NEXT: s_wait_storecnt 0x0 26; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN 27; GFX12-NEXT: s_wait_loadcnt 0x0 28; GFX12-NEXT: global_inv scope:SCOPE_DEV 29; GFX12-NEXT: s_setpc_b64 s[30:31] 30; 31; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: 32; GFX940: ; %bb.0: 33; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 34; GFX940-NEXT: v_mov_b32_e32 v1, v0 35; GFX940-NEXT: v_mov_b32_e32 v0, s16 36; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 37; GFX940-NEXT: s_add_i32 s6, s16, 0x400 38; GFX940-NEXT: s_mov_b64 s[4:5], 0 39; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 40; GFX940-NEXT: v_mov_b32_e32 v3, s6 41; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start 42; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 43; GFX940-NEXT: s_waitcnt vmcnt(0) 44; GFX940-NEXT: v_mov_b32_e32 v5, v0 45; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 46; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 47; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] 48; GFX940-NEXT: buffer_wbl2 sc1 49; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 50; GFX940-NEXT: s_waitcnt vmcnt(0) 51; GFX940-NEXT: buffer_inv sc1 52; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 53; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 54; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] 55; GFX940-NEXT: s_cbranch_execnz .LBB0_1 56; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 57; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] 58; GFX940-NEXT: s_setpc_b64 s[30:31] 59; 60; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: 61; GFX11: ; %bb.0: 62; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 63; GFX11-NEXT: v_mov_b32_e32 v1, s16 64; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 65; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc 66; GFX11-NEXT: s_waitcnt vmcnt(0) 67; GFX11-NEXT: buffer_gl1_inv 68; GFX11-NEXT: buffer_gl0_inv 69; GFX11-NEXT: s_setpc_b64 s[30:31] 70; 71; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: 72; GFX10: ; %bb.0: 73; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 74; GFX10-NEXT: v_mov_b32_e32 v1, s20 75; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 76; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc 77; GFX10-NEXT: s_waitcnt vmcnt(0) 78; GFX10-NEXT: buffer_gl1_inv 79; GFX10-NEXT: 
buffer_gl0_inv 80; GFX10-NEXT: s_setpc_b64 s[30:31] 81; 82; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: 83; GFX90A: ; %bb.0: 84; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 85; GFX90A-NEXT: v_mov_b32_e32 v1, v0 86; GFX90A-NEXT: v_mov_b32_e32 v0, s20 87; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 88; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 89; GFX90A-NEXT: s_mov_b64 s[4:5], 0 90; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 91; GFX90A-NEXT: v_mov_b32_e32 v3, s6 92; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start 93; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 94; GFX90A-NEXT: s_waitcnt vmcnt(0) 95; GFX90A-NEXT: v_mov_b32_e32 v5, v0 96; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 97; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 98; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] 99; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 100; GFX90A-NEXT: s_waitcnt vmcnt(0) 101; GFX90A-NEXT: buffer_wbinvl1 102; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 103; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 104; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 105; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 106; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 107; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 108; GFX90A-NEXT: s_setpc_b64 s[30:31] 109; 110; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: 111; GFX908: ; %bb.0: 112; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 113; GFX908-NEXT: v_mov_b32_e32 v1, v0 114; GFX908-NEXT: v_mov_b32_e32 v0, s20 115; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 116; GFX908-NEXT: s_add_i32 s6, s20, 0x400 117; GFX908-NEXT: s_mov_b64 s[4:5], 0 118; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 119; GFX908-NEXT: v_mov_b32_e32 v3, s6 120; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start 121; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 122; GFX908-NEXT: s_waitcnt vmcnt(0) 123; GFX908-NEXT: v_mov_b32_e32 v5, v0 
124; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 125; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 126; GFX908-NEXT: v_mov_b32_e32 v0, v4 127; GFX908-NEXT: v_mov_b32_e32 v1, v5 128; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 129; GFX908-NEXT: s_waitcnt vmcnt(0) 130; GFX908-NEXT: buffer_wbinvl1 131; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 132; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 133; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 134; GFX908-NEXT: s_cbranch_execnz .LBB0_1 135; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 136; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 137; GFX908-NEXT: s_setpc_b64 s[30:31] 138; 139; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: 140; GFX8: ; %bb.0: 141; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 142; GFX8-NEXT: v_mov_b32_e32 v1, v0 143; GFX8-NEXT: v_mov_b32_e32 v0, s20 144; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 145; GFX8-NEXT: s_add_i32 s6, s20, 0x400 146; GFX8-NEXT: s_mov_b64 s[4:5], 0 147; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 148; GFX8-NEXT: v_mov_b32_e32 v3, s6 149; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start 150; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 151; GFX8-NEXT: s_waitcnt vmcnt(0) 152; GFX8-NEXT: v_mov_b32_e32 v5, v0 153; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 154; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 155; GFX8-NEXT: v_mov_b32_e32 v0, v4 156; GFX8-NEXT: v_mov_b32_e32 v1, v5 157; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 158; GFX8-NEXT: s_waitcnt vmcnt(0) 159; GFX8-NEXT: buffer_wbinvl1 160; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 161; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 162; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 163; GFX8-NEXT: s_cbranch_execnz .LBB0_1 164; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 165; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 166; GFX8-NEXT: s_setpc_b64 s[30:31] 167; 168; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: 169; GFX7: ; %bb.0: 170; 
GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 171; GFX7-NEXT: v_mov_b32_e32 v1, s20 172; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc 173; GFX7-NEXT: s_waitcnt vmcnt(0) 174; GFX7-NEXT: buffer_wbinvl1 175; GFX7-NEXT: s_setpc_b64 s[30:31] 176; 177; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory: 178; GFX6: ; %bb.0: 179; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 180; GFX6-NEXT: v_mov_b32_e32 v1, s20 181; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc 182; GFX6-NEXT: s_waitcnt vmcnt(0) 183; GFX6-NEXT: buffer_wbinvl1 184; GFX6-NEXT: s_waitcnt expcnt(0) 185; GFX6-NEXT: s_setpc_b64 s[30:31] 186 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 187 %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 188 ret float %result 189} 190 191define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { 192; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: 193; GFX12: ; %bb.0: 194; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 195; GFX12-NEXT: s_wait_expcnt 0x0 196; GFX12-NEXT: s_wait_samplecnt 0x0 197; GFX12-NEXT: s_wait_bvhcnt 0x0 198; GFX12-NEXT: s_wait_kmcnt 0x0 199; GFX12-NEXT: v_mov_b32_e32 v1, s16 200; GFX12-NEXT: s_wait_storecnt 0x0 201; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 202; GFX12-NEXT: s_wait_storecnt 0x0 203; GFX12-NEXT: global_inv scope:SCOPE_DEV 204; GFX12-NEXT: s_setpc_b64 s[30:31] 205; 206; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: 207; GFX940: ; %bb.0: 208; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 209; GFX940-NEXT: v_mov_b32_e32 v1, s16 210; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 211; GFX940-NEXT: s_add_i32 s6, s16, 0x400 212; 
GFX940-NEXT: s_mov_b64 s[4:5], 0 213; GFX940-NEXT: v_max_f32_e32 v2, v0, v0 214; GFX940-NEXT: v_mov_b32_e32 v3, s6 215; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start 216; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 217; GFX940-NEXT: s_waitcnt vmcnt(0) 218; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 219; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 220; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] 221; GFX940-NEXT: buffer_wbl2 sc1 222; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 223; GFX940-NEXT: s_waitcnt vmcnt(0) 224; GFX940-NEXT: buffer_inv sc1 225; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 226; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 227; GFX940-NEXT: v_mov_b32_e32 v1, v4 228; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] 229; GFX940-NEXT: s_cbranch_execnz .LBB1_1 230; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 231; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] 232; GFX940-NEXT: s_setpc_b64 s[30:31] 233; 234; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: 235; GFX11: ; %bb.0: 236; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 237; GFX11-NEXT: v_mov_b32_e32 v1, s16 238; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 239; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 240; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 241; GFX11-NEXT: buffer_gl1_inv 242; GFX11-NEXT: buffer_gl0_inv 243; GFX11-NEXT: s_setpc_b64 s[30:31] 244; 245; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: 246; GFX10: ; %bb.0: 247; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 248; GFX10-NEXT: v_mov_b32_e32 v1, s20 249; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 250; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 251; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 252; GFX10-NEXT: buffer_gl1_inv 253; GFX10-NEXT: buffer_gl0_inv 254; GFX10-NEXT: s_setpc_b64 s[30:31] 255; 256; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: 257; 
GFX90A: ; %bb.0: 258; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 259; GFX90A-NEXT: v_mov_b32_e32 v1, s20 260; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 261; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 262; GFX90A-NEXT: s_mov_b64 s[4:5], 0 263; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 264; GFX90A-NEXT: v_mov_b32_e32 v3, s6 265; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start 266; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 267; GFX90A-NEXT: s_waitcnt vmcnt(0) 268; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 269; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 270; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] 271; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 272; GFX90A-NEXT: s_waitcnt vmcnt(0) 273; GFX90A-NEXT: buffer_wbinvl1 274; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 275; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 276; GFX90A-NEXT: v_mov_b32_e32 v1, v4 277; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 278; GFX90A-NEXT: s_cbranch_execnz .LBB1_1 279; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 280; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 281; GFX90A-NEXT: s_setpc_b64 s[30:31] 282; 283; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: 284; GFX908: ; %bb.0: 285; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 286; GFX908-NEXT: v_mov_b32_e32 v1, s20 287; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 288; GFX908-NEXT: s_add_i32 s6, s20, 0x400 289; GFX908-NEXT: s_mov_b64 s[4:5], 0 290; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 291; GFX908-NEXT: v_mov_b32_e32 v3, s6 292; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start 293; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 294; GFX908-NEXT: s_waitcnt vmcnt(0) 295; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 296; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 297; GFX908-NEXT: v_mov_b32_e32 v5, v1 298; GFX908-NEXT: v_mov_b32_e32 v4, v0 299; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 300; GFX908-NEXT: 
s_waitcnt vmcnt(0) 301; GFX908-NEXT: buffer_wbinvl1 302; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 303; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 304; GFX908-NEXT: v_mov_b32_e32 v1, v4 305; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 306; GFX908-NEXT: s_cbranch_execnz .LBB1_1 307; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 308; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 309; GFX908-NEXT: s_setpc_b64 s[30:31] 310; 311; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: 312; GFX8: ; %bb.0: 313; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 314; GFX8-NEXT: v_mov_b32_e32 v1, s20 315; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 316; GFX8-NEXT: s_add_i32 s6, s20, 0x400 317; GFX8-NEXT: s_mov_b64 s[4:5], 0 318; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 319; GFX8-NEXT: v_mov_b32_e32 v3, s6 320; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start 321; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 322; GFX8-NEXT: s_waitcnt vmcnt(0) 323; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 324; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 325; GFX8-NEXT: v_mov_b32_e32 v5, v1 326; GFX8-NEXT: v_mov_b32_e32 v4, v0 327; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 328; GFX8-NEXT: s_waitcnt vmcnt(0) 329; GFX8-NEXT: buffer_wbinvl1 330; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 331; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 332; GFX8-NEXT: v_mov_b32_e32 v1, v4 333; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 334; GFX8-NEXT: s_cbranch_execnz .LBB1_1 335; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 336; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 337; GFX8-NEXT: s_setpc_b64 s[30:31] 338; 339; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: 340; GFX7: ; %bb.0: 341; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 342; GFX7-NEXT: v_mov_b32_e32 v1, s20 343; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 344; GFX7-NEXT: s_waitcnt vmcnt(0) 345; GFX7-NEXT: buffer_wbinvl1 346; GFX7-NEXT: 
s_setpc_b64 s[30:31] 347; 348; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_grained_memory: 349; GFX6: ; %bb.0: 350; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 351; GFX6-NEXT: v_mov_b32_e32 v1, s20 352; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 353; GFX6-NEXT: s_waitcnt vmcnt(0) 354; GFX6-NEXT: buffer_wbinvl1 355; GFX6-NEXT: s_waitcnt expcnt(0) 356; GFX6-NEXT: s_setpc_b64 s[30:31] 357 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 358 %unused = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 359 ret void 360} 361 362define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, float %val) #0 { 363; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: 364; GFX12: ; %bb.0: 365; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 366; GFX12-NEXT: s_wait_expcnt 0x0 367; GFX12-NEXT: s_wait_samplecnt 0x0 368; GFX12-NEXT: s_wait_bvhcnt 0x0 369; GFX12-NEXT: s_wait_kmcnt 0x0 370; GFX12-NEXT: s_mov_b32 s1, exec_lo 371; GFX12-NEXT: s_wait_storecnt 0x0 372; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 373; GFX12-NEXT: v_readfirstlane_b32 s4, v0 374; GFX12-NEXT: v_readfirstlane_b32 s5, v1 375; GFX12-NEXT: v_readfirstlane_b32 s6, v2 376; GFX12-NEXT: v_readfirstlane_b32 s7, v3 377; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 378; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 379; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 380; GFX12-NEXT: s_wait_alu 0xfffe 381; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 382; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 383; GFX12-NEXT: s_wait_alu 0xfffe 384; GFX12-NEXT: s_and_saveexec_b32 s0, s0 385; GFX12-NEXT: s_wait_loadcnt 0x0 386; GFX12-NEXT: buffer_atomic_min_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN 387; GFX12-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 388; GFX12-NEXT: ; implicit-def: $vgpr4 389; GFX12-NEXT: s_wait_alu 0xfffe 390; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 391; GFX12-NEXT: s_cbranch_execnz .LBB2_1 392; GFX12-NEXT: ; %bb.2: 393; GFX12-NEXT: s_mov_b32 exec_lo, s1 394; GFX12-NEXT: s_wait_loadcnt 0x0 395; GFX12-NEXT: v_mov_b32_e32 v0, v5 396; GFX12-NEXT: global_inv scope:SCOPE_DEV 397; GFX12-NEXT: s_wait_alu 0xfffe 398; GFX12-NEXT: s_setpc_b64 s[30:31] 399; 400; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: 401; GFX940: ; %bb.0: 402; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 403; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 404; GFX940-NEXT: s_mov_b64 s[2:3], exec 405; GFX940-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 406; GFX940-NEXT: v_readfirstlane_b32 s4, v0 407; GFX940-NEXT: v_readfirstlane_b32 s5, v1 408; GFX940-NEXT: v_readfirstlane_b32 s6, v2 409; GFX940-NEXT: v_readfirstlane_b32 s7, v3 410; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 411; GFX940-NEXT: s_nop 0 412; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 413; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 414; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 415; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 416; GFX940-NEXT: ; implicit-def: $vgpr4 417; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 418; GFX940-NEXT: s_cbranch_execnz .LBB2_1 419; GFX940-NEXT: ; %bb.2: 420; GFX940-NEXT: s_mov_b64 exec, s[2:3] 421; GFX940-NEXT: s_mov_b64 s[2:3], 0 422; GFX940-NEXT: v_max_f32_e32 v9, v5, v5 423; GFX940-NEXT: .LBB2_3: ; %atomicrmw.start 424; GFX940-NEXT: ; =>This Loop Header: Depth=1 425; GFX940-NEXT: ; Child Loop BB2_4 Depth 2 426; GFX940-NEXT: s_waitcnt vmcnt(0) 427; GFX940-NEXT: v_max_f32_e32 v4, v7, v7 428; GFX940-NEXT: v_min_f32_e32 v6, v4, v9 429; GFX940-NEXT: s_mov_b64 s[8:9], exec 430; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] 431; GFX940-NEXT: buffer_wbl2 sc1 432; GFX940-NEXT: .LBB2_4: ; Parent 
Loop BB2_3 Depth=1 433; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 434; GFX940-NEXT: v_readfirstlane_b32 s4, v0 435; GFX940-NEXT: v_readfirstlane_b32 s5, v1 436; GFX940-NEXT: v_readfirstlane_b32 s6, v2 437; GFX940-NEXT: v_readfirstlane_b32 s7, v3 438; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 439; GFX940-NEXT: s_nop 0 440; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 441; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 442; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 443; GFX940-NEXT: s_waitcnt vmcnt(0) 444; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 445; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 446; GFX940-NEXT: s_cbranch_execnz .LBB2_4 447; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 448; GFX940-NEXT: s_mov_b64 exec, s[8:9] 449; GFX940-NEXT: s_waitcnt vmcnt(0) 450; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 451; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 452; GFX940-NEXT: v_mov_b32_e32 v7, v4 453; GFX940-NEXT: buffer_inv sc1 454; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 455; GFX940-NEXT: s_cbranch_execnz .LBB2_3 456; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end 457; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 458; GFX940-NEXT: v_mov_b32_e32 v0, v4 459; GFX940-NEXT: s_setpc_b64 s[30:31] 460; 461; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: 462; GFX11: ; %bb.0: 463; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 464; GFX11-NEXT: s_mov_b32 s1, exec_lo 465; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 466; GFX11-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 467; GFX11-NEXT: v_readfirstlane_b32 s4, v0 468; GFX11-NEXT: v_readfirstlane_b32 s5, v1 469; GFX11-NEXT: v_readfirstlane_b32 s6, v2 470; GFX11-NEXT: v_readfirstlane_b32 s7, v3 471; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 472; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 473; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 474; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 475; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 476; GFX11-NEXT: s_and_saveexec_b32 s0, s0 477; GFX11-NEXT: s_waitcnt vmcnt(0) 478; GFX11-NEXT: buffer_atomic_min_f32 v5, v4, s[4:7], 0 offen offset:1024 glc 479; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 480; GFX11-NEXT: ; implicit-def: $vgpr4 481; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 482; GFX11-NEXT: s_cbranch_execnz .LBB2_1 483; GFX11-NEXT: ; %bb.2: 484; GFX11-NEXT: s_mov_b32 exec_lo, s1 485; GFX11-NEXT: s_waitcnt vmcnt(0) 486; GFX11-NEXT: v_mov_b32_e32 v0, v5 487; GFX11-NEXT: buffer_gl1_inv 488; GFX11-NEXT: buffer_gl0_inv 489; GFX11-NEXT: s_setpc_b64 s[30:31] 490; 491; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: 492; GFX10: ; %bb.0: 493; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 494; GFX10-NEXT: s_mov_b32 s5, exec_lo 495; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 496; GFX10-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 497; GFX10-NEXT: v_readfirstlane_b32 s8, v0 498; GFX10-NEXT: v_readfirstlane_b32 s9, v1 499; GFX10-NEXT: v_readfirstlane_b32 s10, v2 500; GFX10-NEXT: v_readfirstlane_b32 s11, v3 501; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 502; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 503; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 504; GFX10-NEXT: s_and_saveexec_b32 s4, s4 505; GFX10-NEXT: s_waitcnt vmcnt(0) 506; GFX10-NEXT: buffer_atomic_fmin v5, v4, s[8:11], 0 offen offset:1024 glc 507; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 508; GFX10-NEXT: ; implicit-def: $vgpr4 509; GFX10-NEXT: s_waitcnt_depctr 0xffe3 510; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 511; GFX10-NEXT: s_cbranch_execnz .LBB2_1 512; GFX10-NEXT: ; %bb.2: 513; GFX10-NEXT: s_mov_b32 exec_lo, s5 514; GFX10-NEXT: s_waitcnt vmcnt(0) 515; GFX10-NEXT: v_mov_b32_e32 v0, v5 516; GFX10-NEXT: buffer_gl1_inv 517; GFX10-NEXT: buffer_gl0_inv 518; GFX10-NEXT: s_setpc_b64 s[30:31] 519; 520; 
GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: 521; GFX90A: ; %bb.0: 522; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 523; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 524; GFX90A-NEXT: s_mov_b64 s[6:7], exec 525; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 526; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 527; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 528; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 529; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 530; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 531; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 532; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 533; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 534; GFX90A-NEXT: s_nop 0 535; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 536; GFX90A-NEXT: ; implicit-def: $vgpr4 537; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 538; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 539; GFX90A-NEXT: ; %bb.2: 540; GFX90A-NEXT: s_mov_b64 exec, s[6:7] 541; GFX90A-NEXT: s_mov_b64 s[6:7], 0 542; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5 543; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start 544; GFX90A-NEXT: ; =>This Loop Header: Depth=1 545; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2 546; GFX90A-NEXT: s_waitcnt vmcnt(0) 547; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7 548; GFX90A-NEXT: v_min_f32_e32 v6, v4, v9 549; GFX90A-NEXT: s_mov_b64 s[12:13], exec 550; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] 551; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 552; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 553; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 554; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 555; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 556; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 557; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 558; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 559; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 560; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 561; 
GFX90A-NEXT: s_waitcnt vmcnt(0) 562; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 563; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 564; GFX90A-NEXT: s_cbranch_execnz .LBB2_4 565; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 566; GFX90A-NEXT: s_mov_b64 exec, s[12:13] 567; GFX90A-NEXT: s_waitcnt vmcnt(0) 568; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 569; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 570; GFX90A-NEXT: v_mov_b32_e32 v7, v4 571; GFX90A-NEXT: buffer_wbinvl1 572; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 573; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 574; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end 575; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 576; GFX90A-NEXT: v_mov_b32_e32 v0, v4 577; GFX90A-NEXT: s_setpc_b64 s[30:31] 578; 579; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: 580; GFX908: ; %bb.0: 581; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 582; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 583; GFX908-NEXT: s_mov_b64 s[6:7], exec 584; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 585; GFX908-NEXT: v_readfirstlane_b32 s8, v0 586; GFX908-NEXT: v_readfirstlane_b32 s9, v1 587; GFX908-NEXT: v_readfirstlane_b32 s10, v2 588; GFX908-NEXT: v_readfirstlane_b32 s11, v3 589; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 590; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 591; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 592; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 593; GFX908-NEXT: s_nop 0 594; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 595; GFX908-NEXT: ; implicit-def: $vgpr4 596; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 597; GFX908-NEXT: s_cbranch_execnz .LBB2_1 598; GFX908-NEXT: ; %bb.2: 599; GFX908-NEXT: s_mov_b64 exec, s[6:7] 600; GFX908-NEXT: s_mov_b64 s[6:7], 0 601; GFX908-NEXT: v_max_f32_e32 v8, v5, v5 602; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start 603; GFX908-NEXT: ; =>This Loop Header: Depth=1 604; GFX908-NEXT: ; 
Child Loop BB2_4 Depth 2 605; GFX908-NEXT: s_waitcnt vmcnt(0) 606; GFX908-NEXT: v_max_f32_e32 v4, v6, v6 607; GFX908-NEXT: v_min_f32_e32 v5, v4, v8 608; GFX908-NEXT: v_mov_b32_e32 v4, v5 609; GFX908-NEXT: s_mov_b64 s[12:13], exec 610; GFX908-NEXT: v_mov_b32_e32 v5, v6 611; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 612; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 613; GFX908-NEXT: v_readfirstlane_b32 s8, v0 614; GFX908-NEXT: v_readfirstlane_b32 s9, v1 615; GFX908-NEXT: v_readfirstlane_b32 s10, v2 616; GFX908-NEXT: v_readfirstlane_b32 s11, v3 617; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 618; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 619; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 620; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 621; GFX908-NEXT: s_waitcnt vmcnt(0) 622; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc 623; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 624; GFX908-NEXT: s_cbranch_execnz .LBB2_4 625; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 626; GFX908-NEXT: s_mov_b64 exec, s[12:13] 627; GFX908-NEXT: s_waitcnt vmcnt(0) 628; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 629; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 630; GFX908-NEXT: v_mov_b32_e32 v6, v4 631; GFX908-NEXT: buffer_wbinvl1 632; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 633; GFX908-NEXT: s_cbranch_execnz .LBB2_3 634; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end 635; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 636; GFX908-NEXT: v_mov_b32_e32 v0, v4 637; GFX908-NEXT: s_setpc_b64 s[30:31] 638; 639; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: 640; GFX8: ; %bb.0: 641; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 642; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 643; GFX8-NEXT: s_mov_b64 s[6:7], exec 644; GFX8-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 645; GFX8-NEXT: v_readfirstlane_b32 s8, v0 646; GFX8-NEXT: v_readfirstlane_b32 s9, v1 647; GFX8-NEXT: 
v_readfirstlane_b32 s10, v2 648; GFX8-NEXT: v_readfirstlane_b32 s11, v3 649; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 650; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 651; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 652; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 653; GFX8-NEXT: s_nop 0 654; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 655; GFX8-NEXT: ; implicit-def: $vgpr4 656; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 657; GFX8-NEXT: s_cbranch_execnz .LBB2_1 658; GFX8-NEXT: ; %bb.2: 659; GFX8-NEXT: s_mov_b64 exec, s[6:7] 660; GFX8-NEXT: s_mov_b64 s[6:7], 0 661; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v5 662; GFX8-NEXT: .LBB2_3: ; %atomicrmw.start 663; GFX8-NEXT: ; =>This Loop Header: Depth=1 664; GFX8-NEXT: ; Child Loop BB2_4 Depth 2 665; GFX8-NEXT: s_waitcnt vmcnt(0) 666; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v6 667; GFX8-NEXT: v_min_f32_e32 v5, v4, v8 668; GFX8-NEXT: v_mov_b32_e32 v4, v5 669; GFX8-NEXT: s_mov_b64 s[12:13], exec 670; GFX8-NEXT: v_mov_b32_e32 v5, v6 671; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 672; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 673; GFX8-NEXT: v_readfirstlane_b32 s8, v0 674; GFX8-NEXT: v_readfirstlane_b32 s9, v1 675; GFX8-NEXT: v_readfirstlane_b32 s10, v2 676; GFX8-NEXT: v_readfirstlane_b32 s11, v3 677; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 678; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 679; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 680; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 681; GFX8-NEXT: s_waitcnt vmcnt(0) 682; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc 683; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 684; GFX8-NEXT: s_cbranch_execnz .LBB2_4 685; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 686; GFX8-NEXT: s_mov_b64 exec, s[12:13] 687; GFX8-NEXT: s_waitcnt vmcnt(0) 688; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 689; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 690; GFX8-NEXT: v_mov_b32_e32 v6, v4 691; GFX8-NEXT: buffer_wbinvl1 692; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[6:7] 693; GFX8-NEXT: s_cbranch_execnz .LBB2_3 694; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end 695; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 696; GFX8-NEXT: v_mov_b32_e32 v0, v4 697; GFX8-NEXT: s_setpc_b64 s[30:31] 698; 699; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: 700; GFX7: ; %bb.0: 701; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 702; GFX7-NEXT: s_mov_b64 s[6:7], exec 703; GFX7-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 704; GFX7-NEXT: v_readfirstlane_b32 s8, v0 705; GFX7-NEXT: v_readfirstlane_b32 s9, v1 706; GFX7-NEXT: v_readfirstlane_b32 s10, v2 707; GFX7-NEXT: v_readfirstlane_b32 s11, v3 708; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 709; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 710; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 711; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 712; GFX7-NEXT: s_waitcnt vmcnt(0) 713; GFX7-NEXT: buffer_atomic_fmin v5, v4, s[8:11], 0 offen offset:1024 glc 714; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 715; GFX7-NEXT: ; implicit-def: $vgpr4 716; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 717; GFX7-NEXT: s_cbranch_execnz .LBB2_1 718; GFX7-NEXT: ; %bb.2: 719; GFX7-NEXT: s_mov_b64 exec, s[6:7] 720; GFX7-NEXT: s_waitcnt vmcnt(0) 721; GFX7-NEXT: v_mov_b32_e32 v0, v5 722; GFX7-NEXT: buffer_wbinvl1 723; GFX7-NEXT: s_setpc_b64 s[30:31] 724; 725; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: 726; GFX6: ; %bb.0: 727; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 728; GFX6-NEXT: s_mov_b64 s[6:7], exec 729; GFX6-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 730; GFX6-NEXT: v_readfirstlane_b32 s8, v0 731; GFX6-NEXT: v_readfirstlane_b32 s9, v1 732; GFX6-NEXT: v_readfirstlane_b32 s10, v2 733; GFX6-NEXT: v_readfirstlane_b32 s11, v3 734; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 735; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 736; GFX6-NEXT: 
s_and_b64 s[4:5], vcc, s[4:5] 737; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 738; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 739; GFX6-NEXT: buffer_atomic_fmin v5, v4, s[8:11], 0 offen offset:1024 glc 740; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 741; GFX6-NEXT: ; implicit-def: $vgpr4 742; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 743; GFX6-NEXT: s_cbranch_execnz .LBB2_1 744; GFX6-NEXT: ; %bb.2: 745; GFX6-NEXT: s_mov_b64 exec, s[6:7] 746; GFX6-NEXT: s_waitcnt vmcnt(0) 747; GFX6-NEXT: v_mov_b32_e32 v0, v5 748; GFX6-NEXT: buffer_wbinvl1 749; GFX6-NEXT: s_waitcnt expcnt(0) 750; GFX6-NEXT: s_setpc_b64 s[30:31] 751 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 752 %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 753 ret float %result 754} 755 756define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { 757; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: 758; GFX12: ; %bb.0: 759; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 760; GFX12-NEXT: s_wait_expcnt 0x0 761; GFX12-NEXT: s_wait_samplecnt 0x0 762; GFX12-NEXT: s_wait_bvhcnt 0x0 763; GFX12-NEXT: s_wait_kmcnt 0x0 764; GFX12-NEXT: v_mov_b32_e32 v1, s16 765; GFX12-NEXT: s_wait_storecnt 0x0 766; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN 767; GFX12-NEXT: s_wait_loadcnt 0x0 768; GFX12-NEXT: global_inv scope:SCOPE_DEV 769; GFX12-NEXT: s_setpc_b64 s[30:31] 770; 771; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: 772; GFX940: ; %bb.0: 773; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 774; GFX940-NEXT: v_mov_b32_e32 v1, v0 775; GFX940-NEXT: v_mov_b32_e32 v0, s16 776; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 777; GFX940-NEXT: s_add_i32 s6, s16, 0x400 778; GFX940-NEXT: s_mov_b64 s[4:5], 0 779; 
GFX940-NEXT: v_max_f32_e32 v2, v1, v1 780; GFX940-NEXT: v_mov_b32_e32 v3, s6 781; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start 782; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 783; GFX940-NEXT: s_waitcnt vmcnt(0) 784; GFX940-NEXT: v_mov_b32_e32 v5, v0 785; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 786; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 787; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] 788; GFX940-NEXT: buffer_wbl2 sc1 789; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 790; GFX940-NEXT: s_waitcnt vmcnt(0) 791; GFX940-NEXT: buffer_inv sc1 792; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 793; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 794; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] 795; GFX940-NEXT: s_cbranch_execnz .LBB3_1 796; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 797; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] 798; GFX940-NEXT: s_setpc_b64 s[30:31] 799; 800; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: 801; GFX11: ; %bb.0: 802; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 803; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 804; GFX11-NEXT: s_add_i32 s4, s16, 0x400 805; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) 806; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 807; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 808; GFX11-NEXT: s_mov_b32 s4, 0 809; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start 810; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 811; GFX11-NEXT: s_waitcnt vmcnt(0) 812; GFX11-NEXT: v_mov_b32_e32 v5, v0 813; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 814; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 815; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 816; GFX11-NEXT: v_min_f32_e32 v4, v0, v2 817; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 818; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 819; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc 820; GFX11-NEXT: 
s_waitcnt vmcnt(0) 821; GFX11-NEXT: buffer_gl1_inv 822; GFX11-NEXT: buffer_gl0_inv 823; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 824; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 825; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 826; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 827; GFX11-NEXT: s_cbranch_execnz .LBB3_1 828; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 829; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 830; GFX11-NEXT: s_setpc_b64 s[30:31] 831; 832; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: 833; GFX10: ; %bb.0: 834; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 835; GFX10-NEXT: v_mov_b32_e32 v1, v0 836; GFX10-NEXT: v_mov_b32_e32 v0, s20 837; GFX10-NEXT: s_add_i32 s4, s20, 0x400 838; GFX10-NEXT: v_mov_b32_e32 v3, s4 839; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 840; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 841; GFX10-NEXT: s_mov_b32 s4, 0 842; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start 843; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 844; GFX10-NEXT: s_waitcnt vmcnt(0) 845; GFX10-NEXT: v_mov_b32_e32 v5, v0 846; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 847; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 848; GFX10-NEXT: v_min_f32_e32 v4, v0, v2 849; GFX10-NEXT: v_mov_b32_e32 v0, v4 850; GFX10-NEXT: v_mov_b32_e32 v1, v5 851; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 852; GFX10-NEXT: s_waitcnt vmcnt(0) 853; GFX10-NEXT: buffer_gl1_inv 854; GFX10-NEXT: buffer_gl0_inv 855; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 856; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 857; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 858; GFX10-NEXT: s_cbranch_execnz .LBB3_1 859; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 860; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 861; GFX10-NEXT: s_setpc_b64 s[30:31] 862; 863; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: 864; GFX90A: ; %bb.0: 865; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 866; GFX90A-NEXT: v_mov_b32_e32 
v1, v0 867; GFX90A-NEXT: v_mov_b32_e32 v0, s20 868; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 869; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 870; GFX90A-NEXT: s_mov_b64 s[4:5], 0 871; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 872; GFX90A-NEXT: v_mov_b32_e32 v3, s6 873; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start 874; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 875; GFX90A-NEXT: s_waitcnt vmcnt(0) 876; GFX90A-NEXT: v_mov_b32_e32 v5, v0 877; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 878; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 879; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] 880; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 881; GFX90A-NEXT: s_waitcnt vmcnt(0) 882; GFX90A-NEXT: buffer_wbinvl1 883; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 884; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 885; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 886; GFX90A-NEXT: s_cbranch_execnz .LBB3_1 887; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 888; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 889; GFX90A-NEXT: s_setpc_b64 s[30:31] 890; 891; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: 892; GFX908: ; %bb.0: 893; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 894; GFX908-NEXT: v_mov_b32_e32 v1, v0 895; GFX908-NEXT: v_mov_b32_e32 v0, s20 896; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 897; GFX908-NEXT: s_add_i32 s6, s20, 0x400 898; GFX908-NEXT: s_mov_b64 s[4:5], 0 899; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 900; GFX908-NEXT: v_mov_b32_e32 v3, s6 901; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start 902; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 903; GFX908-NEXT: s_waitcnt vmcnt(0) 904; GFX908-NEXT: v_mov_b32_e32 v5, v0 905; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 906; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 907; GFX908-NEXT: v_mov_b32_e32 v0, v4 908; GFX908-NEXT: v_mov_b32_e32 v1, v5 909; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 910; GFX908-NEXT: 
s_waitcnt vmcnt(0) 911; GFX908-NEXT: buffer_wbinvl1 912; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 913; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 914; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 915; GFX908-NEXT: s_cbranch_execnz .LBB3_1 916; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 917; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 918; GFX908-NEXT: s_setpc_b64 s[30:31] 919; 920; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: 921; GFX8: ; %bb.0: 922; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 923; GFX8-NEXT: v_mov_b32_e32 v1, v0 924; GFX8-NEXT: v_mov_b32_e32 v0, s20 925; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 926; GFX8-NEXT: s_add_i32 s6, s20, 0x400 927; GFX8-NEXT: s_mov_b64 s[4:5], 0 928; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 929; GFX8-NEXT: v_mov_b32_e32 v3, s6 930; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start 931; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 932; GFX8-NEXT: s_waitcnt vmcnt(0) 933; GFX8-NEXT: v_mov_b32_e32 v5, v0 934; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 935; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 936; GFX8-NEXT: v_mov_b32_e32 v0, v4 937; GFX8-NEXT: v_mov_b32_e32 v1, v5 938; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 939; GFX8-NEXT: s_waitcnt vmcnt(0) 940; GFX8-NEXT: buffer_wbinvl1 941; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 942; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 943; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 944; GFX8-NEXT: s_cbranch_execnz .LBB3_1 945; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 946; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 947; GFX8-NEXT: s_setpc_b64 s[30:31] 948; 949; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: 950; GFX7: ; %bb.0: 951; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 952; GFX7-NEXT: v_mov_b32_e32 v1, v0 953; GFX7-NEXT: v_mov_b32_e32 v0, s20 954; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 955; GFX7-NEXT: s_add_i32 s6, s20, 0x400 956; GFX7-NEXT: s_mov_b64 
s[4:5], 0 957; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 958; GFX7-NEXT: v_mov_b32_e32 v3, s6 959; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start 960; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 961; GFX7-NEXT: s_waitcnt vmcnt(0) 962; GFX7-NEXT: v_mov_b32_e32 v5, v0 963; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 964; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 965; GFX7-NEXT: v_mov_b32_e32 v0, v4 966; GFX7-NEXT: v_mov_b32_e32 v1, v5 967; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 968; GFX7-NEXT: s_waitcnt vmcnt(0) 969; GFX7-NEXT: buffer_wbinvl1 970; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 971; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 972; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 973; GFX7-NEXT: s_cbranch_execnz .LBB3_1 974; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 975; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 976; GFX7-NEXT: s_setpc_b64 s[30:31] 977; 978; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote_memory: 979; GFX6: ; %bb.0: 980; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 981; GFX6-NEXT: v_mov_b32_e32 v1, v0 982; GFX6-NEXT: v_mov_b32_e32 v0, s20 983; GFX6-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 984; GFX6-NEXT: s_add_i32 s6, s20, 0x400 985; GFX6-NEXT: s_mov_b64 s[4:5], 0 986; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 987; GFX6-NEXT: v_mov_b32_e32 v3, s6 988; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start 989; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 990; GFX6-NEXT: s_waitcnt vmcnt(0) 991; GFX6-NEXT: v_mov_b32_e32 v5, v0 992; GFX6-NEXT: s_waitcnt expcnt(0) 993; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 994; GFX6-NEXT: v_min_f32_e32 v4, v0, v2 995; GFX6-NEXT: v_mov_b32_e32 v0, v4 996; GFX6-NEXT: v_mov_b32_e32 v1, v5 997; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 998; GFX6-NEXT: s_waitcnt vmcnt(0) 999; GFX6-NEXT: buffer_wbinvl1 1000; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1001; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1002; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 1003; GFX6-NEXT: 
s_cbranch_execnz .LBB3_1 1004; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 1005; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 1006; GFX6-NEXT: s_waitcnt expcnt(0) 1007; GFX6-NEXT: s_setpc_b64 s[30:31] 1008 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 1009 %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 1010 ret float %result 1011} 1012 1013define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 { 1014; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1015; GFX12: ; %bb.0: 1016; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1017; GFX12-NEXT: s_wait_expcnt 0x0 1018; GFX12-NEXT: s_wait_samplecnt 0x0 1019; GFX12-NEXT: s_wait_bvhcnt 0x0 1020; GFX12-NEXT: s_wait_kmcnt 0x0 1021; GFX12-NEXT: v_mov_b32_e32 v1, s16 1022; GFX12-NEXT: s_wait_storecnt 0x0 1023; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN 1024; GFX12-NEXT: s_wait_loadcnt 0x0 1025; GFX12-NEXT: global_inv scope:SCOPE_DEV 1026; GFX12-NEXT: s_setpc_b64 s[30:31] 1027; 1028; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1029; GFX940: ; %bb.0: 1030; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1031; GFX940-NEXT: v_mov_b32_e32 v1, v0 1032; GFX940-NEXT: v_mov_b32_e32 v0, s16 1033; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 1034; GFX940-NEXT: s_add_i32 s6, s16, 0x400 1035; GFX940-NEXT: s_mov_b64 s[4:5], 0 1036; GFX940-NEXT: v_max_f32_e32 v2, v1, v1 1037; GFX940-NEXT: v_mov_b32_e32 v3, s6 1038; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start 1039; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 1040; GFX940-NEXT: s_waitcnt vmcnt(0) 1041; GFX940-NEXT: v_mov_b32_e32 v5, v0 1042; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 1043; GFX940-NEXT: 
v_min_f32_e32 v4, v0, v2 1044; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] 1045; GFX940-NEXT: buffer_wbl2 sc1 1046; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 1047; GFX940-NEXT: s_waitcnt vmcnt(0) 1048; GFX940-NEXT: buffer_inv sc1 1049; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1050; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1051; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] 1052; GFX940-NEXT: s_cbranch_execnz .LBB4_1 1053; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 1054; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] 1055; GFX940-NEXT: s_setpc_b64 s[30:31] 1056; 1057; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1058; GFX11: ; %bb.0: 1059; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1060; GFX11-NEXT: v_mov_b32_e32 v1, s16 1061; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1062; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc 1063; GFX11-NEXT: s_waitcnt vmcnt(0) 1064; GFX11-NEXT: buffer_gl1_inv 1065; GFX11-NEXT: buffer_gl0_inv 1066; GFX11-NEXT: s_setpc_b64 s[30:31] 1067; 1068; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1069; GFX10: ; %bb.0: 1070; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1071; GFX10-NEXT: v_mov_b32_e32 v1, s20 1072; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1073; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc 1074; GFX10-NEXT: s_waitcnt vmcnt(0) 1075; GFX10-NEXT: buffer_gl1_inv 1076; GFX10-NEXT: buffer_gl0_inv 1077; GFX10-NEXT: s_setpc_b64 s[30:31] 1078; 1079; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1080; GFX90A: ; %bb.0: 1081; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1082; GFX90A-NEXT: v_mov_b32_e32 v1, v0 1083; GFX90A-NEXT: v_mov_b32_e32 v0, s20 1084; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1085; GFX90A-NEXT: 
s_add_i32 s6, s20, 0x400 1086; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1087; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 1088; GFX90A-NEXT: v_mov_b32_e32 v3, s6 1089; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start 1090; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1091; GFX90A-NEXT: s_waitcnt vmcnt(0) 1092; GFX90A-NEXT: v_mov_b32_e32 v5, v0 1093; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 1094; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 1095; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] 1096; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1097; GFX90A-NEXT: s_waitcnt vmcnt(0) 1098; GFX90A-NEXT: buffer_wbinvl1 1099; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1100; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1101; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1102; GFX90A-NEXT: s_cbranch_execnz .LBB4_1 1103; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1104; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1105; GFX90A-NEXT: s_setpc_b64 s[30:31] 1106; 1107; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1108; GFX908: ; %bb.0: 1109; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1110; GFX908-NEXT: v_mov_b32_e32 v1, v0 1111; GFX908-NEXT: v_mov_b32_e32 v0, s20 1112; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1113; GFX908-NEXT: s_add_i32 s6, s20, 0x400 1114; GFX908-NEXT: s_mov_b64 s[4:5], 0 1115; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 1116; GFX908-NEXT: v_mov_b32_e32 v3, s6 1117; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start 1118; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1119; GFX908-NEXT: s_waitcnt vmcnt(0) 1120; GFX908-NEXT: v_mov_b32_e32 v5, v0 1121; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 1122; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 1123; GFX908-NEXT: v_mov_b32_e32 v0, v4 1124; GFX908-NEXT: v_mov_b32_e32 v1, v5 1125; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1126; GFX908-NEXT: s_waitcnt vmcnt(0) 1127; GFX908-NEXT: buffer_wbinvl1 1128; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1129; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1130; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1131; GFX908-NEXT: s_cbranch_execnz .LBB4_1 1132; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1133; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1134; GFX908-NEXT: s_setpc_b64 s[30:31] 1135; 1136; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1137; GFX8: ; %bb.0: 1138; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1139; GFX8-NEXT: v_mov_b32_e32 v1, v0 1140; GFX8-NEXT: v_mov_b32_e32 v0, s20 1141; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 1142; GFX8-NEXT: s_add_i32 s6, s20, 0x400 1143; GFX8-NEXT: s_mov_b64 s[4:5], 0 1144; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 1145; GFX8-NEXT: v_mov_b32_e32 v3, s6 1146; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start 1147; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1148; GFX8-NEXT: s_waitcnt vmcnt(0) 1149; GFX8-NEXT: v_mov_b32_e32 v5, v0 1150; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 1151; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 1152; GFX8-NEXT: v_mov_b32_e32 v0, v4 1153; GFX8-NEXT: v_mov_b32_e32 v1, v5 1154; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 1155; GFX8-NEXT: s_waitcnt vmcnt(0) 1156; GFX8-NEXT: buffer_wbinvl1 1157; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1158; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1159; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1160; GFX8-NEXT: s_cbranch_execnz .LBB4_1 1161; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1162; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1163; GFX8-NEXT: s_setpc_b64 s[30:31] 1164; 1165; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1166; GFX7: ; %bb.0: 1167; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1168; GFX7-NEXT: v_mov_b32_e32 v1, s20 1169; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc 1170; GFX7-NEXT: s_waitcnt vmcnt(0) 1171; GFX7-NEXT: 
buffer_wbinvl1 1172; GFX7-NEXT: s_setpc_b64 s[30:31] 1173; 1174; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1175; GFX6: ; %bb.0: 1176; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1177; GFX6-NEXT: v_mov_b32_e32 v1, s20 1178; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc 1179; GFX6-NEXT: s_waitcnt vmcnt(0) 1180; GFX6-NEXT: buffer_wbinvl1 1181; GFX6-NEXT: s_waitcnt expcnt(0) 1182; GFX6-NEXT: s_setpc_b64 s[30:31] 1183 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 1184 %result = atomicrmw fmin ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 1185 ret float %result 1186} 1187 1188; -------------------------------------------------------------------- 1189; double 1190; -------------------------------------------------------------------- 1191 1192define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { 1193; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: 1194; GFX12: ; %bb.0: 1195; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1196; GFX12-NEXT: s_wait_expcnt 0x0 1197; GFX12-NEXT: s_wait_samplecnt 0x0 1198; GFX12-NEXT: s_wait_bvhcnt 0x0 1199; GFX12-NEXT: s_wait_kmcnt 0x0 1200; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 1201; GFX12-NEXT: v_mov_b32_e32 v0, s16 1202; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 1203; GFX12-NEXT: s_wait_alu 0xfffe 1204; GFX12-NEXT: v_mov_b32_e32 v6, s4 1205; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] 1206; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 1207; GFX12-NEXT: s_mov_b32 s4, 0 1208; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start 1209; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1210; GFX12-NEXT: s_wait_loadcnt 0x0 1211; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 
1212; GFX12-NEXT: s_wait_storecnt 0x0 1213; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1214; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] 1215; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] 1216; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1217; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 1218; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 1219; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN 1220; GFX12-NEXT: s_wait_loadcnt 0x0 1221; GFX12-NEXT: global_inv scope:SCOPE_DEV 1222; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 1223; GFX12-NEXT: s_wait_alu 0xfffe 1224; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 1225; GFX12-NEXT: s_wait_alu 0xfffe 1226; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 1227; GFX12-NEXT: s_cbranch_execnz .LBB5_1 1228; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1229; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 1230; GFX12-NEXT: s_wait_alu 0xfffe 1231; GFX12-NEXT: s_setpc_b64 s[30:31] 1232; 1233; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: 1234; GFX940: ; %bb.0: 1235; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1236; GFX940-NEXT: v_mov_b32_e32 v2, s16 1237; GFX940-NEXT: buffer_wbl2 sc1 1238; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 1239; GFX940-NEXT: s_waitcnt vmcnt(0) 1240; GFX940-NEXT: buffer_inv sc1 1241; GFX940-NEXT: s_setpc_b64 s[30:31] 1242; 1243; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: 1244; GFX11: ; %bb.0: 1245; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1246; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 1247; GFX11-NEXT: v_mov_b32_e32 v0, s16 1248; GFX11-NEXT: s_add_i32 s4, s16, 0x800 1249; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) 1250; GFX11-NEXT: v_mov_b32_e32 v6, s4 1251; GFX11-NEXT: v_max_f64 
v[4:5], v[2:3], v[2:3] 1252; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 1253; GFX11-NEXT: s_mov_b32 s4, 0 1254; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start 1255; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1256; GFX11-NEXT: s_waitcnt vmcnt(0) 1257; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 1258; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1259; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1260; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] 1261; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] 1262; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1263; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 1264; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 1265; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc 1266; GFX11-NEXT: s_waitcnt vmcnt(0) 1267; GFX11-NEXT: buffer_gl1_inv 1268; GFX11-NEXT: buffer_gl0_inv 1269; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 1270; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 1271; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1272; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 1273; GFX11-NEXT: s_cbranch_execnz .LBB5_1 1274; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1275; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 1276; GFX11-NEXT: s_setpc_b64 s[30:31] 1277; 1278; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: 1279; GFX10: ; %bb.0: 1280; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1281; GFX10-NEXT: v_mov_b32_e32 v2, s20 1282; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1283; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc 1284; GFX10-NEXT: s_waitcnt vmcnt(0) 1285; GFX10-NEXT: buffer_gl1_inv 1286; GFX10-NEXT: buffer_gl0_inv 1287; GFX10-NEXT: s_setpc_b64 s[30:31] 1288; 1289; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: 1290; GFX90A: ; %bb.0: 1291; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
1292; GFX90A-NEXT: v_mov_b32_e32 v2, s20 1293; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 glc 1294; GFX90A-NEXT: s_waitcnt vmcnt(0) 1295; GFX90A-NEXT: buffer_wbinvl1 1296; GFX90A-NEXT: s_setpc_b64 s[30:31] 1297; 1298; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: 1299; GFX908: ; %bb.0: 1300; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1301; GFX908-NEXT: v_mov_b32_e32 v2, v0 1302; GFX908-NEXT: v_mov_b32_e32 v0, s20 1303; GFX908-NEXT: v_mov_b32_e32 v3, v1 1304; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 1305; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 1306; GFX908-NEXT: s_add_i32 s6, s20, 0x800 1307; GFX908-NEXT: s_mov_b64 s[4:5], 0 1308; GFX908-NEXT: v_mov_b32_e32 v6, s6 1309; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start 1310; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1311; GFX908-NEXT: s_waitcnt vmcnt(0) 1312; GFX908-NEXT: v_mov_b32_e32 v10, v1 1313; GFX908-NEXT: v_mov_b32_e32 v9, v0 1314; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] 1315; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] 1316; GFX908-NEXT: v_mov_b32_e32 v0, v7 1317; GFX908-NEXT: v_mov_b32_e32 v1, v8 1318; GFX908-NEXT: v_mov_b32_e32 v2, v9 1319; GFX908-NEXT: v_mov_b32_e32 v3, v10 1320; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 1321; GFX908-NEXT: s_waitcnt vmcnt(0) 1322; GFX908-NEXT: buffer_wbinvl1 1323; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 1324; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1325; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1326; GFX908-NEXT: s_cbranch_execnz .LBB5_1 1327; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1328; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1329; GFX908-NEXT: s_setpc_b64 s[30:31] 1330; 1331; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: 1332; GFX8: ; %bb.0: 1333; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1334; GFX8-NEXT: v_mov_b32_e32 v2, v0 
1335; GFX8-NEXT: v_mov_b32_e32 v0, s20 1336; GFX8-NEXT: v_mov_b32_e32 v3, v1 1337; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 1338; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 1339; GFX8-NEXT: s_add_i32 s6, s20, 0x800 1340; GFX8-NEXT: s_mov_b64 s[4:5], 0 1341; GFX8-NEXT: v_mov_b32_e32 v6, s6 1342; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start 1343; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1344; GFX8-NEXT: s_waitcnt vmcnt(0) 1345; GFX8-NEXT: v_mov_b32_e32 v10, v1 1346; GFX8-NEXT: v_mov_b32_e32 v9, v0 1347; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] 1348; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] 1349; GFX8-NEXT: v_mov_b32_e32 v0, v7 1350; GFX8-NEXT: v_mov_b32_e32 v1, v8 1351; GFX8-NEXT: v_mov_b32_e32 v2, v9 1352; GFX8-NEXT: v_mov_b32_e32 v3, v10 1353; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 1354; GFX8-NEXT: s_waitcnt vmcnt(0) 1355; GFX8-NEXT: buffer_wbinvl1 1356; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 1357; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1358; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1359; GFX8-NEXT: s_cbranch_execnz .LBB5_1 1360; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1361; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1362; GFX8-NEXT: s_setpc_b64 s[30:31] 1363; 1364; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: 1365; GFX7: ; %bb.0: 1366; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1367; GFX7-NEXT: v_mov_b32_e32 v2, s20 1368; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc 1369; GFX7-NEXT: s_waitcnt vmcnt(0) 1370; GFX7-NEXT: buffer_wbinvl1 1371; GFX7-NEXT: s_setpc_b64 s[30:31] 1372; 1373; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: 1374; GFX6: ; %bb.0: 1375; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1376; GFX6-NEXT: v_mov_b32_e32 v2, s20 1377; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc 1378; GFX6-NEXT: s_waitcnt 
vmcnt(0) 1379; GFX6-NEXT: buffer_wbinvl1 1380; GFX6-NEXT: s_waitcnt expcnt(0) 1381; GFX6-NEXT: s_setpc_b64 s[30:31] 1382 %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 1383 %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 1384 ret double %result 1385} 1386 1387define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { 1388; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: 1389; GFX12: ; %bb.0: 1390; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1391; GFX12-NEXT: s_wait_expcnt 0x0 1392; GFX12-NEXT: s_wait_samplecnt 0x0 1393; GFX12-NEXT: s_wait_bvhcnt 0x0 1394; GFX12-NEXT: s_wait_kmcnt 0x0 1395; GFX12-NEXT: v_mov_b32_e32 v2, s16 1396; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] 1397; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 1398; GFX12-NEXT: s_wait_alu 0xfffe 1399; GFX12-NEXT: v_mov_b32_e32 v6, s4 1400; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 1401; GFX12-NEXT: s_mov_b32 s4, 0 1402; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start 1403; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1404; GFX12-NEXT: s_wait_loadcnt 0x0 1405; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] 1406; GFX12-NEXT: s_wait_storecnt 0x0 1407; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1408; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] 1409; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 1410; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 1411; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN 1412; GFX12-NEXT: s_wait_loadcnt 0x0 1413; GFX12-NEXT: global_inv scope:SCOPE_DEV 1414; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] 1415; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 1416; GFX12-NEXT: s_wait_alu 0xfffe 1417; 
GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 1418; GFX12-NEXT: s_wait_alu 0xfffe 1419; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 1420; GFX12-NEXT: s_cbranch_execnz .LBB6_1 1421; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1422; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 1423; GFX12-NEXT: s_wait_alu 0xfffe 1424; GFX12-NEXT: s_setpc_b64 s[30:31] 1425; 1426; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: 1427; GFX940: ; %bb.0: 1428; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1429; GFX940-NEXT: v_mov_b32_e32 v2, s16 1430; GFX940-NEXT: buffer_wbl2 sc1 1431; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 1432; GFX940-NEXT: s_waitcnt vmcnt(0) 1433; GFX940-NEXT: buffer_inv sc1 1434; GFX940-NEXT: s_setpc_b64 s[30:31] 1435; 1436; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: 1437; GFX11: ; %bb.0: 1438; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1439; GFX11-NEXT: v_mov_b32_e32 v2, s16 1440; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 1441; GFX11-NEXT: s_add_i32 s4, s16, 0x800 1442; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1443; GFX11-NEXT: v_mov_b32_e32 v6, s4 1444; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 1445; GFX11-NEXT: s_mov_b32 s4, 0 1446; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start 1447; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1448; GFX11-NEXT: s_waitcnt vmcnt(0) 1449; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 1450; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1451; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1452; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 1453; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 1454; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 1455; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc 1456; GFX11-NEXT: s_waitcnt vmcnt(0) 1457; GFX11-NEXT: buffer_gl1_inv 1458; GFX11-NEXT: 
buffer_gl0_inv 1459; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] 1460; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 1461; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 1462; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1463; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 1464; GFX11-NEXT: s_cbranch_execnz .LBB6_1 1465; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1466; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 1467; GFX11-NEXT: s_setpc_b64 s[30:31] 1468; 1469; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: 1470; GFX10: ; %bb.0: 1471; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1472; GFX10-NEXT: v_mov_b32_e32 v2, s20 1473; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1474; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 1475; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1476; GFX10-NEXT: buffer_gl1_inv 1477; GFX10-NEXT: buffer_gl0_inv 1478; GFX10-NEXT: s_setpc_b64 s[30:31] 1479; 1480; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: 1481; GFX90A: ; %bb.0: 1482; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1483; GFX90A-NEXT: v_mov_b32_e32 v2, s20 1484; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 1485; GFX90A-NEXT: s_waitcnt vmcnt(0) 1486; GFX90A-NEXT: buffer_wbinvl1 1487; GFX90A-NEXT: s_setpc_b64 s[30:31] 1488; 1489; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: 1490; GFX908: ; %bb.0: 1491; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1492; GFX908-NEXT: v_mov_b32_e32 v2, s20 1493; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 1494; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 1495; GFX908-NEXT: s_add_i32 s6, s20, 0x800 1496; GFX908-NEXT: s_mov_b64 s[4:5], 0 1497; GFX908-NEXT: v_mov_b32_e32 v6, s6 1498; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start 1499; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1500; GFX908-NEXT: 
s_waitcnt vmcnt(0) 1501; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 1502; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 1503; GFX908-NEXT: v_mov_b32_e32 v10, v3 1504; GFX908-NEXT: v_mov_b32_e32 v9, v2 1505; GFX908-NEXT: v_mov_b32_e32 v8, v1 1506; GFX908-NEXT: v_mov_b32_e32 v7, v0 1507; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc 1508; GFX908-NEXT: s_waitcnt vmcnt(0) 1509; GFX908-NEXT: buffer_wbinvl1 1510; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] 1511; GFX908-NEXT: v_mov_b32_e32 v2, v7 1512; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1513; GFX908-NEXT: v_mov_b32_e32 v3, v8 1514; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1515; GFX908-NEXT: s_cbranch_execnz .LBB6_1 1516; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1517; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1518; GFX908-NEXT: s_setpc_b64 s[30:31] 1519; 1520; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: 1521; GFX8: ; %bb.0: 1522; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1523; GFX8-NEXT: v_mov_b32_e32 v2, s20 1524; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 1525; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 1526; GFX8-NEXT: s_add_i32 s6, s20, 0x800 1527; GFX8-NEXT: s_mov_b64 s[4:5], 0 1528; GFX8-NEXT: v_mov_b32_e32 v6, s6 1529; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start 1530; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1531; GFX8-NEXT: s_waitcnt vmcnt(0) 1532; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 1533; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 1534; GFX8-NEXT: v_mov_b32_e32 v10, v3 1535; GFX8-NEXT: v_mov_b32_e32 v9, v2 1536; GFX8-NEXT: v_mov_b32_e32 v8, v1 1537; GFX8-NEXT: v_mov_b32_e32 v7, v0 1538; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc 1539; GFX8-NEXT: s_waitcnt vmcnt(0) 1540; GFX8-NEXT: buffer_wbinvl1 1541; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] 1542; GFX8-NEXT: v_mov_b32_e32 v2, v7 1543; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1544; 
GFX8-NEXT: v_mov_b32_e32 v3, v8 1545; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1546; GFX8-NEXT: s_cbranch_execnz .LBB6_1 1547; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1548; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1549; GFX8-NEXT: s_setpc_b64 s[30:31] 1550; 1551; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: 1552; GFX7: ; %bb.0: 1553; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1554; GFX7-NEXT: v_mov_b32_e32 v2, s20 1555; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 1556; GFX7-NEXT: s_waitcnt vmcnt(0) 1557; GFX7-NEXT: buffer_wbinvl1 1558; GFX7-NEXT: s_setpc_b64 s[30:31] 1559; 1560; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_grained_memory: 1561; GFX6: ; %bb.0: 1562; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1563; GFX6-NEXT: v_mov_b32_e32 v2, s20 1564; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 1565; GFX6-NEXT: s_waitcnt vmcnt(0) 1566; GFX6-NEXT: buffer_wbinvl1 1567; GFX6-NEXT: s_waitcnt expcnt(0) 1568; GFX6-NEXT: s_setpc_b64 s[30:31] 1569 %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 1570 %unused = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 1571 ret void 1572} 1573 1574define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 { 1575; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 1576; GFX12: ; %bb.0: 1577; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1578; GFX12-NEXT: s_wait_expcnt 0x0 1579; GFX12-NEXT: s_wait_samplecnt 0x0 1580; GFX12-NEXT: s_wait_bvhcnt 0x0 1581; GFX12-NEXT: s_wait_kmcnt 0x0 1582; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 1583; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 1584; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 1585; GFX12-NEXT: 
s_mov_b32 s1, exec_lo 1586; GFX12-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 1587; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 1588; GFX12-NEXT: v_readfirstlane_b32 s4, v9 1589; GFX12-NEXT: v_readfirstlane_b32 s5, v10 1590; GFX12-NEXT: v_readfirstlane_b32 s6, v7 1591; GFX12-NEXT: v_readfirstlane_b32 s7, v8 1592; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 1593; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] 1594; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] 1595; GFX12-NEXT: s_wait_alu 0xfffe 1596; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1597; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 1598; GFX12-NEXT: s_wait_alu 0xfffe 1599; GFX12-NEXT: s_and_saveexec_b32 s0, s0 1600; GFX12-NEXT: s_wait_loadcnt 0x0 1601; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 1602; GFX12-NEXT: ; implicit-def: $vgpr4 1603; GFX12-NEXT: s_wait_alu 0xfffe 1604; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 1605; GFX12-NEXT: s_cbranch_execnz .LBB7_1 1606; GFX12-NEXT: ; %bb.2: 1607; GFX12-NEXT: s_mov_b32 exec_lo, s1 1608; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] 1609; GFX12-NEXT: s_mov_b32 s1, 0 1610; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start 1611; GFX12-NEXT: ; =>This Loop Header: Depth=1 1612; GFX12-NEXT: ; Child Loop BB7_4 Depth 2 1613; GFX12-NEXT: s_wait_loadcnt 0x0 1614; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14] 1615; GFX12-NEXT: s_mov_b32 s2, exec_lo 1616; GFX12-NEXT: s_wait_storecnt 0x0 1617; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1618; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5] 1619; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 1620; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 1621; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 1622; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 1623; GFX12-NEXT: v_readfirstlane_b32 s4, v9 1624; GFX12-NEXT: 
v_readfirstlane_b32 s5, v10 1625; GFX12-NEXT: v_readfirstlane_b32 s6, v7 1626; GFX12-NEXT: v_readfirstlane_b32 s7, v8 1627; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 1628; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] 1629; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] 1630; GFX12-NEXT: s_wait_alu 0xfffe 1631; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1632; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 1633; GFX12-NEXT: s_wait_alu 0xfffe 1634; GFX12-NEXT: s_and_saveexec_b32 s0, s0 1635; GFX12-NEXT: s_wait_loadcnt 0x0 1636; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN 1637; GFX12-NEXT: s_wait_alu 0xfffe 1638; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 1639; GFX12-NEXT: s_cbranch_execnz .LBB7_4 1640; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 1641; GFX12-NEXT: s_mov_b32 exec_lo, s2 1642; GFX12-NEXT: s_wait_loadcnt 0x0 1643; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14] 1644; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 1645; GFX12-NEXT: global_inv scope:SCOPE_DEV 1646; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 1647; GFX12-NEXT: s_wait_alu 0xfffe 1648; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 1649; GFX12-NEXT: s_cbranch_execnz .LBB7_3 1650; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end 1651; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 1652; GFX12-NEXT: s_wait_alu 0xfffe 1653; GFX12-NEXT: s_setpc_b64 s[30:31] 1654; 1655; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 1656; GFX940: ; %bb.0: 1657; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1658; GFX940-NEXT: v_mov_b32_e32 v7, v6 1659; GFX940-NEXT: v_mov_b32_e32 v6, v5 1660; GFX940-NEXT: s_mov_b64 s[2:3], exec 1661; GFX940-NEXT: buffer_wbl2 sc1 1662; GFX940-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 1663; GFX940-NEXT: v_readfirstlane_b32 s4, v0 1664; GFX940-NEXT: v_readfirstlane_b32 s5, v1 1665; GFX940-NEXT: v_readfirstlane_b32 s6, v2 
1666; GFX940-NEXT: v_readfirstlane_b32 s7, v3 1667; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 1668; GFX940-NEXT: s_nop 0 1669; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 1670; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 1671; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 1672; GFX940-NEXT: s_waitcnt vmcnt(0) 1673; GFX940-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0 1674; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 1675; GFX940-NEXT: ; implicit-def: $vgpr4 1676; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 1677; GFX940-NEXT: s_cbranch_execnz .LBB7_1 1678; GFX940-NEXT: ; %bb.2: 1679; GFX940-NEXT: s_mov_b64 exec, s[2:3] 1680; GFX940-NEXT: s_waitcnt vmcnt(0) 1681; GFX940-NEXT: v_mov_b32_e32 v0, v6 1682; GFX940-NEXT: v_mov_b32_e32 v1, v7 1683; GFX940-NEXT: buffer_inv sc1 1684; GFX940-NEXT: s_setpc_b64 s[30:31] 1685; 1686; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 1687; GFX11: ; %bb.0: 1688; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1689; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 1690; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 1691; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 1692; GFX11-NEXT: s_mov_b32 s1, 0 1693; GFX11-NEXT: s_mov_b32 s2, exec_lo 1694; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 1695; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) 1696; GFX11-NEXT: v_readfirstlane_b32 s4, v9 1697; GFX11-NEXT: v_readfirstlane_b32 s5, v10 1698; GFX11-NEXT: v_readfirstlane_b32 s6, v7 1699; GFX11-NEXT: v_readfirstlane_b32 s7, v8 1700; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] 1701; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1702; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] 1703; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 1704; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1705; GFX11-NEXT: s_and_saveexec_b32 s0, s0 
1706; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 1707; GFX11-NEXT: ; implicit-def: $vgpr4 1708; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 1709; GFX11-NEXT: s_cbranch_execnz .LBB7_1 1710; GFX11-NEXT: ; %bb.2: 1711; GFX11-NEXT: s_mov_b32 exec_lo, s2 1712; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] 1713; GFX11-NEXT: .p2align 6 1714; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start 1715; GFX11-NEXT: ; =>This Loop Header: Depth=1 1716; GFX11-NEXT: ; Child Loop BB7_4 Depth 2 1717; GFX11-NEXT: s_waitcnt vmcnt(0) 1718; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] 1719; GFX11-NEXT: s_mov_b32 s2, exec_lo 1720; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1721; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1722; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] 1723; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 1724; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 1725; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 1726; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 1727; GFX11-NEXT: v_readfirstlane_b32 s4, v9 1728; GFX11-NEXT: v_readfirstlane_b32 s5, v10 1729; GFX11-NEXT: v_readfirstlane_b32 s6, v7 1730; GFX11-NEXT: v_readfirstlane_b32 s7, v8 1731; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 1732; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] 1733; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] 1734; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1735; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 1736; GFX11-NEXT: s_and_saveexec_b32 s0, s0 1737; GFX11-NEXT: s_waitcnt vmcnt(0) 1738; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc 1739; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 1740; GFX11-NEXT: s_cbranch_execnz .LBB7_4 1741; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 1742; GFX11-NEXT: s_mov_b32 exec_lo, s2 1743; GFX11-NEXT: s_waitcnt vmcnt(0) 1744; GFX11-NEXT: v_cmp_eq_u64_e32 
vcc_lo, v[0:1], v[13:14] 1745; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0 1746; GFX11-NEXT: buffer_gl1_inv 1747; GFX11-NEXT: buffer_gl0_inv 1748; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 1749; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1750; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 1751; GFX11-NEXT: s_cbranch_execnz .LBB7_3 1752; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end 1753; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 1754; GFX11-NEXT: s_setpc_b64 s[30:31] 1755; 1756; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 1757; GFX10: ; %bb.0: 1758; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1759; GFX10-NEXT: s_mov_b32 s5, exec_lo 1760; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1761; GFX10-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 1762; GFX10-NEXT: v_readfirstlane_b32 s8, v0 1763; GFX10-NEXT: v_readfirstlane_b32 s9, v1 1764; GFX10-NEXT: v_readfirstlane_b32 s10, v2 1765; GFX10-NEXT: v_readfirstlane_b32 s11, v3 1766; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 1767; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 1768; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 1769; GFX10-NEXT: s_and_saveexec_b32 s4, s4 1770; GFX10-NEXT: s_waitcnt vmcnt(0) 1771; GFX10-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc 1772; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 1773; GFX10-NEXT: ; implicit-def: $vgpr4 1774; GFX10-NEXT: s_waitcnt_depctr 0xffe3 1775; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 1776; GFX10-NEXT: s_cbranch_execnz .LBB7_1 1777; GFX10-NEXT: ; %bb.2: 1778; GFX10-NEXT: s_mov_b32 exec_lo, s5 1779; GFX10-NEXT: s_waitcnt vmcnt(0) 1780; GFX10-NEXT: v_mov_b32_e32 v0, v5 1781; GFX10-NEXT: v_mov_b32_e32 v1, v6 1782; GFX10-NEXT: buffer_gl1_inv 1783; GFX10-NEXT: buffer_gl0_inv 1784; GFX10-NEXT: s_setpc_b64 s[30:31] 1785; 1786; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 1787; GFX90A: ; %bb.0: 1788; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1789; GFX90A-NEXT: v_mov_b32_e32 v7, v6 1790; GFX90A-NEXT: v_mov_b32_e32 v6, v5 1791; GFX90A-NEXT: s_mov_b64 s[6:7], exec 1792; GFX90A-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 1793; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 1794; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 1795; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 1796; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 1797; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 1798; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 1799; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 1800; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 1801; GFX90A-NEXT: s_waitcnt vmcnt(0) 1802; GFX90A-NEXT: buffer_atomic_min_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc 1803; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 1804; GFX90A-NEXT: ; implicit-def: $vgpr4 1805; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 1806; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 1807; GFX90A-NEXT: ; %bb.2: 1808; GFX90A-NEXT: s_mov_b64 exec, s[6:7] 1809; GFX90A-NEXT: s_waitcnt vmcnt(0) 1810; GFX90A-NEXT: v_mov_b32_e32 v0, v6 1811; GFX90A-NEXT: v_mov_b32_e32 v1, v7 1812; GFX90A-NEXT: buffer_wbinvl1 1813; GFX90A-NEXT: s_setpc_b64 s[30:31] 1814; 1815; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 1816; GFX908: ; %bb.0: 1817; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1818; GFX908-NEXT: v_mov_b32_e32 v8, v3 1819; GFX908-NEXT: v_mov_b32_e32 v7, v2 1820; GFX908-NEXT: v_mov_b32_e32 v10, v1 1821; GFX908-NEXT: v_mov_b32_e32 v9, v0 1822; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 1823; GFX908-NEXT: s_mov_b64 s[6:7], exec 1824; GFX908-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 1825; GFX908-NEXT: v_readfirstlane_b32 s8, v9 1826; GFX908-NEXT: v_readfirstlane_b32 s9, v10 1827; GFX908-NEXT: v_readfirstlane_b32 s10, v7 1828; GFX908-NEXT: v_readfirstlane_b32 s11, v8 1829; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] 1830; 
GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] 1831; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 1832; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 1833; GFX908-NEXT: s_nop 0 1834; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 1835; GFX908-NEXT: ; implicit-def: $vgpr4 1836; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 1837; GFX908-NEXT: s_cbranch_execnz .LBB7_1 1838; GFX908-NEXT: ; %bb.2: 1839; GFX908-NEXT: s_mov_b64 exec, s[6:7] 1840; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] 1841; GFX908-NEXT: s_mov_b64 s[6:7], 0 1842; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start 1843; GFX908-NEXT: ; =>This Loop Header: Depth=1 1844; GFX908-NEXT: ; Child Loop BB7_4 Depth 2 1845; GFX908-NEXT: s_waitcnt vmcnt(0) 1846; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] 1847; GFX908-NEXT: s_mov_b64 s[12:13], exec 1848; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] 1849; GFX908-NEXT: v_mov_b32_e32 v0, v11 1850; GFX908-NEXT: v_mov_b32_e32 v1, v12 1851; GFX908-NEXT: v_mov_b32_e32 v2, v13 1852; GFX908-NEXT: v_mov_b32_e32 v3, v14 1853; GFX908-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 1854; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 1855; GFX908-NEXT: v_readfirstlane_b32 s8, v9 1856; GFX908-NEXT: v_readfirstlane_b32 s9, v10 1857; GFX908-NEXT: v_readfirstlane_b32 s10, v7 1858; GFX908-NEXT: v_readfirstlane_b32 s11, v8 1859; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] 1860; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] 1861; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 1862; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 1863; GFX908-NEXT: s_waitcnt vmcnt(0) 1864; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc 1865; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 1866; GFX908-NEXT: s_cbranch_execnz .LBB7_4 1867; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 1868; GFX908-NEXT: s_mov_b64 exec, s[12:13] 1869; GFX908-NEXT: s_waitcnt vmcnt(0) 1870; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] 1871; 
GFX908-NEXT: v_mov_b32_e32 v14, v1 1872; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 1873; GFX908-NEXT: v_mov_b32_e32 v13, v0 1874; GFX908-NEXT: buffer_wbinvl1 1875; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 1876; GFX908-NEXT: s_cbranch_execnz .LBB7_3 1877; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end 1878; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 1879; GFX908-NEXT: s_setpc_b64 s[30:31] 1880; 1881; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 1882; GFX8: ; %bb.0: 1883; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1884; GFX8-NEXT: v_mov_b32_e32 v8, v3 1885; GFX8-NEXT: v_mov_b32_e32 v7, v2 1886; GFX8-NEXT: v_mov_b32_e32 v10, v1 1887; GFX8-NEXT: v_mov_b32_e32 v9, v0 1888; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 1889; GFX8-NEXT: s_mov_b64 s[6:7], exec 1890; GFX8-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 1891; GFX8-NEXT: v_readfirstlane_b32 s8, v9 1892; GFX8-NEXT: v_readfirstlane_b32 s9, v10 1893; GFX8-NEXT: v_readfirstlane_b32 s10, v7 1894; GFX8-NEXT: v_readfirstlane_b32 s11, v8 1895; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] 1896; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] 1897; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 1898; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 1899; GFX8-NEXT: s_nop 0 1900; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 1901; GFX8-NEXT: ; implicit-def: $vgpr4 1902; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 1903; GFX8-NEXT: s_cbranch_execnz .LBB7_1 1904; GFX8-NEXT: ; %bb.2: 1905; GFX8-NEXT: s_mov_b64 exec, s[6:7] 1906; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] 1907; GFX8-NEXT: s_mov_b64 s[6:7], 0 1908; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start 1909; GFX8-NEXT: ; =>This Loop Header: Depth=1 1910; GFX8-NEXT: ; Child Loop BB7_4 Depth 2 1911; GFX8-NEXT: s_waitcnt vmcnt(0) 1912; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] 1913; GFX8-NEXT: s_mov_b64 s[12:13], exec 1914; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5] 1915; GFX8-NEXT: 
v_mov_b32_e32 v0, v11 1916; GFX8-NEXT: v_mov_b32_e32 v1, v12 1917; GFX8-NEXT: v_mov_b32_e32 v2, v13 1918; GFX8-NEXT: v_mov_b32_e32 v3, v14 1919; GFX8-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 1920; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 1921; GFX8-NEXT: v_readfirstlane_b32 s8, v9 1922; GFX8-NEXT: v_readfirstlane_b32 s9, v10 1923; GFX8-NEXT: v_readfirstlane_b32 s10, v7 1924; GFX8-NEXT: v_readfirstlane_b32 s11, v8 1925; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10] 1926; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8] 1927; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 1928; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 1929; GFX8-NEXT: s_waitcnt vmcnt(0) 1930; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc 1931; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 1932; GFX8-NEXT: s_cbranch_execnz .LBB7_4 1933; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 1934; GFX8-NEXT: s_mov_b64 exec, s[12:13] 1935; GFX8-NEXT: s_waitcnt vmcnt(0) 1936; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14] 1937; GFX8-NEXT: v_mov_b32_e32 v14, v1 1938; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 1939; GFX8-NEXT: v_mov_b32_e32 v13, v0 1940; GFX8-NEXT: buffer_wbinvl1 1941; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 1942; GFX8-NEXT: s_cbranch_execnz .LBB7_3 1943; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end 1944; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 1945; GFX8-NEXT: s_setpc_b64 s[30:31] 1946; 1947; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 1948; GFX7: ; %bb.0: 1949; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1950; GFX7-NEXT: s_mov_b64 s[6:7], exec 1951; GFX7-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 1952; GFX7-NEXT: v_readfirstlane_b32 s8, v0 1953; GFX7-NEXT: v_readfirstlane_b32 s9, v1 1954; GFX7-NEXT: v_readfirstlane_b32 s10, v2 1955; GFX7-NEXT: v_readfirstlane_b32 s11, v3 1956; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 1957; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 1958; 
GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 1959; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 1960; GFX7-NEXT: s_waitcnt vmcnt(0) 1961; GFX7-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc 1962; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 1963; GFX7-NEXT: ; implicit-def: $vgpr4 1964; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 1965; GFX7-NEXT: s_cbranch_execnz .LBB7_1 1966; GFX7-NEXT: ; %bb.2: 1967; GFX7-NEXT: s_mov_b64 exec, s[6:7] 1968; GFX7-NEXT: s_waitcnt vmcnt(0) 1969; GFX7-NEXT: v_mov_b32_e32 v0, v5 1970; GFX7-NEXT: v_mov_b32_e32 v1, v6 1971; GFX7-NEXT: buffer_wbinvl1 1972; GFX7-NEXT: s_setpc_b64 s[30:31] 1973; 1974; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory: 1975; GFX6: ; %bb.0: 1976; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1977; GFX6-NEXT: s_mov_b64 s[6:7], exec 1978; GFX6-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 1979; GFX6-NEXT: v_readfirstlane_b32 s8, v0 1980; GFX6-NEXT: v_readfirstlane_b32 s9, v1 1981; GFX6-NEXT: v_readfirstlane_b32 s10, v2 1982; GFX6-NEXT: v_readfirstlane_b32 s11, v3 1983; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 1984; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 1985; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 1986; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 1987; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 1988; GFX6-NEXT: buffer_atomic_fmin_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc 1989; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 1990; GFX6-NEXT: ; implicit-def: $vgpr4 1991; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 1992; GFX6-NEXT: s_cbranch_execnz .LBB7_1 1993; GFX6-NEXT: ; %bb.2: 1994; GFX6-NEXT: s_mov_b64 exec, s[6:7] 1995; GFX6-NEXT: s_waitcnt vmcnt(0) 1996; GFX6-NEXT: v_mov_b32_e32 v0, v5 1997; GFX6-NEXT: v_mov_b32_e32 v1, v6 1998; GFX6-NEXT: buffer_wbinvl1 1999; GFX6-NEXT: s_waitcnt expcnt(0) 2000; GFX6-NEXT: s_setpc_b64 s[30:31] 2001 %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 
2002 %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 2003 ret double %result 2004} 2005 2006define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 { 2007; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: 2008; GFX12: ; %bb.0: 2009; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2010; GFX12-NEXT: s_wait_expcnt 0x0 2011; GFX12-NEXT: s_wait_samplecnt 0x0 2012; GFX12-NEXT: s_wait_bvhcnt 0x0 2013; GFX12-NEXT: s_wait_kmcnt 0x0 2014; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 2015; GFX12-NEXT: v_mov_b32_e32 v0, s16 2016; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 2017; GFX12-NEXT: s_wait_alu 0xfffe 2018; GFX12-NEXT: v_mov_b32_e32 v6, s4 2019; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] 2020; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 2021; GFX12-NEXT: s_mov_b32 s4, 0 2022; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start 2023; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2024; GFX12-NEXT: s_wait_loadcnt 0x0 2025; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 2026; GFX12-NEXT: s_wait_storecnt 0x0 2027; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2028; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] 2029; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] 2030; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2031; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 2032; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 2033; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN 2034; GFX12-NEXT: s_wait_loadcnt 0x0 2035; GFX12-NEXT: global_inv scope:SCOPE_DEV 2036; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 2037; GFX12-NEXT: s_wait_alu 0xfffe 2038; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 2039; GFX12-NEXT: s_wait_alu 0xfffe 2040; 
GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 2041; GFX12-NEXT: s_cbranch_execnz .LBB8_1 2042; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2043; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 2044; GFX12-NEXT: s_wait_alu 0xfffe 2045; GFX12-NEXT: s_setpc_b64 s[30:31] 2046; 2047; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: 2048; GFX940: ; %bb.0: 2049; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2050; GFX940-NEXT: v_mov_b32_e32 v2, s16 2051; GFX940-NEXT: buffer_wbl2 sc1 2052; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 2053; GFX940-NEXT: s_waitcnt vmcnt(0) 2054; GFX940-NEXT: buffer_inv sc1 2055; GFX940-NEXT: s_setpc_b64 s[30:31] 2056; 2057; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: 2058; GFX11: ; %bb.0: 2059; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2060; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 2061; GFX11-NEXT: v_mov_b32_e32 v0, s16 2062; GFX11-NEXT: s_add_i32 s4, s16, 0x800 2063; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) 2064; GFX11-NEXT: v_mov_b32_e32 v6, s4 2065; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 2066; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 2067; GFX11-NEXT: s_mov_b32 s4, 0 2068; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start 2069; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2070; GFX11-NEXT: s_waitcnt vmcnt(0) 2071; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 2072; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2073; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2074; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] 2075; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] 2076; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2077; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 2078; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 2079; GFX11-NEXT: buffer_atomic_cmpswap_b64 
v[0:3], v6, s[0:3], 0 offen glc 2080; GFX11-NEXT: s_waitcnt vmcnt(0) 2081; GFX11-NEXT: buffer_gl1_inv 2082; GFX11-NEXT: buffer_gl0_inv 2083; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 2084; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 2085; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2086; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 2087; GFX11-NEXT: s_cbranch_execnz .LBB8_1 2088; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2089; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 2090; GFX11-NEXT: s_setpc_b64 s[30:31] 2091; 2092; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: 2093; GFX10: ; %bb.0: 2094; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2095; GFX10-NEXT: v_mov_b32_e32 v2, v0 2096; GFX10-NEXT: v_mov_b32_e32 v0, s20 2097; GFX10-NEXT: v_mov_b32_e32 v3, v1 2098; GFX10-NEXT: s_add_i32 s4, s20, 0x800 2099; GFX10-NEXT: v_mov_b32_e32 v6, s4 2100; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 2101; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 2102; GFX10-NEXT: s_mov_b32 s4, 0 2103; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start 2104; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2105; GFX10-NEXT: s_waitcnt vmcnt(0) 2106; GFX10-NEXT: v_mov_b32_e32 v10, v1 2107; GFX10-NEXT: v_mov_b32_e32 v9, v0 2108; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2109; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] 2110; GFX10-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] 2111; GFX10-NEXT: v_mov_b32_e32 v0, v7 2112; GFX10-NEXT: v_mov_b32_e32 v1, v8 2113; GFX10-NEXT: v_mov_b32_e32 v2, v9 2114; GFX10-NEXT: v_mov_b32_e32 v3, v10 2115; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 2116; GFX10-NEXT: s_waitcnt vmcnt(0) 2117; GFX10-NEXT: buffer_gl1_inv 2118; GFX10-NEXT: buffer_gl0_inv 2119; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 2120; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 2121; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 2122; GFX10-NEXT: s_cbranch_execnz .LBB8_1 2123; GFX10-NEXT: ; %bb.2: ; 
%atomicrmw.end 2124; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 2125; GFX10-NEXT: s_setpc_b64 s[30:31] 2126; 2127; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: 2128; GFX90A: ; %bb.0: 2129; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2130; GFX90A-NEXT: v_mov_b32_e32 v2, v0 2131; GFX90A-NEXT: v_mov_b32_e32 v0, s20 2132; GFX90A-NEXT: v_mov_b32_e32 v3, v1 2133; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 2134; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 2135; GFX90A-NEXT: s_mov_b64 s[4:5], 0 2136; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 2137; GFX90A-NEXT: v_mov_b32_e32 v6, s6 2138; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start 2139; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 2140; GFX90A-NEXT: s_waitcnt vmcnt(0) 2141; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] 2142; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] 2143; GFX90A-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] 2144; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] 2145; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] 2146; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 2147; GFX90A-NEXT: s_waitcnt vmcnt(0) 2148; GFX90A-NEXT: buffer_wbinvl1 2149; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] 2150; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2151; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 2152; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 2153; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 2154; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2155; GFX90A-NEXT: s_setpc_b64 s[30:31] 2156; 2157; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: 2158; GFX908: ; %bb.0: 2159; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2160; GFX908-NEXT: v_mov_b32_e32 v2, v0 2161; GFX908-NEXT: v_mov_b32_e32 v0, s20 2162; GFX908-NEXT: v_mov_b32_e32 v3, v1 2163; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 2164; 
GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 2165; GFX908-NEXT: s_add_i32 s6, s20, 0x800 2166; GFX908-NEXT: s_mov_b64 s[4:5], 0 2167; GFX908-NEXT: v_mov_b32_e32 v6, s6 2168; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start 2169; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2170; GFX908-NEXT: s_waitcnt vmcnt(0) 2171; GFX908-NEXT: v_mov_b32_e32 v10, v1 2172; GFX908-NEXT: v_mov_b32_e32 v9, v0 2173; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] 2174; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] 2175; GFX908-NEXT: v_mov_b32_e32 v0, v7 2176; GFX908-NEXT: v_mov_b32_e32 v1, v8 2177; GFX908-NEXT: v_mov_b32_e32 v2, v9 2178; GFX908-NEXT: v_mov_b32_e32 v3, v10 2179; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 2180; GFX908-NEXT: s_waitcnt vmcnt(0) 2181; GFX908-NEXT: buffer_wbinvl1 2182; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 2183; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2184; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2185; GFX908-NEXT: s_cbranch_execnz .LBB8_1 2186; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2187; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2188; GFX908-NEXT: s_setpc_b64 s[30:31] 2189; 2190; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: 2191; GFX8: ; %bb.0: 2192; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2193; GFX8-NEXT: v_mov_b32_e32 v2, v0 2194; GFX8-NEXT: v_mov_b32_e32 v0, s20 2195; GFX8-NEXT: v_mov_b32_e32 v3, v1 2196; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 2197; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 2198; GFX8-NEXT: s_add_i32 s6, s20, 0x800 2199; GFX8-NEXT: s_mov_b64 s[4:5], 0 2200; GFX8-NEXT: v_mov_b32_e32 v6, s6 2201; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start 2202; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2203; GFX8-NEXT: s_waitcnt vmcnt(0) 2204; GFX8-NEXT: v_mov_b32_e32 v10, v1 2205; GFX8-NEXT: v_mov_b32_e32 v9, v0 2206; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] 2207; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] 2208; 
GFX8-NEXT: v_mov_b32_e32 v0, v7 2209; GFX8-NEXT: v_mov_b32_e32 v1, v8 2210; GFX8-NEXT: v_mov_b32_e32 v2, v9 2211; GFX8-NEXT: v_mov_b32_e32 v3, v10 2212; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 2213; GFX8-NEXT: s_waitcnt vmcnt(0) 2214; GFX8-NEXT: buffer_wbinvl1 2215; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 2216; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2217; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2218; GFX8-NEXT: s_cbranch_execnz .LBB8_1 2219; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2220; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2221; GFX8-NEXT: s_setpc_b64 s[30:31] 2222; 2223; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: 2224; GFX7: ; %bb.0: 2225; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2226; GFX7-NEXT: v_mov_b32_e32 v2, v0 2227; GFX7-NEXT: v_mov_b32_e32 v0, s20 2228; GFX7-NEXT: v_mov_b32_e32 v3, v1 2229; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 2230; GFX7-NEXT: s_add_i32 s6, s20, 0x800 2231; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 2232; GFX7-NEXT: s_mov_b64 s[4:5], 0 2233; GFX7-NEXT: v_mov_b32_e32 v6, s6 2234; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start 2235; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 2236; GFX7-NEXT: s_waitcnt vmcnt(0) 2237; GFX7-NEXT: v_mov_b32_e32 v10, v1 2238; GFX7-NEXT: v_mov_b32_e32 v9, v0 2239; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] 2240; GFX7-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] 2241; GFX7-NEXT: v_mov_b32_e32 v0, v7 2242; GFX7-NEXT: v_mov_b32_e32 v1, v8 2243; GFX7-NEXT: v_mov_b32_e32 v2, v9 2244; GFX7-NEXT: v_mov_b32_e32 v3, v10 2245; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 2246; GFX7-NEXT: s_waitcnt vmcnt(0) 2247; GFX7-NEXT: buffer_wbinvl1 2248; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 2249; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2250; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 2251; GFX7-NEXT: s_cbranch_execnz .LBB8_1 2252; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 2253; 
GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 2254; GFX7-NEXT: s_setpc_b64 s[30:31] 2255; 2256; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: 2257; GFX6: ; %bb.0: 2258; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2259; GFX6-NEXT: v_mov_b32_e32 v2, v0 2260; GFX6-NEXT: v_mov_b32_e32 v0, s20 2261; GFX6-NEXT: v_mov_b32_e32 v3, v1 2262; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 2263; GFX6-NEXT: s_add_i32 s6, s20, 0x800 2264; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 2265; GFX6-NEXT: s_mov_b64 s[4:5], 0 2266; GFX6-NEXT: v_mov_b32_e32 v6, s6 2267; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start 2268; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 2269; GFX6-NEXT: s_waitcnt vmcnt(0) 2270; GFX6-NEXT: v_mov_b32_e32 v10, v1 2271; GFX6-NEXT: v_mov_b32_e32 v9, v0 2272; GFX6-NEXT: s_waitcnt expcnt(0) 2273; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] 2274; GFX6-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] 2275; GFX6-NEXT: v_mov_b32_e32 v0, v7 2276; GFX6-NEXT: v_mov_b32_e32 v1, v8 2277; GFX6-NEXT: v_mov_b32_e32 v2, v9 2278; GFX6-NEXT: v_mov_b32_e32 v3, v10 2279; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 2280; GFX6-NEXT: s_waitcnt vmcnt(0) 2281; GFX6-NEXT: buffer_wbinvl1 2282; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 2283; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2284; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 2285; GFX6-NEXT: s_cbranch_execnz .LBB8_1 2286; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 2287; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 2288; GFX6-NEXT: s_waitcnt expcnt(0) 2289; GFX6-NEXT: s_setpc_b64 s[30:31] 2290 %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 2291 %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 2292 ret double %result 2293} 2294 2295define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double 
%val) #0 { 2296; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 2297; GFX12: ; %bb.0: 2298; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2299; GFX12-NEXT: s_wait_expcnt 0x0 2300; GFX12-NEXT: s_wait_samplecnt 0x0 2301; GFX12-NEXT: s_wait_bvhcnt 0x0 2302; GFX12-NEXT: s_wait_kmcnt 0x0 2303; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 2304; GFX12-NEXT: v_mov_b32_e32 v0, s16 2305; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 2306; GFX12-NEXT: s_wait_alu 0xfffe 2307; GFX12-NEXT: v_mov_b32_e32 v6, s4 2308; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] 2309; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 2310; GFX12-NEXT: s_mov_b32 s4, 0 2311; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start 2312; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2313; GFX12-NEXT: s_wait_loadcnt 0x0 2314; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 2315; GFX12-NEXT: s_wait_storecnt 0x0 2316; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2317; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] 2318; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] 2319; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2320; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 2321; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 2322; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN 2323; GFX12-NEXT: s_wait_loadcnt 0x0 2324; GFX12-NEXT: global_inv scope:SCOPE_DEV 2325; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 2326; GFX12-NEXT: s_wait_alu 0xfffe 2327; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 2328; GFX12-NEXT: s_wait_alu 0xfffe 2329; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 2330; GFX12-NEXT: s_cbranch_execnz .LBB9_1 2331; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2332; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 2333; GFX12-NEXT: s_wait_alu 0xfffe 2334; GFX12-NEXT: s_setpc_b64 s[30:31] 2335; 2336; 
GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 2337; GFX940: ; %bb.0: 2338; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2339; GFX940-NEXT: v_mov_b32_e32 v2, s16 2340; GFX940-NEXT: buffer_wbl2 sc1 2341; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0 2342; GFX940-NEXT: s_waitcnt vmcnt(0) 2343; GFX940-NEXT: buffer_inv sc1 2344; GFX940-NEXT: s_setpc_b64 s[30:31] 2345; 2346; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 2347; GFX11: ; %bb.0: 2348; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2349; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 2350; GFX11-NEXT: v_mov_b32_e32 v0, s16 2351; GFX11-NEXT: s_add_i32 s4, s16, 0x800 2352; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) 2353; GFX11-NEXT: v_mov_b32_e32 v6, s4 2354; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 2355; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 2356; GFX11-NEXT: s_mov_b32 s4, 0 2357; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start 2358; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2359; GFX11-NEXT: s_waitcnt vmcnt(0) 2360; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 2361; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2362; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2363; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] 2364; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] 2365; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2366; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 2367; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 2368; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc 2369; GFX11-NEXT: s_waitcnt vmcnt(0) 2370; GFX11-NEXT: buffer_gl1_inv 2371; GFX11-NEXT: buffer_gl0_inv 2372; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 2373; GFX11-NEXT: 
s_or_b32 s4, vcc_lo, s4 2374; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2375; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 2376; GFX11-NEXT: s_cbranch_execnz .LBB9_1 2377; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2378; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 2379; GFX11-NEXT: s_setpc_b64 s[30:31] 2380; 2381; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 2382; GFX10: ; %bb.0: 2383; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2384; GFX10-NEXT: v_mov_b32_e32 v2, s20 2385; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2386; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc 2387; GFX10-NEXT: s_waitcnt vmcnt(0) 2388; GFX10-NEXT: buffer_gl1_inv 2389; GFX10-NEXT: buffer_gl0_inv 2390; GFX10-NEXT: s_setpc_b64 s[30:31] 2391; 2392; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 2393; GFX90A: ; %bb.0: 2394; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2395; GFX90A-NEXT: v_mov_b32_e32 v2, s20 2396; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[16:19], 0 offen offset:2048 glc 2397; GFX90A-NEXT: s_waitcnt vmcnt(0) 2398; GFX90A-NEXT: buffer_wbinvl1 2399; GFX90A-NEXT: s_setpc_b64 s[30:31] 2400; 2401; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 2402; GFX908: ; %bb.0: 2403; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2404; GFX908-NEXT: v_mov_b32_e32 v2, v0 2405; GFX908-NEXT: v_mov_b32_e32 v0, s20 2406; GFX908-NEXT: v_mov_b32_e32 v3, v1 2407; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 2408; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 2409; GFX908-NEXT: s_add_i32 s6, s20, 0x800 2410; GFX908-NEXT: s_mov_b64 s[4:5], 0 2411; GFX908-NEXT: v_mov_b32_e32 v6, s6 2412; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start 2413; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2414; GFX908-NEXT: 
s_waitcnt vmcnt(0) 2415; GFX908-NEXT: v_mov_b32_e32 v10, v1 2416; GFX908-NEXT: v_mov_b32_e32 v9, v0 2417; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] 2418; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] 2419; GFX908-NEXT: v_mov_b32_e32 v0, v7 2420; GFX908-NEXT: v_mov_b32_e32 v1, v8 2421; GFX908-NEXT: v_mov_b32_e32 v2, v9 2422; GFX908-NEXT: v_mov_b32_e32 v3, v10 2423; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 2424; GFX908-NEXT: s_waitcnt vmcnt(0) 2425; GFX908-NEXT: buffer_wbinvl1 2426; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 2427; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2428; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2429; GFX908-NEXT: s_cbranch_execnz .LBB9_1 2430; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2431; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2432; GFX908-NEXT: s_setpc_b64 s[30:31] 2433; 2434; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 2435; GFX8: ; %bb.0: 2436; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2437; GFX8-NEXT: v_mov_b32_e32 v2, v0 2438; GFX8-NEXT: v_mov_b32_e32 v0, s20 2439; GFX8-NEXT: v_mov_b32_e32 v3, v1 2440; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 2441; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 2442; GFX8-NEXT: s_add_i32 s6, s20, 0x800 2443; GFX8-NEXT: s_mov_b64 s[4:5], 0 2444; GFX8-NEXT: v_mov_b32_e32 v6, s6 2445; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start 2446; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2447; GFX8-NEXT: s_waitcnt vmcnt(0) 2448; GFX8-NEXT: v_mov_b32_e32 v10, v1 2449; GFX8-NEXT: v_mov_b32_e32 v9, v0 2450; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] 2451; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] 2452; GFX8-NEXT: v_mov_b32_e32 v0, v7 2453; GFX8-NEXT: v_mov_b32_e32 v1, v8 2454; GFX8-NEXT: v_mov_b32_e32 v2, v9 2455; GFX8-NEXT: v_mov_b32_e32 v3, v10 2456; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 2457; GFX8-NEXT: s_waitcnt vmcnt(0) 
2458; GFX8-NEXT: buffer_wbinvl1 2459; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 2460; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2461; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2462; GFX8-NEXT: s_cbranch_execnz .LBB9_1 2463; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2464; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2465; GFX8-NEXT: s_setpc_b64 s[30:31] 2466; 2467; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 2468; GFX7: ; %bb.0: 2469; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2470; GFX7-NEXT: v_mov_b32_e32 v2, s20 2471; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc 2472; GFX7-NEXT: s_waitcnt vmcnt(0) 2473; GFX7-NEXT: buffer_wbinvl1 2474; GFX7-NEXT: s_setpc_b64 s[30:31] 2475; 2476; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 2477; GFX6: ; %bb.0: 2478; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2479; GFX6-NEXT: v_mov_b32_e32 v2, s20 2480; GFX6-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[16:19], 0 offen offset:2048 glc 2481; GFX6-NEXT: s_waitcnt vmcnt(0) 2482; GFX6-NEXT: buffer_wbinvl1 2483; GFX6-NEXT: s_waitcnt expcnt(0) 2484; GFX6-NEXT: s_setpc_b64 s[30:31] 2485 %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 2486 %result = atomicrmw fmin ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 2487 ret double %result 2488} 2489 2490; -------------------------------------------------------------------- 2491; half 2492; -------------------------------------------------------------------- 2493 2494define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { 2495; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: 2496; GFX12: ; %bb.0: 2497; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2498; 
GFX12-NEXT: s_wait_expcnt 0x0 2499; GFX12-NEXT: s_wait_samplecnt 0x0 2500; GFX12-NEXT: s_wait_bvhcnt 0x0 2501; GFX12-NEXT: s_wait_kmcnt 0x0 2502; GFX12-NEXT: s_addk_co_i32 s16, 0x200 2503; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0 2504; GFX12-NEXT: s_wait_alu 0xfffe 2505; GFX12-NEXT: s_and_b32 s4, s16, -4 2506; GFX12-NEXT: s_wait_alu 0xfffe 2507; GFX12-NEXT: v_mov_b32_e32 v4, s4 2508; GFX12-NEXT: s_and_b32 s4, s16, 3 2509; GFX12-NEXT: s_wait_alu 0xfffe 2510; GFX12-NEXT: s_lshl_b32 s4, s4, 3 2511; GFX12-NEXT: s_wait_alu 0xfffe 2512; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 2513; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen 2514; GFX12-NEXT: s_wait_alu 0xfffe 2515; GFX12-NEXT: s_not_b32 s6, s5 2516; GFX12-NEXT: s_mov_b32 s5, 0 2517; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start 2518; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2519; GFX12-NEXT: s_wait_loadcnt 0x0 2520; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 2521; GFX12-NEXT: s_wait_storecnt 0x0 2522; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2523; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 2524; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v5 2525; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2526; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 2527; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 2528; GFX12-NEXT: s_wait_alu 0xfffe 2529; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2530; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 2531; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 2532; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN 2533; GFX12-NEXT: s_wait_loadcnt 0x0 2534; GFX12-NEXT: global_inv scope:SCOPE_DEV 2535; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 2536; GFX12-NEXT: v_mov_b32_e32 v1, v2 2537; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 2538; GFX12-NEXT: s_wait_alu 0xfffe 2539; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 2540; GFX12-NEXT: s_cbranch_execnz 
.LBB10_1 2541; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2542; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 2543; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 2544; GFX12-NEXT: s_wait_alu 0xfffe 2545; GFX12-NEXT: s_setpc_b64 s[30:31] 2546; 2547; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: 2548; GFX940: ; %bb.0: 2549; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2550; GFX940-NEXT: s_addk_i32 s16, 0x200 2551; GFX940-NEXT: s_and_b32 s4, s16, -4 2552; GFX940-NEXT: v_mov_b32_e32 v4, s4 2553; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen 2554; GFX940-NEXT: s_and_b32 s4, s16, 3 2555; GFX940-NEXT: s_lshl_b32 s6, s4, 3 2556; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 2557; GFX940-NEXT: s_not_b32 s7, s4 2558; GFX940-NEXT: s_mov_b64 s[4:5], 0 2559; GFX940-NEXT: v_max_f16_e32 v5, v0, v0 2560; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start 2561; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 2562; GFX940-NEXT: s_waitcnt vmcnt(0) 2563; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 2564; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 2565; GFX940-NEXT: v_min_f16_e32 v0, v0, v5 2566; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 2567; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 2568; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] 2569; GFX940-NEXT: buffer_wbl2 sc1 2570; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 2571; GFX940-NEXT: s_waitcnt vmcnt(0) 2572; GFX940-NEXT: buffer_inv sc1 2573; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 2574; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2575; GFX940-NEXT: v_mov_b32_e32 v1, v2 2576; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] 2577; GFX940-NEXT: s_cbranch_execnz .LBB10_1 2578; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 2579; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] 2580; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 2581; GFX940-NEXT: s_setpc_b64 s[30:31] 2582; 2583; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: 2584; GFX11: ; %bb.0: 2585; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2586; GFX11-NEXT: s_addk_i32 s16, 0x200 2587; GFX11-NEXT: v_max_f16_e32 v5, v0, v0 2588; GFX11-NEXT: s_and_b32 s4, s16, -4 2589; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 2590; GFX11-NEXT: v_mov_b32_e32 v4, s4 2591; GFX11-NEXT: s_and_b32 s4, s16, 3 2592; GFX11-NEXT: s_lshl_b32 s4, s4, 3 2593; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2594; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 2595; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen 2596; GFX11-NEXT: s_not_b32 s6, s5 2597; GFX11-NEXT: s_mov_b32 s5, 0 2598; GFX11-NEXT: .p2align 6 2599; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start 2600; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2601; GFX11-NEXT: s_waitcnt vmcnt(0) 2602; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 2603; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2604; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2605; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 2606; GFX11-NEXT: v_min_f16_e32 v0, v0, v5 2607; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2608; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 2609; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 2610; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2611; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 2612; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 2613; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc 2614; GFX11-NEXT: s_waitcnt vmcnt(0) 2615; GFX11-NEXT: buffer_gl1_inv 2616; GFX11-NEXT: buffer_gl0_inv 2617; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 2618; GFX11-NEXT: v_mov_b32_e32 v1, v2 2619; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 2620; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2621; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 2622; GFX11-NEXT: s_cbranch_execnz .LBB10_1 2623; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2624; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 2625; GFX11-NEXT: 
v_lshrrev_b32_e32 v0, s4, v2 2626; GFX11-NEXT: s_setpc_b64 s[30:31] 2627; 2628; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: 2629; GFX10: ; %bb.0: 2630; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2631; GFX10-NEXT: s_addk_i32 s20, 0x200 2632; GFX10-NEXT: v_max_f16_e32 v5, v0, v0 2633; GFX10-NEXT: s_and_b32 s4, s20, -4 2634; GFX10-NEXT: v_mov_b32_e32 v4, s4 2635; GFX10-NEXT: s_and_b32 s4, s20, 3 2636; GFX10-NEXT: s_lshl_b32 s4, s4, 3 2637; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 2638; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 2639; GFX10-NEXT: s_not_b32 s6, s5 2640; GFX10-NEXT: s_mov_b32 s5, 0 2641; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start 2642; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 2643; GFX10-NEXT: s_waitcnt vmcnt(0) 2644; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 2645; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2646; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 2647; GFX10-NEXT: v_min_f16_e32 v0, v0, v5 2648; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 2649; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 2650; GFX10-NEXT: v_mov_b32_e32 v3, v1 2651; GFX10-NEXT: v_mov_b32_e32 v2, v0 2652; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 2653; GFX10-NEXT: s_waitcnt vmcnt(0) 2654; GFX10-NEXT: buffer_gl1_inv 2655; GFX10-NEXT: buffer_gl0_inv 2656; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 2657; GFX10-NEXT: v_mov_b32_e32 v1, v2 2658; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 2659; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 2660; GFX10-NEXT: s_cbranch_execnz .LBB10_1 2661; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 2662; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 2663; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 2664; GFX10-NEXT: s_setpc_b64 s[30:31] 2665; 2666; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: 2667; GFX90A: ; %bb.0: 2668; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2669; 
GFX90A-NEXT: s_addk_i32 s20, 0x200 2670; GFX90A-NEXT: s_and_b32 s4, s20, -4 2671; GFX90A-NEXT: v_mov_b32_e32 v4, s4 2672; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 2673; GFX90A-NEXT: s_and_b32 s4, s20, 3 2674; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 2675; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 2676; GFX90A-NEXT: s_not_b32 s7, s4 2677; GFX90A-NEXT: s_mov_b64 s[4:5], 0 2678; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0 2679; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start 2680; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 2681; GFX90A-NEXT: s_waitcnt vmcnt(0) 2682; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 2683; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 2684; GFX90A-NEXT: v_min_f16_e32 v0, v0, v5 2685; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 2686; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 2687; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] 2688; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 2689; GFX90A-NEXT: s_waitcnt vmcnt(0) 2690; GFX90A-NEXT: buffer_wbinvl1 2691; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 2692; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2693; GFX90A-NEXT: v_mov_b32_e32 v1, v2 2694; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 2695; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 2696; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 2697; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2698; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 2699; GFX90A-NEXT: s_setpc_b64 s[30:31] 2700; 2701; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: 2702; GFX908: ; %bb.0: 2703; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2704; GFX908-NEXT: s_addk_i32 s20, 0x200 2705; GFX908-NEXT: s_and_b32 s4, s20, -4 2706; GFX908-NEXT: v_mov_b32_e32 v4, s4 2707; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 2708; GFX908-NEXT: s_and_b32 s4, s20, 3 2709; GFX908-NEXT: s_lshl_b32 s6, s4, 3 2710; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 2711; GFX908-NEXT: s_not_b32 s7, s4 2712; GFX908-NEXT: s_mov_b64 s[4:5], 0 
2713; GFX908-NEXT: v_max_f16_e32 v5, v0, v0 2714; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start 2715; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2716; GFX908-NEXT: s_waitcnt vmcnt(0) 2717; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 2718; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 2719; GFX908-NEXT: v_min_f16_e32 v0, v0, v5 2720; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 2721; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 2722; GFX908-NEXT: v_mov_b32_e32 v3, v1 2723; GFX908-NEXT: v_mov_b32_e32 v2, v0 2724; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 2725; GFX908-NEXT: s_waitcnt vmcnt(0) 2726; GFX908-NEXT: buffer_wbinvl1 2727; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 2728; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2729; GFX908-NEXT: v_mov_b32_e32 v1, v2 2730; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2731; GFX908-NEXT: s_cbranch_execnz .LBB10_1 2732; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2733; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2734; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 2735; GFX908-NEXT: s_setpc_b64 s[30:31] 2736; 2737; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: 2738; GFX8: ; %bb.0: 2739; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2740; GFX8-NEXT: s_addk_i32 s20, 0x200 2741; GFX8-NEXT: s_and_b32 s4, s20, -4 2742; GFX8-NEXT: v_mov_b32_e32 v4, s4 2743; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 2744; GFX8-NEXT: s_and_b32 s4, s20, 3 2745; GFX8-NEXT: s_lshl_b32 s6, s4, 3 2746; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 2747; GFX8-NEXT: s_not_b32 s7, s4 2748; GFX8-NEXT: s_mov_b64 s[4:5], 0 2749; GFX8-NEXT: v_max_f16_e32 v5, v0, v0 2750; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start 2751; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2752; GFX8-NEXT: s_waitcnt vmcnt(0) 2753; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 2754; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 2755; GFX8-NEXT: v_min_f16_e32 v0, v0, v5 2756; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 2757; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 
2758; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 2759; GFX8-NEXT: v_mov_b32_e32 v3, v1 2760; GFX8-NEXT: v_mov_b32_e32 v2, v0 2761; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 2762; GFX8-NEXT: s_waitcnt vmcnt(0) 2763; GFX8-NEXT: buffer_wbinvl1 2764; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 2765; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2766; GFX8-NEXT: v_mov_b32_e32 v1, v2 2767; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2768; GFX8-NEXT: s_cbranch_execnz .LBB10_1 2769; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2770; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2771; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 2772; GFX8-NEXT: s_setpc_b64 s[30:31] 2773; 2774; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: 2775; GFX7: ; %bb.0: 2776; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2777; GFX7-NEXT: s_addk_i32 s20, 0x200 2778; GFX7-NEXT: s_and_b32 s4, s20, -4 2779; GFX7-NEXT: v_mov_b32_e32 v4, s4 2780; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 2781; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 2782; GFX7-NEXT: s_and_b32 s4, s20, 3 2783; GFX7-NEXT: s_lshl_b32 s6, s4, 3 2784; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 2785; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 2786; GFX7-NEXT: s_not_b32 s7, s4 2787; GFX7-NEXT: s_mov_b64 s[4:5], 0 2788; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start 2789; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 2790; GFX7-NEXT: s_waitcnt vmcnt(0) 2791; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 2792; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 2793; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 2794; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 2795; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 2796; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 2797; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 2798; GFX7-NEXT: v_mov_b32_e32 v3, v1 2799; GFX7-NEXT: v_mov_b32_e32 v2, v0 2800; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 2801; GFX7-NEXT: s_waitcnt vmcnt(0) 2802; GFX7-NEXT: buffer_wbinvl1 2803; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 2804; 
GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2805; GFX7-NEXT: v_mov_b32_e32 v1, v2 2806; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 2807; GFX7-NEXT: s_cbranch_execnz .LBB10_1 2808; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 2809; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 2810; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 2811; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 2812; GFX7-NEXT: s_setpc_b64 s[30:31] 2813; 2814; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory: 2815; GFX6: ; %bb.0: 2816; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2817; GFX6-NEXT: s_addk_i32 s20, 0x200 2818; GFX6-NEXT: s_and_b32 s4, s20, -4 2819; GFX6-NEXT: v_mov_b32_e32 v4, s4 2820; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 2821; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 2822; GFX6-NEXT: s_and_b32 s4, s20, 3 2823; GFX6-NEXT: s_lshl_b32 s6, s4, 3 2824; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 2825; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 2826; GFX6-NEXT: s_not_b32 s7, s4 2827; GFX6-NEXT: s_mov_b64 s[4:5], 0 2828; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start 2829; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 2830; GFX6-NEXT: s_waitcnt vmcnt(0) 2831; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 2832; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 2833; GFX6-NEXT: s_waitcnt expcnt(0) 2834; GFX6-NEXT: v_and_b32_e32 v2, s7, v1 2835; GFX6-NEXT: v_min_f32_e32 v0, v0, v5 2836; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 2837; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 2838; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 2839; GFX6-NEXT: v_mov_b32_e32 v3, v1 2840; GFX6-NEXT: v_mov_b32_e32 v2, v0 2841; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 2842; GFX6-NEXT: s_waitcnt vmcnt(0) 2843; GFX6-NEXT: buffer_wbinvl1 2844; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 2845; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2846; GFX6-NEXT: v_mov_b32_e32 v1, v2 2847; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 2848; GFX6-NEXT: s_cbranch_execnz .LBB10_1 2849; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 2850; GFX6-NEXT: s_or_b64 
exec, exec, s[4:5] 2851; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2 2852; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 2853; GFX6-NEXT: s_waitcnt expcnt(0) 2854; GFX6-NEXT: s_setpc_b64 s[30:31] 2855 %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 2856 %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 2857 ret half %result 2858} 2859 2860define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 { 2861; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: 2862; GFX12: ; %bb.0: 2863; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2864; GFX12-NEXT: s_wait_expcnt 0x0 2865; GFX12-NEXT: s_wait_samplecnt 0x0 2866; GFX12-NEXT: s_wait_bvhcnt 0x0 2867; GFX12-NEXT: s_wait_kmcnt 0x0 2868; GFX12-NEXT: s_addk_co_i32 s16, 0x200 2869; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0 2870; GFX12-NEXT: s_wait_alu 0xfffe 2871; GFX12-NEXT: s_and_b32 s4, s16, -4 2872; GFX12-NEXT: s_wait_alu 0xfffe 2873; GFX12-NEXT: v_mov_b32_e32 v2, s4 2874; GFX12-NEXT: s_and_b32 s4, s16, 3 2875; GFX12-NEXT: s_wait_alu 0xfffe 2876; GFX12-NEXT: s_lshl_b32 s4, s4, 3 2877; GFX12-NEXT: s_wait_alu 0xfffe 2878; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 2879; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen 2880; GFX12-NEXT: s_wait_alu 0xfffe 2881; GFX12-NEXT: s_not_b32 s6, s5 2882; GFX12-NEXT: s_mov_b32 s5, 0 2883; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start 2884; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 2885; GFX12-NEXT: s_wait_loadcnt 0x0 2886; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 2887; GFX12-NEXT: s_wait_storecnt 0x0 2888; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2889; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 2890; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v3 2891; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2892; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, 
v0 2893; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 2894; GFX12-NEXT: s_wait_alu 0xfffe 2895; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2896; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 2897; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 2898; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN 2899; GFX12-NEXT: s_wait_loadcnt 0x0 2900; GFX12-NEXT: global_inv scope:SCOPE_DEV 2901; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 2902; GFX12-NEXT: v_mov_b32_e32 v1, v4 2903; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 2904; GFX12-NEXT: s_wait_alu 0xfffe 2905; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 2906; GFX12-NEXT: s_cbranch_execnz .LBB11_1 2907; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2908; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 2909; GFX12-NEXT: s_wait_alu 0xfffe 2910; GFX12-NEXT: s_setpc_b64 s[30:31] 2911; 2912; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: 2913; GFX940: ; %bb.0: 2914; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2915; GFX940-NEXT: s_addk_i32 s16, 0x200 2916; GFX940-NEXT: s_and_b32 s4, s16, -4 2917; GFX940-NEXT: v_mov_b32_e32 v2, s4 2918; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen 2919; GFX940-NEXT: s_and_b32 s4, s16, 3 2920; GFX940-NEXT: s_lshl_b32 s6, s4, 3 2921; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 2922; GFX940-NEXT: s_not_b32 s7, s4 2923; GFX940-NEXT: s_mov_b64 s[4:5], 0 2924; GFX940-NEXT: v_max_f16_e32 v3, v0, v0 2925; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start 2926; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 2927; GFX940-NEXT: s_waitcnt vmcnt(0) 2928; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1 2929; GFX940-NEXT: v_max_f16_e32 v0, v0, v0 2930; GFX940-NEXT: v_min_f16_e32 v0, v0, v3 2931; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0 2932; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 2933; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] 2934; GFX940-NEXT: buffer_wbl2 sc1 2935; GFX940-NEXT: 
buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 2936; GFX940-NEXT: s_waitcnt vmcnt(0) 2937; GFX940-NEXT: buffer_inv sc1 2938; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 2939; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2940; GFX940-NEXT: v_mov_b32_e32 v1, v4 2941; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] 2942; GFX940-NEXT: s_cbranch_execnz .LBB11_1 2943; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 2944; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] 2945; GFX940-NEXT: s_setpc_b64 s[30:31] 2946; 2947; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: 2948; GFX11: ; %bb.0: 2949; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2950; GFX11-NEXT: s_addk_i32 s16, 0x200 2951; GFX11-NEXT: v_max_f16_e32 v3, v0, v0 2952; GFX11-NEXT: s_and_b32 s4, s16, -4 2953; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 2954; GFX11-NEXT: v_mov_b32_e32 v2, s4 2955; GFX11-NEXT: s_and_b32 s4, s16, 3 2956; GFX11-NEXT: s_lshl_b32 s4, s4, 3 2957; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2958; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 2959; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen 2960; GFX11-NEXT: s_not_b32 s6, s5 2961; GFX11-NEXT: s_mov_b32 s5, 0 2962; GFX11-NEXT: .p2align 6 2963; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start 2964; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2965; GFX11-NEXT: s_waitcnt vmcnt(0) 2966; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 2967; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2968; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2969; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 2970; GFX11-NEXT: v_min_f16_e32 v0, v0, v3 2971; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2972; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 2973; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 2974; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2975; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 2976; 
GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 2977; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc 2978; GFX11-NEXT: s_waitcnt vmcnt(0) 2979; GFX11-NEXT: buffer_gl1_inv 2980; GFX11-NEXT: buffer_gl0_inv 2981; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 2982; GFX11-NEXT: v_mov_b32_e32 v1, v4 2983; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 2984; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2985; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 2986; GFX11-NEXT: s_cbranch_execnz .LBB11_1 2987; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2988; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 2989; GFX11-NEXT: s_setpc_b64 s[30:31] 2990; 2991; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: 2992; GFX10: ; %bb.0: 2993; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2994; GFX10-NEXT: s_addk_i32 s20, 0x200 2995; GFX10-NEXT: v_max_f16_e32 v3, v0, v0 2996; GFX10-NEXT: s_and_b32 s4, s20, -4 2997; GFX10-NEXT: v_mov_b32_e32 v2, s4 2998; GFX10-NEXT: s_and_b32 s4, s20, 3 2999; GFX10-NEXT: s_lshl_b32 s4, s4, 3 3000; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 3001; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 3002; GFX10-NEXT: s_not_b32 s6, s5 3003; GFX10-NEXT: s_mov_b32 s5, 0 3004; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start 3005; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 3006; GFX10-NEXT: s_waitcnt vmcnt(0) 3007; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1 3008; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3009; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 3010; GFX10-NEXT: v_min_f16_e32 v0, v0, v3 3011; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 3012; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 3013; GFX10-NEXT: v_mov_b32_e32 v5, v1 3014; GFX10-NEXT: v_mov_b32_e32 v4, v0 3015; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc 3016; GFX10-NEXT: s_waitcnt vmcnt(0) 3017; GFX10-NEXT: buffer_gl1_inv 3018; GFX10-NEXT: buffer_gl0_inv 3019; 
GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 3020; GFX10-NEXT: v_mov_b32_e32 v1, v4 3021; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 3022; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 3023; GFX10-NEXT: s_cbranch_execnz .LBB11_1 3024; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 3025; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 3026; GFX10-NEXT: s_setpc_b64 s[30:31] 3027; 3028; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: 3029; GFX90A: ; %bb.0: 3030; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3031; GFX90A-NEXT: s_addk_i32 s20, 0x200 3032; GFX90A-NEXT: s_and_b32 s4, s20, -4 3033; GFX90A-NEXT: v_mov_b32_e32 v2, s4 3034; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 3035; GFX90A-NEXT: s_and_b32 s4, s20, 3 3036; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 3037; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 3038; GFX90A-NEXT: s_not_b32 s7, s4 3039; GFX90A-NEXT: s_mov_b64 s[4:5], 0 3040; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0 3041; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start 3042; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 3043; GFX90A-NEXT: s_waitcnt vmcnt(0) 3044; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1 3045; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0 3046; GFX90A-NEXT: v_min_f16_e32 v0, v0, v3 3047; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0 3048; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 3049; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] 3050; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc 3051; GFX90A-NEXT: s_waitcnt vmcnt(0) 3052; GFX90A-NEXT: buffer_wbinvl1 3053; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 3054; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3055; GFX90A-NEXT: v_mov_b32_e32 v1, v4 3056; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 3057; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 3058; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 3059; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3060; GFX90A-NEXT: s_setpc_b64 s[30:31] 3061; 3062; GFX908-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: 3063; GFX908: ; %bb.0: 3064; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3065; GFX908-NEXT: s_addk_i32 s20, 0x200 3066; GFX908-NEXT: s_and_b32 s4, s20, -4 3067; GFX908-NEXT: v_mov_b32_e32 v2, s4 3068; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 3069; GFX908-NEXT: s_and_b32 s4, s20, 3 3070; GFX908-NEXT: s_lshl_b32 s6, s4, 3 3071; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 3072; GFX908-NEXT: s_not_b32 s7, s4 3073; GFX908-NEXT: s_mov_b64 s[4:5], 0 3074; GFX908-NEXT: v_max_f16_e32 v3, v0, v0 3075; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start 3076; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 3077; GFX908-NEXT: s_waitcnt vmcnt(0) 3078; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1 3079; GFX908-NEXT: v_max_f16_e32 v0, v0, v0 3080; GFX908-NEXT: v_min_f16_e32 v0, v0, v3 3081; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0 3082; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 3083; GFX908-NEXT: v_mov_b32_e32 v5, v1 3084; GFX908-NEXT: v_mov_b32_e32 v4, v0 3085; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc 3086; GFX908-NEXT: s_waitcnt vmcnt(0) 3087; GFX908-NEXT: buffer_wbinvl1 3088; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 3089; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3090; GFX908-NEXT: v_mov_b32_e32 v1, v4 3091; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 3092; GFX908-NEXT: s_cbranch_execnz .LBB11_1 3093; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 3094; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3095; GFX908-NEXT: s_setpc_b64 s[30:31] 3096; 3097; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: 3098; GFX8: ; %bb.0: 3099; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3100; GFX8-NEXT: s_addk_i32 s20, 0x200 3101; GFX8-NEXT: s_and_b32 s4, s20, -4 3102; GFX8-NEXT: v_mov_b32_e32 v2, s4 3103; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 3104; GFX8-NEXT: s_and_b32 s4, s20, 3 3105; GFX8-NEXT: s_lshl_b32 s6, s4, 3 3106; 
GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 3107; GFX8-NEXT: s_not_b32 s7, s4 3108; GFX8-NEXT: s_mov_b64 s[4:5], 0 3109; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 3110; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start 3111; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 3112; GFX8-NEXT: s_waitcnt vmcnt(0) 3113; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1 3114; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 3115; GFX8-NEXT: v_min_f16_e32 v0, v0, v3 3116; GFX8-NEXT: v_and_b32_e32 v4, s7, v1 3117; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0 3118; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 3119; GFX8-NEXT: v_mov_b32_e32 v5, v1 3120; GFX8-NEXT: v_mov_b32_e32 v4, v0 3121; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc 3122; GFX8-NEXT: s_waitcnt vmcnt(0) 3123; GFX8-NEXT: buffer_wbinvl1 3124; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 3125; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3126; GFX8-NEXT: v_mov_b32_e32 v1, v4 3127; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 3128; GFX8-NEXT: s_cbranch_execnz .LBB11_1 3129; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 3130; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3131; GFX8-NEXT: s_setpc_b64 s[30:31] 3132; 3133; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: 3134; GFX7: ; %bb.0: 3135; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3136; GFX7-NEXT: s_addk_i32 s20, 0x200 3137; GFX7-NEXT: s_and_b32 s4, s20, -4 3138; GFX7-NEXT: v_mov_b32_e32 v2, s4 3139; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 3140; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 3141; GFX7-NEXT: s_and_b32 s4, s20, 3 3142; GFX7-NEXT: s_lshl_b32 s6, s4, 3 3143; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 3144; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 3145; GFX7-NEXT: s_not_b32 s7, s4 3146; GFX7-NEXT: s_mov_b64 s[4:5], 0 3147; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start 3148; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 3149; GFX7-NEXT: s_waitcnt vmcnt(0) 3150; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 3151; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 3152; GFX7-NEXT: v_and_b32_e32 
v4, s7, v1 3153; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 3154; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 3155; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 3156; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 3157; GFX7-NEXT: v_mov_b32_e32 v5, v1 3158; GFX7-NEXT: v_mov_b32_e32 v4, v0 3159; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc 3160; GFX7-NEXT: s_waitcnt vmcnt(0) 3161; GFX7-NEXT: buffer_wbinvl1 3162; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 3163; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3164; GFX7-NEXT: v_mov_b32_e32 v1, v4 3165; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 3166; GFX7-NEXT: s_cbranch_execnz .LBB11_1 3167; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 3168; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 3169; GFX7-NEXT: s_setpc_b64 s[30:31] 3170; 3171; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_grained_memory: 3172; GFX6: ; %bb.0: 3173; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3174; GFX6-NEXT: s_addk_i32 s20, 0x200 3175; GFX6-NEXT: s_and_b32 s4, s20, -4 3176; GFX6-NEXT: v_mov_b32_e32 v2, s4 3177; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 3178; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 3179; GFX6-NEXT: s_and_b32 s4, s20, 3 3180; GFX6-NEXT: s_lshl_b32 s6, s4, 3 3181; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6 3182; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 3183; GFX6-NEXT: s_not_b32 s7, s4 3184; GFX6-NEXT: s_mov_b64 s[4:5], 0 3185; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start 3186; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 3187; GFX6-NEXT: s_waitcnt vmcnt(0) 3188; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1 3189; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 3190; GFX6-NEXT: s_waitcnt expcnt(0) 3191; GFX6-NEXT: v_and_b32_e32 v4, s7, v1 3192; GFX6-NEXT: v_min_f32_e32 v0, v0, v3 3193; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 3194; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 3195; GFX6-NEXT: v_or_b32_e32 v0, v4, v0 3196; GFX6-NEXT: v_mov_b32_e32 v5, v1 3197; GFX6-NEXT: v_mov_b32_e32 v4, v0 3198; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen 
glc 3199; GFX6-NEXT: s_waitcnt vmcnt(0) 3200; GFX6-NEXT: buffer_wbinvl1 3201; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 3202; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3203; GFX6-NEXT: v_mov_b32_e32 v1, v4 3204; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 3205; GFX6-NEXT: s_cbranch_execnz .LBB11_1 3206; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 3207; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 3208; GFX6-NEXT: s_waitcnt expcnt(0) 3209; GFX6-NEXT: s_setpc_b64 s[30:31] 3210 %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 3211 %unused = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 3212 ret void 3213} 3214 3215define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 { 3216; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 3217; GFX12: ; %bb.0: 3218; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3219; GFX12-NEXT: s_wait_expcnt 0x0 3220; GFX12-NEXT: s_wait_samplecnt 0x0 3221; GFX12-NEXT: s_wait_bvhcnt 0x0 3222; GFX12-NEXT: s_wait_kmcnt 0x0 3223; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 3224; GFX12-NEXT: s_mov_b32 s1, exec_lo 3225; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3226; GFX12-NEXT: v_and_b32_e32 v6, 3, v4 3227; GFX12-NEXT: v_and_b32_e32 v8, -4, v4 3228; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6 3229; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3230; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff 3231; GFX12-NEXT: v_not_b32_e32 v9, v6 3232; GFX12-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 3233; GFX12-NEXT: v_readfirstlane_b32 s4, v0 3234; GFX12-NEXT: v_readfirstlane_b32 s5, v1 3235; GFX12-NEXT: v_readfirstlane_b32 s6, v2 3236; GFX12-NEXT: v_readfirstlane_b32 s7, v3 3237; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 3238; GFX12-NEXT: v_cmp_eq_u64_e32 
vcc_lo, s[4:5], v[0:1] 3239; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 3240; GFX12-NEXT: s_wait_alu 0xfffe 3241; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3242; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 3243; GFX12-NEXT: s_wait_alu 0xfffe 3244; GFX12-NEXT: s_and_saveexec_b32 s0, s0 3245; GFX12-NEXT: s_wait_loadcnt 0x0 3246; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen 3247; GFX12-NEXT: s_wait_alu 0xfffe 3248; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 3249; GFX12-NEXT: s_cbranch_execnz .LBB12_1 3250; GFX12-NEXT: ; %bb.2: 3251; GFX12-NEXT: s_mov_b32 exec_lo, s1 3252; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5 3253; GFX12-NEXT: s_mov_b32 s1, 0 3254; GFX12-NEXT: .LBB12_3: ; %atomicrmw.start 3255; GFX12-NEXT: ; =>This Loop Header: Depth=1 3256; GFX12-NEXT: ; Child Loop BB12_4 Depth 2 3257; GFX12-NEXT: s_wait_loadcnt 0x0 3258; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 3259; GFX12-NEXT: s_mov_b32 s2, exec_lo 3260; GFX12-NEXT: s_wait_storecnt 0x0 3261; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3262; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4 3263; GFX12-NEXT: v_min_num_f16_e32 v4, v4, v10 3264; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3265; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4 3266; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 3267; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3268; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 3269; GFX12-NEXT: v_mov_b32_e32 v4, v5 3270; GFX12-NEXT: v_mov_b32_e32 v5, v6 3271; GFX12-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 3272; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 3273; GFX12-NEXT: v_readfirstlane_b32 s4, v0 3274; GFX12-NEXT: v_readfirstlane_b32 s5, v1 3275; GFX12-NEXT: v_readfirstlane_b32 s6, v2 3276; GFX12-NEXT: v_readfirstlane_b32 s7, v3 3277; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 3278; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 3279; 
GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 3280; GFX12-NEXT: s_wait_alu 0xfffe 3281; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3282; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 3283; GFX12-NEXT: s_wait_alu 0xfffe 3284; GFX12-NEXT: s_and_saveexec_b32 s0, s0 3285; GFX12-NEXT: s_wait_loadcnt 0x0 3286; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN 3287; GFX12-NEXT: s_wait_alu 0xfffe 3288; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 3289; GFX12-NEXT: s_cbranch_execnz .LBB12_4 3290; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 3291; GFX12-NEXT: s_mov_b32 exec_lo, s2 3292; GFX12-NEXT: s_wait_loadcnt 0x0 3293; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 3294; GFX12-NEXT: v_mov_b32_e32 v6, v4 3295; GFX12-NEXT: global_inv scope:SCOPE_DEV 3296; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 3297; GFX12-NEXT: s_wait_alu 0xfffe 3298; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 3299; GFX12-NEXT: s_cbranch_execnz .LBB12_3 3300; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end 3301; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 3302; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 3303; GFX12-NEXT: s_wait_alu 0xfffe 3304; GFX12-NEXT: s_setpc_b64 s[30:31] 3305; 3306; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 3307; GFX940: ; %bb.0: 3308; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3309; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 3310; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 3311; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 3312; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 3313; GFX940-NEXT: s_mov_b32 s0, 0xffff 3314; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 3315; GFX940-NEXT: v_not_b32_e32 v10, v4 3316; GFX940-NEXT: s_mov_b64 s[2:3], exec 3317; GFX940-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 3318; GFX940-NEXT: v_readfirstlane_b32 s4, v0 3319; GFX940-NEXT: v_readfirstlane_b32 s5, v1 3320; GFX940-NEXT: v_readfirstlane_b32 s6, v2 3321; GFX940-NEXT: v_readfirstlane_b32 s7, v3 3322; GFX940-NEXT: 
v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 3323; GFX940-NEXT: s_nop 0 3324; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 3325; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 3326; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 3327; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen 3328; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 3329; GFX940-NEXT: s_cbranch_execnz .LBB12_1 3330; GFX940-NEXT: ; %bb.2: 3331; GFX940-NEXT: s_mov_b64 exec, s[2:3] 3332; GFX940-NEXT: s_mov_b64 s[2:3], 0 3333; GFX940-NEXT: v_max_f16_e32 v11, v5, v5 3334; GFX940-NEXT: .LBB12_3: ; %atomicrmw.start 3335; GFX940-NEXT: ; =>This Loop Header: Depth=1 3336; GFX940-NEXT: ; Child Loop BB12_4 Depth 2 3337; GFX940-NEXT: s_waitcnt vmcnt(0) 3338; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7 3339; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 3340; GFX940-NEXT: v_min_f16_e32 v4, v4, v11 3341; GFX940-NEXT: v_lshlrev_b32_e32 v4, v8, v4 3342; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 3343; GFX940-NEXT: s_mov_b64 s[8:9], exec 3344; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] 3345; GFX940-NEXT: buffer_wbl2 sc1 3346; GFX940-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 3347; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 3348; GFX940-NEXT: v_readfirstlane_b32 s4, v0 3349; GFX940-NEXT: v_readfirstlane_b32 s5, v1 3350; GFX940-NEXT: v_readfirstlane_b32 s6, v2 3351; GFX940-NEXT: v_readfirstlane_b32 s7, v3 3352; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 3353; GFX940-NEXT: s_nop 0 3354; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 3355; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 3356; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 3357; GFX940-NEXT: s_waitcnt vmcnt(0) 3358; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 3359; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 3360; GFX940-NEXT: s_cbranch_execnz .LBB12_4 3361; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 3362; GFX940-NEXT: s_mov_b64 exec, s[8:9] 3363; GFX940-NEXT: s_waitcnt vmcnt(0) 3364; GFX940-NEXT: v_cmp_eq_u32_e32 
vcc, v4, v7 3365; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 3366; GFX940-NEXT: v_mov_b32_e32 v7, v4 3367; GFX940-NEXT: buffer_inv sc1 3368; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 3369; GFX940-NEXT: s_cbranch_execnz .LBB12_3 3370; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end 3371; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 3372; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 3373; GFX940-NEXT: s_setpc_b64 s[30:31] 3374; 3375; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 3376; GFX11: ; %bb.0: 3377; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3378; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 3379; GFX11-NEXT: s_mov_b32 s1, 0 3380; GFX11-NEXT: s_mov_b32 s2, exec_lo 3381; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 3382; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 3383; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 3384; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 3385; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3386; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff 3387; GFX11-NEXT: v_not_b32_e32 v9, v6 3388; GFX11-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 3389; GFX11-NEXT: v_readfirstlane_b32 s4, v0 3390; GFX11-NEXT: v_readfirstlane_b32 s5, v1 3391; GFX11-NEXT: v_readfirstlane_b32 s6, v2 3392; GFX11-NEXT: v_readfirstlane_b32 s7, v3 3393; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 3394; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 3395; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 3396; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 3397; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 3398; GFX11-NEXT: s_and_saveexec_b32 s0, s0 3399; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen 3400; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 3401; GFX11-NEXT: s_cbranch_execnz .LBB12_1 3402; GFX11-NEXT: ; %bb.2: 3403; GFX11-NEXT: s_mov_b32 exec_lo, s2 3404; GFX11-NEXT: 
v_max_f16_e32 v10, v5, v5 3405; GFX11-NEXT: .p2align 6 3406; GFX11-NEXT: .LBB12_3: ; %atomicrmw.start 3407; GFX11-NEXT: ; =>This Loop Header: Depth=1 3408; GFX11-NEXT: ; Child Loop BB12_4 Depth 2 3409; GFX11-NEXT: s_waitcnt vmcnt(0) 3410; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 3411; GFX11-NEXT: s_mov_b32 s2, exec_lo 3412; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3413; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3414; GFX11-NEXT: v_max_f16_e32 v4, v4, v4 3415; GFX11-NEXT: v_min_f16_e32 v4, v4, v10 3416; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3417; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 3418; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 3419; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3420; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4 3421; GFX11-NEXT: v_mov_b32_e32 v4, v5 3422; GFX11-NEXT: v_mov_b32_e32 v5, v6 3423; GFX11-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 3424; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 3425; GFX11-NEXT: v_readfirstlane_b32 s4, v0 3426; GFX11-NEXT: v_readfirstlane_b32 s5, v1 3427; GFX11-NEXT: v_readfirstlane_b32 s6, v2 3428; GFX11-NEXT: v_readfirstlane_b32 s7, v3 3429; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 3430; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 3431; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 3432; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 3433; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 3434; GFX11-NEXT: s_and_saveexec_b32 s0, s0 3435; GFX11-NEXT: s_waitcnt vmcnt(0) 3436; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc 3437; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 3438; GFX11-NEXT: s_cbranch_execnz .LBB12_4 3439; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 3440; GFX11-NEXT: s_mov_b32 exec_lo, s2 3441; GFX11-NEXT: s_waitcnt vmcnt(0) 3442; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 
3443; GFX11-NEXT: v_mov_b32_e32 v6, v4 3444; GFX11-NEXT: buffer_gl1_inv 3445; GFX11-NEXT: buffer_gl0_inv 3446; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 3447; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3448; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 3449; GFX11-NEXT: s_cbranch_execnz .LBB12_3 3450; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end 3451; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 3452; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 3453; GFX11-NEXT: s_setpc_b64 s[30:31] 3454; 3455; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 3456; GFX10: ; %bb.0: 3457; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3458; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 3459; GFX10-NEXT: s_mov_b32 s5, 0 3460; GFX10-NEXT: s_mov_b32 s6, exec_lo 3461; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 3462; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 3463; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 3464; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff 3465; GFX10-NEXT: v_not_b32_e32 v9, v6 3466; GFX10-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 3467; GFX10-NEXT: v_readfirstlane_b32 s8, v0 3468; GFX10-NEXT: v_readfirstlane_b32 s9, v1 3469; GFX10-NEXT: v_readfirstlane_b32 s10, v2 3470; GFX10-NEXT: v_readfirstlane_b32 s11, v3 3471; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 3472; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 3473; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 3474; GFX10-NEXT: s_and_saveexec_b32 s4, s4 3475; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen 3476; GFX10-NEXT: s_waitcnt_depctr 0xffe3 3477; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 3478; GFX10-NEXT: s_cbranch_execnz .LBB12_1 3479; GFX10-NEXT: ; %bb.2: 3480; GFX10-NEXT: s_mov_b32 exec_lo, s6 3481; GFX10-NEXT: v_max_f16_e32 v10, v5, v5 3482; GFX10-NEXT: .LBB12_3: ; %atomicrmw.start 3483; GFX10-NEXT: ; =>This Loop Header: Depth=1 3484; GFX10-NEXT: ; Child Loop BB12_4 Depth 2 3485; GFX10-NEXT: s_waitcnt vmcnt(0) 3486; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6 
3487; GFX10-NEXT: s_mov_b32 s6, exec_lo 3488; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3489; GFX10-NEXT: v_max_f16_e32 v4, v4, v4 3490; GFX10-NEXT: v_min_f16_e32 v4, v4, v10 3491; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 3492; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 3493; GFX10-NEXT: v_mov_b32_e32 v4, v5 3494; GFX10-NEXT: v_mov_b32_e32 v5, v6 3495; GFX10-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 3496; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 3497; GFX10-NEXT: v_readfirstlane_b32 s8, v0 3498; GFX10-NEXT: v_readfirstlane_b32 s9, v1 3499; GFX10-NEXT: v_readfirstlane_b32 s10, v2 3500; GFX10-NEXT: v_readfirstlane_b32 s11, v3 3501; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 3502; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 3503; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 3504; GFX10-NEXT: s_and_saveexec_b32 s4, s4 3505; GFX10-NEXT: s_waitcnt vmcnt(0) 3506; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 3507; GFX10-NEXT: s_waitcnt_depctr 0xffe3 3508; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 3509; GFX10-NEXT: s_cbranch_execnz .LBB12_4 3510; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 3511; GFX10-NEXT: s_mov_b32 exec_lo, s6 3512; GFX10-NEXT: s_waitcnt vmcnt(0) 3513; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 3514; GFX10-NEXT: v_mov_b32_e32 v6, v4 3515; GFX10-NEXT: buffer_gl1_inv 3516; GFX10-NEXT: buffer_gl0_inv 3517; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 3518; GFX10-NEXT: s_waitcnt_depctr 0xffe3 3519; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 3520; GFX10-NEXT: s_cbranch_execnz .LBB12_3 3521; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end 3522; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 3523; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 3524; GFX10-NEXT: s_setpc_b64 s[30:31] 3525; 3526; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 3527; GFX90A: ; %bb.0: 3528; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
3529; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 3530; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 3531; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 3532; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 3533; GFX90A-NEXT: s_mov_b32 s4, 0xffff 3534; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 3535; GFX90A-NEXT: v_not_b32_e32 v10, v4 3536; GFX90A-NEXT: s_mov_b64 s[6:7], exec 3537; GFX90A-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 3538; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 3539; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 3540; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 3541; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 3542; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 3543; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 3544; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 3545; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 3546; GFX90A-NEXT: s_nop 0 3547; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen 3548; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 3549; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 3550; GFX90A-NEXT: ; %bb.2: 3551; GFX90A-NEXT: s_mov_b64 exec, s[6:7] 3552; GFX90A-NEXT: s_mov_b64 s[6:7], 0 3553; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5 3554; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.start 3555; GFX90A-NEXT: ; =>This Loop Header: Depth=1 3556; GFX90A-NEXT: ; Child Loop BB12_4 Depth 2 3557; GFX90A-NEXT: s_waitcnt vmcnt(0) 3558; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7 3559; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 3560; GFX90A-NEXT: v_min_f16_e32 v4, v4, v11 3561; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4 3562; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 3563; GFX90A-NEXT: s_mov_b64 s[12:13], exec 3564; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] 3565; GFX90A-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 3566; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 3567; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 3568; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 3569; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 3570; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 3571; 
GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 3572; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 3573; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 3574; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 3575; GFX90A-NEXT: s_waitcnt vmcnt(0) 3576; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc 3577; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 3578; GFX90A-NEXT: s_cbranch_execnz .LBB12_4 3579; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 3580; GFX90A-NEXT: s_mov_b64 exec, s[12:13] 3581; GFX90A-NEXT: s_waitcnt vmcnt(0) 3582; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 3583; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3584; GFX90A-NEXT: v_mov_b32_e32 v7, v4 3585; GFX90A-NEXT: buffer_wbinvl1 3586; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 3587; GFX90A-NEXT: s_cbranch_execnz .LBB12_3 3588; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end 3589; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 3590; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 3591; GFX90A-NEXT: s_setpc_b64 s[30:31] 3592; 3593; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 3594; GFX908: ; %bb.0: 3595; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3596; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 3597; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 3598; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 3599; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 3600; GFX908-NEXT: s_mov_b32 s4, 0xffff 3601; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 3602; GFX908-NEXT: v_not_b32_e32 v9, v4 3603; GFX908-NEXT: s_mov_b64 s[6:7], exec 3604; GFX908-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 3605; GFX908-NEXT: v_readfirstlane_b32 s8, v0 3606; GFX908-NEXT: v_readfirstlane_b32 s9, v1 3607; GFX908-NEXT: v_readfirstlane_b32 s10, v2 3608; GFX908-NEXT: v_readfirstlane_b32 s11, v3 3609; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 3610; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 3611; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 3612; GFX908-NEXT: 
s_and_saveexec_b64 s[4:5], s[4:5] 3613; GFX908-NEXT: s_nop 0 3614; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen 3615; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 3616; GFX908-NEXT: s_cbranch_execnz .LBB12_1 3617; GFX908-NEXT: ; %bb.2: 3618; GFX908-NEXT: s_mov_b64 exec, s[6:7] 3619; GFX908-NEXT: s_mov_b64 s[6:7], 0 3620; GFX908-NEXT: v_max_f16_e32 v10, v5, v5 3621; GFX908-NEXT: .LBB12_3: ; %atomicrmw.start 3622; GFX908-NEXT: ; =>This Loop Header: Depth=1 3623; GFX908-NEXT: ; Child Loop BB12_4 Depth 2 3624; GFX908-NEXT: s_waitcnt vmcnt(0) 3625; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6 3626; GFX908-NEXT: v_max_f16_e32 v4, v4, v4 3627; GFX908-NEXT: v_min_f16_e32 v4, v4, v10 3628; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4 3629; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 3630; GFX908-NEXT: v_mov_b32_e32 v4, v5 3631; GFX908-NEXT: s_mov_b64 s[12:13], exec 3632; GFX908-NEXT: v_mov_b32_e32 v5, v6 3633; GFX908-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 3634; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 3635; GFX908-NEXT: v_readfirstlane_b32 s8, v0 3636; GFX908-NEXT: v_readfirstlane_b32 s9, v1 3637; GFX908-NEXT: v_readfirstlane_b32 s10, v2 3638; GFX908-NEXT: v_readfirstlane_b32 s11, v3 3639; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 3640; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 3641; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 3642; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 3643; GFX908-NEXT: s_waitcnt vmcnt(0) 3644; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 3645; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 3646; GFX908-NEXT: s_cbranch_execnz .LBB12_4 3647; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 3648; GFX908-NEXT: s_mov_b64 exec, s[12:13] 3649; GFX908-NEXT: s_waitcnt vmcnt(0) 3650; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 3651; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3652; GFX908-NEXT: v_mov_b32_e32 v6, v4 3653; GFX908-NEXT: buffer_wbinvl1 3654; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 
3655; GFX908-NEXT: s_cbranch_execnz .LBB12_3 3656; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end 3657; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 3658; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 3659; GFX908-NEXT: s_setpc_b64 s[30:31] 3660; 3661; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 3662; GFX8: ; %bb.0: 3663; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3664; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 3665; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 3666; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 3667; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 3668; GFX8-NEXT: s_mov_b32 s4, 0xffff 3669; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 3670; GFX8-NEXT: v_not_b32_e32 v9, v4 3671; GFX8-NEXT: s_mov_b64 s[6:7], exec 3672; GFX8-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 3673; GFX8-NEXT: v_readfirstlane_b32 s8, v0 3674; GFX8-NEXT: v_readfirstlane_b32 s9, v1 3675; GFX8-NEXT: v_readfirstlane_b32 s10, v2 3676; GFX8-NEXT: v_readfirstlane_b32 s11, v3 3677; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 3678; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 3679; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 3680; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 3681; GFX8-NEXT: s_nop 0 3682; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen 3683; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 3684; GFX8-NEXT: s_cbranch_execnz .LBB12_1 3685; GFX8-NEXT: ; %bb.2: 3686; GFX8-NEXT: s_mov_b64 exec, s[6:7] 3687; GFX8-NEXT: s_mov_b64 s[6:7], 0 3688; GFX8-NEXT: v_max_f16_e32 v10, v5, v5 3689; GFX8-NEXT: .LBB12_3: ; %atomicrmw.start 3690; GFX8-NEXT: ; =>This Loop Header: Depth=1 3691; GFX8-NEXT: ; Child Loop BB12_4 Depth 2 3692; GFX8-NEXT: s_waitcnt vmcnt(0) 3693; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6 3694; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 3695; GFX8-NEXT: v_min_f16_e32 v4, v4, v10 3696; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4 3697; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 3698; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 3699; GFX8-NEXT: v_mov_b32_e32 v4, 
v5 3700; GFX8-NEXT: s_mov_b64 s[12:13], exec 3701; GFX8-NEXT: v_mov_b32_e32 v5, v6 3702; GFX8-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 3703; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 3704; GFX8-NEXT: v_readfirstlane_b32 s8, v0 3705; GFX8-NEXT: v_readfirstlane_b32 s9, v1 3706; GFX8-NEXT: v_readfirstlane_b32 s10, v2 3707; GFX8-NEXT: v_readfirstlane_b32 s11, v3 3708; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 3709; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 3710; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 3711; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 3712; GFX8-NEXT: s_waitcnt vmcnt(0) 3713; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 3714; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 3715; GFX8-NEXT: s_cbranch_execnz .LBB12_4 3716; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 3717; GFX8-NEXT: s_mov_b64 exec, s[12:13] 3718; GFX8-NEXT: s_waitcnt vmcnt(0) 3719; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 3720; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3721; GFX8-NEXT: v_mov_b32_e32 v6, v4 3722; GFX8-NEXT: buffer_wbinvl1 3723; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 3724; GFX8-NEXT: s_cbranch_execnz .LBB12_3 3725; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end 3726; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 3727; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 3728; GFX8-NEXT: s_setpc_b64 s[30:31] 3729; 3730; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 3731; GFX7: ; %bb.0: 3732; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3733; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 3734; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 3735; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 3736; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 3737; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 3738; GFX7-NEXT: v_not_b32_e32 v9, v4 3739; GFX7-NEXT: s_mov_b64 s[6:7], exec 3740; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 3741; GFX7-NEXT: v_readfirstlane_b32 s8, v0 3742; GFX7-NEXT: v_readfirstlane_b32 s9, v1 3743; 
GFX7-NEXT: v_readfirstlane_b32 s10, v2 3744; GFX7-NEXT: v_readfirstlane_b32 s11, v3 3745; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 3746; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 3747; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 3748; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 3749; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen 3750; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 3751; GFX7-NEXT: s_cbranch_execnz .LBB12_1 3752; GFX7-NEXT: ; %bb.2: 3753; GFX7-NEXT: s_mov_b64 exec, s[6:7] 3754; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5 3755; GFX7-NEXT: s_mov_b64 s[6:7], 0 3756; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4 3757; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start 3758; GFX7-NEXT: ; =>This Loop Header: Depth=1 3759; GFX7-NEXT: ; Child Loop BB12_4 Depth 2 3760; GFX7-NEXT: s_waitcnt vmcnt(0) 3761; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 3762; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 3763; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 3764; GFX7-NEXT: s_mov_b64 s[12:13], exec 3765; GFX7-NEXT: v_min_f32_e32 v4, v4, v10 3766; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 3767; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 3768; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 3769; GFX7-NEXT: v_mov_b32_e32 v4, v5 3770; GFX7-NEXT: v_mov_b32_e32 v5, v6 3771; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 3772; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 3773; GFX7-NEXT: v_readfirstlane_b32 s8, v0 3774; GFX7-NEXT: v_readfirstlane_b32 s9, v1 3775; GFX7-NEXT: v_readfirstlane_b32 s10, v2 3776; GFX7-NEXT: v_readfirstlane_b32 s11, v3 3777; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 3778; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 3779; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 3780; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 3781; GFX7-NEXT: s_waitcnt vmcnt(0) 3782; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 3783; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 3784; GFX7-NEXT: s_cbranch_execnz .LBB12_4 3785; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 3786; GFX7-NEXT: 
s_mov_b64 exec, s[12:13] 3787; GFX7-NEXT: s_waitcnt vmcnt(0) 3788; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 3789; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3790; GFX7-NEXT: v_mov_b32_e32 v6, v4 3791; GFX7-NEXT: buffer_wbinvl1 3792; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 3793; GFX7-NEXT: s_cbranch_execnz .LBB12_3 3794; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end 3795; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 3796; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 3797; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 3798; GFX7-NEXT: s_setpc_b64 s[30:31] 3799; 3800; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory: 3801; GFX6: ; %bb.0: 3802; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3803; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 3804; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 3805; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 3806; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 3807; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 3808; GFX6-NEXT: v_not_b32_e32 v9, v4 3809; GFX6-NEXT: s_mov_b64 s[6:7], exec 3810; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 3811; GFX6-NEXT: v_readfirstlane_b32 s8, v0 3812; GFX6-NEXT: v_readfirstlane_b32 s9, v1 3813; GFX6-NEXT: v_readfirstlane_b32 s10, v2 3814; GFX6-NEXT: v_readfirstlane_b32 s11, v3 3815; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 3816; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 3817; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 3818; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 3819; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen 3820; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 3821; GFX6-NEXT: s_cbranch_execnz .LBB12_1 3822; GFX6-NEXT: ; %bb.2: 3823; GFX6-NEXT: s_mov_b64 exec, s[6:7] 3824; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5 3825; GFX6-NEXT: s_mov_b64 s[6:7], 0 3826; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4 3827; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start 3828; GFX6-NEXT: ; =>This Loop Header: Depth=1 3829; GFX6-NEXT: ; Child Loop BB12_4 Depth 2 3830; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
3831; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 3832; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 3833; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 3834; GFX6-NEXT: s_mov_b64 s[12:13], exec 3835; GFX6-NEXT: v_min_f32_e32 v4, v4, v10 3836; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 3837; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 3838; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 3839; GFX6-NEXT: v_mov_b32_e32 v4, v5 3840; GFX6-NEXT: v_mov_b32_e32 v5, v6 3841; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 3842; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 3843; GFX6-NEXT: v_readfirstlane_b32 s8, v0 3844; GFX6-NEXT: v_readfirstlane_b32 s9, v1 3845; GFX6-NEXT: v_readfirstlane_b32 s10, v2 3846; GFX6-NEXT: v_readfirstlane_b32 s11, v3 3847; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 3848; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 3849; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 3850; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 3851; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 3852; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 3853; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 3854; GFX6-NEXT: s_cbranch_execnz .LBB12_4 3855; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1 3856; GFX6-NEXT: s_mov_b64 exec, s[12:13] 3857; GFX6-NEXT: s_waitcnt vmcnt(0) 3858; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 3859; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3860; GFX6-NEXT: v_mov_b32_e32 v6, v4 3861; GFX6-NEXT: buffer_wbinvl1 3862; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] 3863; GFX6-NEXT: s_cbranch_execnz .LBB12_3 3864; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end 3865; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 3866; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 3867; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 3868; GFX6-NEXT: s_waitcnt expcnt(0) 3869; GFX6-NEXT: s_setpc_b64 s[30:31] 3870 %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256 3871 %result = atomicrmw fmin ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 3872 ret half %result 3873} 3874 3875; 
-------------------------------------------------------------------- 3876; bfloat 3877; -------------------------------------------------------------------- 3878 3879define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 { 3880; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: 3881; GFX12: ; %bb.0: 3882; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3883; GFX12-NEXT: s_wait_expcnt 0x0 3884; GFX12-NEXT: s_wait_samplecnt 0x0 3885; GFX12-NEXT: s_wait_bvhcnt 0x0 3886; GFX12-NEXT: s_wait_kmcnt 0x0 3887; GFX12-NEXT: s_addk_co_i32 s16, 0x200 3888; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0 3889; GFX12-NEXT: s_wait_alu 0xfffe 3890; GFX12-NEXT: s_and_b32 s4, s16, -4 3891; GFX12-NEXT: s_wait_alu 0xfffe 3892; GFX12-NEXT: v_mov_b32_e32 v4, s4 3893; GFX12-NEXT: s_and_b32 s4, s16, 3 3894; GFX12-NEXT: s_wait_alu 0xfffe 3895; GFX12-NEXT: s_lshl_b32 s4, s4, 3 3896; GFX12-NEXT: s_wait_alu 0xfffe 3897; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4 3898; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen 3899; GFX12-NEXT: s_wait_alu 0xfffe 3900; GFX12-NEXT: s_not_b32 s6, s5 3901; GFX12-NEXT: s_mov_b32 s5, 0 3902; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start 3903; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 3904; GFX12-NEXT: s_wait_loadcnt 0x0 3905; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1 3906; GFX12-NEXT: s_wait_storecnt 0x0 3907; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3908; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0 3909; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v5 3910; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 3911; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1 3912; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0 3913; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 3914; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 3915; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
3916; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo 3917; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0 3918; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 3919; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0 3920; GFX12-NEXT: s_wait_alu 0xfffe 3921; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0 3922; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3923; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 3924; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN 3925; GFX12-NEXT: s_wait_loadcnt 0x0 3926; GFX12-NEXT: global_inv scope:SCOPE_DEV 3927; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 3928; GFX12-NEXT: v_mov_b32_e32 v1, v2 3929; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 3930; GFX12-NEXT: s_wait_alu 0xfffe 3931; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 3932; GFX12-NEXT: s_cbranch_execnz .LBB13_1 3933; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 3934; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 3935; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2 3936; GFX12-NEXT: s_wait_alu 0xfffe 3937; GFX12-NEXT: s_setpc_b64 s[30:31] 3938; 3939; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: 3940; GFX940: ; %bb.0: 3941; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3942; GFX940-NEXT: s_addk_i32 s16, 0x200 3943; GFX940-NEXT: s_and_b32 s4, s16, -4 3944; GFX940-NEXT: v_mov_b32_e32 v4, s4 3945; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen 3946; GFX940-NEXT: s_and_b32 s4, s16, 3 3947; GFX940-NEXT: s_lshl_b32 s6, s4, 3 3948; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6 3949; GFX940-NEXT: s_not_b32 s7, s4 3950; GFX940-NEXT: s_mov_b64 s[4:5], 0 3951; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0 3952; GFX940-NEXT: s_movk_i32 s8, 0x7fff 3953; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start 3954; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 3955; GFX940-NEXT: s_waitcnt vmcnt(0) 3956; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD 3957; GFX940-NEXT: buffer_wbl2 sc1 3958; GFX940-NEXT: v_min_f32_e32 v0, v0, v5 3959; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 3960; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 3961; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8 3962; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 3963; GFX940-NEXT: s_nop 1 3964; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 3965; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3966; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0 3967; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1] 3968; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0 3969; GFX940-NEXT: s_waitcnt vmcnt(0) 3970; GFX940-NEXT: buffer_inv sc1 3971; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 3972; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 3973; GFX940-NEXT: v_mov_b32_e32 v1, v2 3974; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] 3975; GFX940-NEXT: s_cbranch_execnz .LBB13_1 3976; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 3977; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] 3978; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2 3979; GFX940-NEXT: s_setpc_b64 s[30:31] 3980; 3981; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: 3982; GFX11: ; %bb.0: 3983; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3984; GFX11-NEXT: s_addk_i32 s16, 0x200 3985; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 3986; GFX11-NEXT: s_and_b32 s4, s16, -4 3987; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 3988; GFX11-NEXT: v_mov_b32_e32 v4, s4 3989; GFX11-NEXT: s_and_b32 s4, s16, 3 3990; GFX11-NEXT: s_lshl_b32 s4, s4, 3 3991; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3992; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 3993; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen 3994; GFX11-NEXT: s_not_b32 s6, s5 3995; GFX11-NEXT: s_mov_b32 s5, 0 3996; GFX11-NEXT: .p2align 6 3997; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start 3998; GFX11-NEXT: ; =>This Inner Loop Header: 
Depth=1 3999; GFX11-NEXT: s_waitcnt vmcnt(0) 4000; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1 4001; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4002; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4003; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 4004; GFX11-NEXT: v_min_f32_e32 v0, v0, v5 4005; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 4006; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1 4007; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0 4008; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 4009; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 4010; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4011; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo 4012; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 4013; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4014; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0 4015; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0 4016; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4017; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 4018; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc 4019; GFX11-NEXT: s_waitcnt vmcnt(0) 4020; GFX11-NEXT: buffer_gl1_inv 4021; GFX11-NEXT: buffer_gl0_inv 4022; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 4023; GFX11-NEXT: v_mov_b32_e32 v1, v2 4024; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 4025; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4026; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 4027; GFX11-NEXT: s_cbranch_execnz .LBB13_1 4028; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 4029; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 4030; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2 4031; GFX11-NEXT: s_setpc_b64 s[30:31] 4032; 4033; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: 4034; GFX10: ; %bb.0: 4035; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4036; GFX10-NEXT: s_addk_i32 s20, 0x200 4037; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 4038; 
GFX10-NEXT: s_and_b32 s4, s20, -4 4039; GFX10-NEXT: v_mov_b32_e32 v4, s4 4040; GFX10-NEXT: s_and_b32 s4, s20, 3 4041; GFX10-NEXT: s_lshl_b32 s4, s4, 3 4042; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 4043; GFX10-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 4044; GFX10-NEXT: s_not_b32 s6, s5 4045; GFX10-NEXT: s_mov_b32 s5, 0 4046; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start 4047; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 4048; GFX10-NEXT: s_waitcnt vmcnt(0) 4049; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 4050; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4051; GFX10-NEXT: v_min_f32_e32 v0, v0, v5 4052; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 4053; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0 4054; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 4055; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 4056; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo 4057; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 4058; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0 4059; GFX10-NEXT: v_mov_b32_e32 v3, v1 4060; GFX10-NEXT: v_mov_b32_e32 v2, v0 4061; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 4062; GFX10-NEXT: s_waitcnt vmcnt(0) 4063; GFX10-NEXT: buffer_gl1_inv 4064; GFX10-NEXT: buffer_gl0_inv 4065; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 4066; GFX10-NEXT: v_mov_b32_e32 v1, v2 4067; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 4068; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 4069; GFX10-NEXT: s_cbranch_execnz .LBB13_1 4070; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 4071; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 4072; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2 4073; GFX10-NEXT: s_setpc_b64 s[30:31] 4074; 4075; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: 4076; GFX90A: ; %bb.0: 4077; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4078; GFX90A-NEXT: s_addk_i32 s20, 0x200 4079; GFX90A-NEXT: s_and_b32 s4, s20, -4 
4080; GFX90A-NEXT: v_mov_b32_e32 v4, s4 4081; GFX90A-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 4082; GFX90A-NEXT: s_and_b32 s4, s20, 3 4083; GFX90A-NEXT: s_lshl_b32 s6, s4, 3 4084; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6 4085; GFX90A-NEXT: s_not_b32 s7, s4 4086; GFX90A-NEXT: s_mov_b64 s[4:5], 0 4087; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0 4088; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 4089; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start 4090; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 4091; GFX90A-NEXT: s_waitcnt vmcnt(0) 4092; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 4093; GFX90A-NEXT: v_min_f32_e32 v0, v0, v5 4094; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1 4095; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0 4096; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s8 4097; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 4098; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 4099; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 4100; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0 4101; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] 4102; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 4103; GFX90A-NEXT: s_waitcnt vmcnt(0) 4104; GFX90A-NEXT: buffer_wbinvl1 4105; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 4106; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4107; GFX90A-NEXT: v_mov_b32_e32 v1, v2 4108; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 4109; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 4110; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 4111; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 4112; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2 4113; GFX90A-NEXT: s_setpc_b64 s[30:31] 4114; 4115; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: 4116; GFX908: ; %bb.0: 4117; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4118; GFX908-NEXT: s_addk_i32 s20, 0x200 4119; GFX908-NEXT: s_and_b32 s4, s20, -4 4120; 
GFX908-NEXT: v_mov_b32_e32 v4, s4 4121; GFX908-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 4122; GFX908-NEXT: s_and_b32 s4, s20, 3 4123; GFX908-NEXT: s_lshl_b32 s6, s4, 3 4124; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6 4125; GFX908-NEXT: s_not_b32 s7, s4 4126; GFX908-NEXT: s_mov_b64 s[4:5], 0 4127; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0 4128; GFX908-NEXT: s_movk_i32 s8, 0x7fff 4129; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start 4130; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 4131; GFX908-NEXT: s_waitcnt vmcnt(0) 4132; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 4133; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 4134; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1 4135; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0 4136; GFX908-NEXT: v_add3_u32 v2, v2, v0, s8 4137; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 4138; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 4139; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 4140; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0 4141; GFX908-NEXT: v_mov_b32_e32 v3, v1 4142; GFX908-NEXT: v_mov_b32_e32 v2, v0 4143; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 4144; GFX908-NEXT: s_waitcnt vmcnt(0) 4145; GFX908-NEXT: buffer_wbinvl1 4146; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 4147; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4148; GFX908-NEXT: v_mov_b32_e32 v1, v2 4149; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 4150; GFX908-NEXT: s_cbranch_execnz .LBB13_1 4151; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 4152; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 4153; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2 4154; GFX908-NEXT: s_setpc_b64 s[30:31] 4155; 4156; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: 4157; GFX8: ; %bb.0: 4158; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4159; GFX8-NEXT: s_addk_i32 s20, 0x200 4160; GFX8-NEXT: s_and_b32 s4, s20, -4 4161; 
GFX8-NEXT: v_mov_b32_e32 v4, s4 4162; GFX8-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 4163; GFX8-NEXT: s_and_b32 s4, s20, 3 4164; GFX8-NEXT: s_lshl_b32 s6, s4, 3 4165; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6 4166; GFX8-NEXT: s_not_b32 s7, s4 4167; GFX8-NEXT: s_mov_b64 s[4:5], 0 4168; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0 4169; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start 4170; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 4171; GFX8-NEXT: v_mov_b32_e32 v0, s6 4172; GFX8-NEXT: s_waitcnt vmcnt(0) 4173; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 4174; GFX8-NEXT: v_min_f32_e32 v3, v3, v5 4175; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 4176; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 4177; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 4178; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3 4179; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 4180; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc 4181; GFX8-NEXT: v_and_b32_e32 v2, s7, v1 4182; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 4183; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 4184; GFX8-NEXT: v_mov_b32_e32 v3, v1 4185; GFX8-NEXT: v_mov_b32_e32 v2, v0 4186; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 4187; GFX8-NEXT: s_waitcnt vmcnt(0) 4188; GFX8-NEXT: buffer_wbinvl1 4189; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 4190; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4191; GFX8-NEXT: v_mov_b32_e32 v1, v2 4192; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 4193; GFX8-NEXT: s_cbranch_execnz .LBB13_1 4194; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 4195; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4196; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2 4197; GFX8-NEXT: s_setpc_b64 s[30:31] 4198; 4199; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: 4200; GFX7: ; %bb.0: 4201; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4202; GFX7-NEXT: s_addk_i32 s20, 0x200 4203; GFX7-NEXT: s_and_b32 
s4, s20, -4 4204; GFX7-NEXT: v_mov_b32_e32 v4, s4 4205; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 4206; GFX7-NEXT: s_and_b32 s4, s20, 3 4207; GFX7-NEXT: s_lshl_b32 s6, s4, 3 4208; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6 4209; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 4210; GFX7-NEXT: s_not_b32 s7, s4 4211; GFX7-NEXT: s_mov_b64 s[4:5], 0 4212; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 4213; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start 4214; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 4215; GFX7-NEXT: s_waitcnt vmcnt(0) 4216; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1 4217; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 4218; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 4219; GFX7-NEXT: v_min_f32_e32 v0, v0, v5 4220; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 4221; GFX7-NEXT: v_and_b32_e32 v2, s7, v1 4222; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0 4223; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 4224; GFX7-NEXT: v_mov_b32_e32 v3, v1 4225; GFX7-NEXT: v_mov_b32_e32 v2, v0 4226; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc 4227; GFX7-NEXT: s_waitcnt vmcnt(0) 4228; GFX7-NEXT: buffer_wbinvl1 4229; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 4230; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 4231; GFX7-NEXT: v_mov_b32_e32 v1, v2 4232; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 4233; GFX7-NEXT: s_cbranch_execnz .LBB13_1 4234; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 4235; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 4236; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2 4237; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 4238; GFX7-NEXT: s_setpc_b64 s[30:31] 4239; 4240; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: 4241; GFX6: ; %bb.0: 4242; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4243; GFX6-NEXT: s_addk_i32 s20, 0x200 4244; GFX6-NEXT: s_and_b32 s4, s20, -4 4245; GFX6-NEXT: v_mov_b32_e32 v4, s4 4246; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen 4247; GFX6-NEXT: s_and_b32 s4, s20, 3 4248; GFX6-NEXT: s_lshl_b32 s6, s4, 3 4249; GFX6-NEXT: 
s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_min_f32_e32 v0, v0, v5
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_and_b32_e32 v2, s7, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
; GFX6-NEXT: v_mov_b32_e32 v3, v1
; GFX6-NEXT: v_mov_b32_e32 v2, v0
; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB13_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
  %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret bfloat %result
}

; No-return bf16 case: an agent-scope, seq_cst atomicrmw fmin on a buffer fat
; pointer (addrspace 7) passed inreg, whose result is discarded (ret void).
; The access is 256 bfloat elements past %ptr, which shows up as the 0x200
; added to the offset register in every expectation below.
; All nine run lines expect the operation to be expanded into a
; buffer_atomic_cmpswap loop on the 4-byte-aligned dword that contains the
; bf16: address & -4 selects the dword, (address & 3) * 8 is the bit shift and
; 0xffff shifted by that amount is the lane mask. The expectations for tonga
; and newer round the f32 minimum back to bf16 with the bfe / add 0x7fff /
; cndmask sequence, while the hawaii and tahiti expectations only shift the
; high half down.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_addk_co_i32 s16, 0x200
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_b32 s4, s16, -4
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: s_and_b32 s4, s16, 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s4, s4, 3
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_not_b32 s6, s5
; GFX12-NEXT: s_mov_b32 s5, 0
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX12-NEXT: v_mov_b32_e32 v1, v4
; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_addk_i32 s16, 0x200
; GFX940-NEXT: s_and_b32 s4, s16, -4
; GFX940-NEXT: v_mov_b32_e32 v2, s4
; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
; GFX940-NEXT: s_and_b32 s4, s16, 3
; GFX940-NEXT: s_lshl_b32 s6, s4, 3
; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX940-NEXT: s_not_b32 s7, s4
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX940-NEXT: s_movk_i32 s8, 0x7fff
; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: v_min_f32_e32 v0, v0, v3
; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX940-NEXT: s_nop 1
; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX940-NEXT: v_mov_b32_e32 v1, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX940-NEXT: s_cbranch_execnz .LBB14_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s16, 0x200
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11-NEXT: s_and_b32 s4, s16, -4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v2, s4
; GFX11-NEXT: s_and_b32 s4, s16, 3
; GFX11-NEXT: s_lshl_b32 s4, s4, 3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
; GFX11-NEXT: s_not_b32 s6, s5
; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: v_min_f32_e32 v0, v0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX11-NEXT: v_mov_b32_e32 v1, v4
; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB14_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_addk_i32 s20, 0x200
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX10-NEXT: s_and_b32 s4, s20, -4
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: s_and_b32 s4, s20, 3
; GFX10-NEXT: s_lshl_b32 s4, s4, 3
; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX10-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX10-NEXT: s_not_b32 s6, s5
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_min_f32_e32 v0, v0, v3
; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v4
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_cbranch_execnz .LBB14_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_addk_i32 s20, 0x200
; GFX90A-NEXT: s_and_b32 s4, s20, -4
; GFX90A-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX90A-NEXT: s_and_b32 s4, s20, 3
; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX90A-NEXT: s_not_b32 s7, s4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3
; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v1, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_addk_i32 s20, 0x200
; GFX908-NEXT: s_and_b32 s4, s20, -4
; GFX908-NEXT: v_mov_b32_e32 v2, s4
; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX908-NEXT: s_and_b32 s4, s20, 3
; GFX908-NEXT: s_lshl_b32 s6, s4, 3
; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX908-NEXT: s_not_b32 s7, s4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX908-NEXT: v_min_f32_e32 v0, v0, v3
; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX908-NEXT: v_add3_u32 v4, v4, v0, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX908-NEXT: v_mov_b32_e32 v5, v1
; GFX908-NEXT: v_mov_b32_e32 v4, v0
; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v1, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_addk_i32 s20, 0x200
; GFX8-NEXT: s_and_b32 s4, s20, -4
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX8-NEXT: s_and_b32 s4, s20, 3
; GFX8-NEXT: s_lshl_b32 s6, s4, 3
; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX8-NEXT: s_not_b32 s7, s4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_min_f32_e32 v5, v5, v3
; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
; GFX8-NEXT: v_and_b32_e32 v4, s7, v1
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v4, v0
; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v1, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_addk_i32 s20, 0x200
; GFX7-NEXT: s_and_b32 s4, s20, -4
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_and_b32_e32 v4, s7, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v1
; GFX7-NEXT: v_mov_b32_e32 v4, v0
; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v1, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB14_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_addk_i32 s20, 0x200
; GFX6-NEXT: s_and_b32 s4, s20, -4
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_min_f32_e32 v0, v0, v3
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-NEXT: v_mov_b32_e32 v5, v1
; GFX6-NEXT: v_mov_b32_e32 v4, v0
; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB14_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
  ; 256 bfloat elements past %ptr; the expectations above add 0x200 to the
  ; offset register before masking it down to the containing dword.
  %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
  ; The fetched value is deliberately left unused so the no-return form of the
  ; atomic is what gets selected and checked.
  %unused = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret void
}

define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 {
; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
; GFX12-NEXT: v_not_b32_e32 v9, v6
; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
;
GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 4709; GFX12-NEXT: s_wait_alu 0xfffe 4710; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4711; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 4712; GFX12-NEXT: s_wait_alu 0xfffe 4713; GFX12-NEXT: s_and_saveexec_b32 s0, s0 4714; GFX12-NEXT: s_wait_loadcnt 0x0 4715; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen 4716; GFX12-NEXT: s_wait_alu 0xfffe 4717; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 4718; GFX12-NEXT: s_cbranch_execnz .LBB15_1 4719; GFX12-NEXT: ; %bb.2: 4720; GFX12-NEXT: s_mov_b32 exec_lo, s1 4721; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5 4722; GFX12-NEXT: s_mov_b32 s1, 0 4723; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start 4724; GFX12-NEXT: ; =>This Loop Header: Depth=1 4725; GFX12-NEXT: ; Child Loop BB15_4 Depth 2 4726; GFX12-NEXT: s_wait_loadcnt 0x0 4727; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6 4728; GFX12-NEXT: s_mov_b32 s2, exec_lo 4729; GFX12-NEXT: s_wait_storecnt 0x0 4730; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4731; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4 4732; GFX12-NEXT: v_min_num_f32_e32 v4, v4, v10 4733; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 4734; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1 4735; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4 4736; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 4737; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 4738; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4739; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo 4740; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4 4741; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4742; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4 4743; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4 4744; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4745; GFX12-NEXT: v_mov_b32_e32 v4, v5 4746; GFX12-NEXT: v_mov_b32_e32 v5, v6 4747; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 4748; GFX12-NEXT: ; => This Inner Loop 
Header: Depth=2 4749; GFX12-NEXT: v_readfirstlane_b32 s4, v0 4750; GFX12-NEXT: v_readfirstlane_b32 s5, v1 4751; GFX12-NEXT: v_readfirstlane_b32 s6, v2 4752; GFX12-NEXT: v_readfirstlane_b32 s7, v3 4753; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 4754; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 4755; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 4756; GFX12-NEXT: s_wait_alu 0xfffe 4757; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4758; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 4759; GFX12-NEXT: s_wait_alu 0xfffe 4760; GFX12-NEXT: s_and_saveexec_b32 s0, s0 4761; GFX12-NEXT: s_wait_loadcnt 0x0 4762; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN 4763; GFX12-NEXT: s_wait_alu 0xfffe 4764; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 4765; GFX12-NEXT: s_cbranch_execnz .LBB15_4 4766; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 4767; GFX12-NEXT: s_mov_b32 exec_lo, s2 4768; GFX12-NEXT: s_wait_loadcnt 0x0 4769; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 4770; GFX12-NEXT: v_mov_b32_e32 v6, v4 4771; GFX12-NEXT: global_inv scope:SCOPE_DEV 4772; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 4773; GFX12-NEXT: s_wait_alu 0xfffe 4774; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 4775; GFX12-NEXT: s_cbranch_execnz .LBB15_3 4776; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end 4777; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 4778; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4 4779; GFX12-NEXT: s_wait_alu 0xfffe 4780; GFX12-NEXT: s_setpc_b64 s[30:31] 4781; 4782; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 4783; GFX940: ; %bb.0: 4784; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4785; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4 4786; GFX940-NEXT: v_and_b32_e32 v9, -4, v4 4787; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 4788; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4 4789; GFX940-NEXT: s_mov_b32 s0, 0xffff 4790; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0 
4791; GFX940-NEXT: v_not_b32_e32 v10, v4 4792; GFX940-NEXT: s_mov_b64 s[2:3], exec 4793; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 4794; GFX940-NEXT: v_readfirstlane_b32 s4, v0 4795; GFX940-NEXT: v_readfirstlane_b32 s5, v1 4796; GFX940-NEXT: v_readfirstlane_b32 s6, v2 4797; GFX940-NEXT: v_readfirstlane_b32 s7, v3 4798; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 4799; GFX940-NEXT: s_nop 0 4800; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 4801; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 4802; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 4803; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen 4804; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 4805; GFX940-NEXT: s_cbranch_execnz .LBB15_1 4806; GFX940-NEXT: ; %bb.2: 4807; GFX940-NEXT: s_mov_b64 exec, s[2:3] 4808; GFX940-NEXT: s_mov_b64 s[2:3], 0 4809; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5 4810; GFX940-NEXT: s_movk_i32 s10, 0x7fff 4811; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start 4812; GFX940-NEXT: ; =>This Loop Header: Depth=1 4813; GFX940-NEXT: ; Child Loop BB15_4 Depth 2 4814; GFX940-NEXT: s_waitcnt vmcnt(0) 4815; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 4816; GFX940-NEXT: s_mov_b64 s[8:9], exec 4817; GFX940-NEXT: v_min_f32_e32 v4, v4, v11 4818; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 4819; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 4820; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 4821; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 4822; GFX940-NEXT: buffer_wbl2 sc1 4823; GFX940-NEXT: s_nop 0 4824; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 4825; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 4826; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4 4827; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] 4828; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 4829; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 4830; GFX940-NEXT: v_readfirstlane_b32 s4, v0 4831; 
GFX940-NEXT: v_readfirstlane_b32 s5, v1 4832; GFX940-NEXT: v_readfirstlane_b32 s6, v2 4833; GFX940-NEXT: v_readfirstlane_b32 s7, v3 4834; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 4835; GFX940-NEXT: s_nop 0 4836; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 4837; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 4838; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 4839; GFX940-NEXT: s_waitcnt vmcnt(0) 4840; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0 4841; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 4842; GFX940-NEXT: s_cbranch_execnz .LBB15_4 4843; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 4844; GFX940-NEXT: s_mov_b64 exec, s[8:9] 4845; GFX940-NEXT: s_waitcnt vmcnt(0) 4846; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 4847; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 4848; GFX940-NEXT: v_mov_b32_e32 v7, v4 4849; GFX940-NEXT: buffer_inv sc1 4850; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 4851; GFX940-NEXT: s_cbranch_execnz .LBB15_3 4852; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end 4853; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 4854; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4 4855; GFX940-NEXT: s_setpc_b64 s[30:31] 4856; 4857; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 4858; GFX11: ; %bb.0: 4859; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4860; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 4861; GFX11-NEXT: s_mov_b32 s1, 0 4862; GFX11-NEXT: s_mov_b32 s2, exec_lo 4863; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 4864; GFX11-NEXT: v_and_b32_e32 v6, 3, v4 4865; GFX11-NEXT: v_and_b32_e32 v8, -4, v4 4866; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6 4867; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4868; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff 4869; GFX11-NEXT: v_not_b32_e32 v9, v6 4870; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 4871; GFX11-NEXT: v_readfirstlane_b32 s4, v0 
4872; GFX11-NEXT: v_readfirstlane_b32 s5, v1 4873; GFX11-NEXT: v_readfirstlane_b32 s6, v2 4874; GFX11-NEXT: v_readfirstlane_b32 s7, v3 4875; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 4876; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 4877; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 4878; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 4879; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 4880; GFX11-NEXT: s_and_saveexec_b32 s0, s0 4881; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen 4882; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 4883; GFX11-NEXT: s_cbranch_execnz .LBB15_1 4884; GFX11-NEXT: ; %bb.2: 4885; GFX11-NEXT: s_mov_b32 exec_lo, s2 4886; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5 4887; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 4888; GFX11-NEXT: .p2align 6 4889; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start 4890; GFX11-NEXT: ; =>This Loop Header: Depth=1 4891; GFX11-NEXT: ; Child Loop BB15_4 Depth 2 4892; GFX11-NEXT: s_waitcnt vmcnt(0) 4893; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6 4894; GFX11-NEXT: s_mov_b32 s2, exec_lo 4895; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4896; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4897; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 4898; GFX11-NEXT: v_min_f32_e32 v4, v4, v10 4899; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 4900; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1 4901; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 4902; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 4903; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 4904; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4905; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo 4906; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 4907; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 4908; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4 4909; GFX11-NEXT: v_and_or_b32 v5, v6, v9, 
v4 4910; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4911; GFX11-NEXT: v_mov_b32_e32 v4, v5 4912; GFX11-NEXT: v_mov_b32_e32 v5, v6 4913; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 4914; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 4915; GFX11-NEXT: v_readfirstlane_b32 s4, v0 4916; GFX11-NEXT: v_readfirstlane_b32 s5, v1 4917; GFX11-NEXT: v_readfirstlane_b32 s6, v2 4918; GFX11-NEXT: v_readfirstlane_b32 s7, v3 4919; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 4920; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 4921; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 4922; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 4923; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 4924; GFX11-NEXT: s_and_saveexec_b32 s0, s0 4925; GFX11-NEXT: s_waitcnt vmcnt(0) 4926; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc 4927; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 4928; GFX11-NEXT: s_cbranch_execnz .LBB15_4 4929; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 4930; GFX11-NEXT: s_mov_b32 exec_lo, s2 4931; GFX11-NEXT: s_waitcnt vmcnt(0) 4932; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 4933; GFX11-NEXT: v_mov_b32_e32 v6, v4 4934; GFX11-NEXT: buffer_gl1_inv 4935; GFX11-NEXT: buffer_gl0_inv 4936; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 4937; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4938; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 4939; GFX11-NEXT: s_cbranch_execnz .LBB15_3 4940; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end 4941; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 4942; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 4943; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4 4944; GFX11-NEXT: s_setpc_b64 s[30:31] 4945; 4946; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 4947; GFX10: ; %bb.0: 4948; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4949; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 4950; GFX10-NEXT: s_mov_b32 s5, 
0 4951; GFX10-NEXT: s_mov_b32 s6, exec_lo 4952; GFX10-NEXT: v_and_b32_e32 v6, 3, v4 4953; GFX10-NEXT: v_and_b32_e32 v8, -4, v4 4954; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6 4955; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff 4956; GFX10-NEXT: v_not_b32_e32 v9, v6 4957; GFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 4958; GFX10-NEXT: v_readfirstlane_b32 s8, v0 4959; GFX10-NEXT: v_readfirstlane_b32 s9, v1 4960; GFX10-NEXT: v_readfirstlane_b32 s10, v2 4961; GFX10-NEXT: v_readfirstlane_b32 s11, v3 4962; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 4963; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 4964; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 4965; GFX10-NEXT: s_and_saveexec_b32 s4, s4 4966; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen 4967; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4968; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 4969; GFX10-NEXT: s_cbranch_execnz .LBB15_1 4970; GFX10-NEXT: ; %bb.2: 4971; GFX10-NEXT: s_mov_b32 exec_lo, s6 4972; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5 4973; GFX10-NEXT: .LBB15_3: ; %atomicrmw.start 4974; GFX10-NEXT: ; =>This Loop Header: Depth=1 4975; GFX10-NEXT: ; Child Loop BB15_4 Depth 2 4976; GFX10-NEXT: s_waitcnt vmcnt(0) 4977; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 4978; GFX10-NEXT: s_mov_b32 s6, exec_lo 4979; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4980; GFX10-NEXT: v_min_f32_e32 v4, v4, v10 4981; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 4982; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4 4983; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 4984; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff 4985; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo 4986; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 4987; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4 4988; GFX10-NEXT: v_mov_b32_e32 v4, v5 4989; GFX10-NEXT: v_mov_b32_e32 v5, v6 4990; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 4991; GFX10-NEXT: 
; => This Inner Loop Header: Depth=2 4992; GFX10-NEXT: v_readfirstlane_b32 s8, v0 4993; GFX10-NEXT: v_readfirstlane_b32 s9, v1 4994; GFX10-NEXT: v_readfirstlane_b32 s10, v2 4995; GFX10-NEXT: v_readfirstlane_b32 s11, v3 4996; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 4997; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 4998; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 4999; GFX10-NEXT: s_and_saveexec_b32 s4, s4 5000; GFX10-NEXT: s_waitcnt vmcnt(0) 5001; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 5002; GFX10-NEXT: s_waitcnt_depctr 0xffe3 5003; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 5004; GFX10-NEXT: s_cbranch_execnz .LBB15_4 5005; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 5006; GFX10-NEXT: s_mov_b32 exec_lo, s6 5007; GFX10-NEXT: s_waitcnt vmcnt(0) 5008; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 5009; GFX10-NEXT: v_mov_b32_e32 v6, v4 5010; GFX10-NEXT: buffer_gl1_inv 5011; GFX10-NEXT: buffer_gl0_inv 5012; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 5013; GFX10-NEXT: s_waitcnt_depctr 0xffe3 5014; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 5015; GFX10-NEXT: s_cbranch_execnz .LBB15_3 5016; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end 5017; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 5018; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4 5019; GFX10-NEXT: s_setpc_b64 s[30:31] 5020; 5021; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 5022; GFX90A: ; %bb.0: 5023; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5024; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4 5025; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4 5026; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 5027; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4 5028; GFX90A-NEXT: s_mov_b32 s4, 0xffff 5029; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4 5030; GFX90A-NEXT: v_not_b32_e32 v10, v4 5031; GFX90A-NEXT: s_mov_b64 s[6:7], exec 5032; GFX90A-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 5033; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 5034; GFX90A-NEXT: 
v_readfirstlane_b32 s9, v1 5035; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 5036; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 5037; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 5038; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 5039; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 5040; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 5041; GFX90A-NEXT: s_nop 0 5042; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen 5043; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 5044; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 5045; GFX90A-NEXT: ; %bb.2: 5046; GFX90A-NEXT: s_mov_b64 exec, s[6:7] 5047; GFX90A-NEXT: s_mov_b64 s[6:7], 0 5048; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5 5049; GFX90A-NEXT: s_movk_i32 s14, 0x7fff 5050; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.start 5051; GFX90A-NEXT: ; =>This Loop Header: Depth=1 5052; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2 5053; GFX90A-NEXT: s_waitcnt vmcnt(0) 5054; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5055; GFX90A-NEXT: v_min_f32_e32 v4, v4, v11 5056; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 5057; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 5058; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 5059; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 5060; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 5061; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 5062; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4 5063; GFX90A-NEXT: s_mov_b64 s[12:13], exec 5064; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] 5065; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 5066; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 5067; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 5068; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 5069; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 5070; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 5071; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 5072; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 
5073; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 5074; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 5075; GFX90A-NEXT: s_waitcnt vmcnt(0) 5076; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc 5077; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 5078; GFX90A-NEXT: s_cbranch_execnz .LBB15_4 5079; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 5080; GFX90A-NEXT: s_mov_b64 exec, s[12:13] 5081; GFX90A-NEXT: s_waitcnt vmcnt(0) 5082; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 5083; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5084; GFX90A-NEXT: v_mov_b32_e32 v7, v4 5085; GFX90A-NEXT: buffer_wbinvl1 5086; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 5087; GFX90A-NEXT: s_cbranch_execnz .LBB15_3 5088; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end 5089; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 5090; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4 5091; GFX90A-NEXT: s_setpc_b64 s[30:31] 5092; 5093; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 5094; GFX908: ; %bb.0: 5095; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5096; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4 5097; GFX908-NEXT: v_and_b32_e32 v8, -4, v4 5098; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 5099; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4 5100; GFX908-NEXT: s_mov_b32 s4, 0xffff 5101; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4 5102; GFX908-NEXT: v_not_b32_e32 v9, v4 5103; GFX908-NEXT: s_mov_b64 s[6:7], exec 5104; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 5105; GFX908-NEXT: v_readfirstlane_b32 s8, v0 5106; GFX908-NEXT: v_readfirstlane_b32 s9, v1 5107; GFX908-NEXT: v_readfirstlane_b32 s10, v2 5108; GFX908-NEXT: v_readfirstlane_b32 s11, v3 5109; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 5110; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 5111; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 5112; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 5113; GFX908-NEXT: s_nop 0 5114; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 
offen 5115; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 5116; GFX908-NEXT: s_cbranch_execnz .LBB15_1 5117; GFX908-NEXT: ; %bb.2: 5118; GFX908-NEXT: s_mov_b64 exec, s[6:7] 5119; GFX908-NEXT: s_mov_b64 s[6:7], 0 5120; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5 5121; GFX908-NEXT: s_movk_i32 s14, 0x7fff 5122; GFX908-NEXT: .LBB15_3: ; %atomicrmw.start 5123; GFX908-NEXT: ; =>This Loop Header: Depth=1 5124; GFX908-NEXT: ; Child Loop BB15_4 Depth 2 5125; GFX908-NEXT: s_waitcnt vmcnt(0) 5126; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5127; GFX908-NEXT: v_min_f32_e32 v4, v4, v10 5128; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 5129; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 5130; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v4 5131; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 5132; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc 5133; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 5134; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4 5135; GFX908-NEXT: v_mov_b32_e32 v4, v5 5136; GFX908-NEXT: s_mov_b64 s[12:13], exec 5137; GFX908-NEXT: v_mov_b32_e32 v5, v6 5138; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 5139; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 5140; GFX908-NEXT: v_readfirstlane_b32 s8, v0 5141; GFX908-NEXT: v_readfirstlane_b32 s9, v1 5142; GFX908-NEXT: v_readfirstlane_b32 s10, v2 5143; GFX908-NEXT: v_readfirstlane_b32 s11, v3 5144; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 5145; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 5146; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 5147; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 5148; GFX908-NEXT: s_waitcnt vmcnt(0) 5149; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 5150; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 5151; GFX908-NEXT: s_cbranch_execnz .LBB15_4 5152; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 5153; GFX908-NEXT: s_mov_b64 exec, s[12:13] 
5154; GFX908-NEXT: s_waitcnt vmcnt(0) 5155; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 5156; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5157; GFX908-NEXT: v_mov_b32_e32 v6, v4 5158; GFX908-NEXT: buffer_wbinvl1 5159; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 5160; GFX908-NEXT: s_cbranch_execnz .LBB15_3 5161; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end 5162; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 5163; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4 5164; GFX908-NEXT: s_setpc_b64 s[30:31] 5165; 5166; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 5167; GFX8: ; %bb.0: 5168; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5169; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4 5170; GFX8-NEXT: v_and_b32_e32 v8, -4, v4 5171; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 5172; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4 5173; GFX8-NEXT: s_mov_b32 s4, 0xffff 5174; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4 5175; GFX8-NEXT: v_not_b32_e32 v9, v4 5176; GFX8-NEXT: s_mov_b64 s[6:7], exec 5177; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 5178; GFX8-NEXT: v_readfirstlane_b32 s8, v0 5179; GFX8-NEXT: v_readfirstlane_b32 s9, v1 5180; GFX8-NEXT: v_readfirstlane_b32 s10, v2 5181; GFX8-NEXT: v_readfirstlane_b32 s11, v3 5182; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 5183; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 5184; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 5185; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 5186; GFX8-NEXT: s_nop 0 5187; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen 5188; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 5189; GFX8-NEXT: s_cbranch_execnz .LBB15_1 5190; GFX8-NEXT: ; %bb.2: 5191; GFX8-NEXT: s_mov_b64 exec, s[6:7] 5192; GFX8-NEXT: s_mov_b64 s[6:7], 0 5193; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 5194; GFX8-NEXT: .LBB15_3: ; %atomicrmw.start 5195; GFX8-NEXT: ; =>This Loop Header: Depth=1 5196; GFX8-NEXT: ; Child Loop BB15_4 Depth 2 5197; GFX8-NEXT: s_waitcnt vmcnt(0) 5198; GFX8-NEXT: 
v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5199; GFX8-NEXT: v_min_f32_e32 v4, v4, v10 5200; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 5201; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 5202; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 5203; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v4 5204; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 5205; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc 5206; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 5207; GFX8-NEXT: v_and_b32_e32 v5, v6, v9 5208; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 5209; GFX8-NEXT: v_mov_b32_e32 v4, v5 5210; GFX8-NEXT: s_mov_b64 s[12:13], exec 5211; GFX8-NEXT: v_mov_b32_e32 v5, v6 5212; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 5213; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 5214; GFX8-NEXT: v_readfirstlane_b32 s8, v0 5215; GFX8-NEXT: v_readfirstlane_b32 s9, v1 5216; GFX8-NEXT: v_readfirstlane_b32 s10, v2 5217; GFX8-NEXT: v_readfirstlane_b32 s11, v3 5218; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 5219; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 5220; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 5221; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 5222; GFX8-NEXT: s_waitcnt vmcnt(0) 5223; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 5224; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 5225; GFX8-NEXT: s_cbranch_execnz .LBB15_4 5226; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 5227; GFX8-NEXT: s_mov_b64 exec, s[12:13] 5228; GFX8-NEXT: s_waitcnt vmcnt(0) 5229; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 5230; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5231; GFX8-NEXT: v_mov_b32_e32 v6, v4 5232; GFX8-NEXT: buffer_wbinvl1 5233; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 5234; GFX8-NEXT: s_cbranch_execnz .LBB15_3 5235; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end 5236; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 5237; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4 5238; GFX8-NEXT: s_setpc_b64 s[30:31] 
5239; 5240; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 5241; GFX7: ; %bb.0: 5242; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5243; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 5244; GFX7-NEXT: v_and_b32_e32 v8, -4, v4 5245; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 5246; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4 5247; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 5248; GFX7-NEXT: v_not_b32_e32 v9, v4 5249; GFX7-NEXT: s_mov_b64 s[6:7], exec 5250; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 5251; GFX7-NEXT: v_readfirstlane_b32 s8, v0 5252; GFX7-NEXT: v_readfirstlane_b32 s9, v1 5253; GFX7-NEXT: v_readfirstlane_b32 s10, v2 5254; GFX7-NEXT: v_readfirstlane_b32 s11, v3 5255; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 5256; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 5257; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 5258; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 5259; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen 5260; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 5261; GFX7-NEXT: s_cbranch_execnz .LBB15_1 5262; GFX7-NEXT: ; %bb.2: 5263; GFX7-NEXT: s_mov_b64 exec, s[6:7] 5264; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5 5265; GFX7-NEXT: s_mov_b64 s[6:7], 0 5266; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 5267; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start 5268; GFX7-NEXT: ; =>This Loop Header: Depth=1 5269; GFX7-NEXT: ; Child Loop BB15_4 Depth 2 5270; GFX7-NEXT: s_waitcnt vmcnt(0) 5271; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6 5272; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 5273; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 5274; GFX7-NEXT: v_min_f32_e32 v4, v4, v10 5275; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 5276; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4 5277; GFX7-NEXT: v_and_b32_e32 v5, v6, v9 5278; GFX7-NEXT: v_or_b32_e32 v5, v5, v4 5279; GFX7-NEXT: v_mov_b32_e32 v4, v5 5280; GFX7-NEXT: s_mov_b64 s[12:13], exec 5281; GFX7-NEXT: v_mov_b32_e32 v5, v6 5282; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 
5283; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 5284; GFX7-NEXT: v_readfirstlane_b32 s8, v0 5285; GFX7-NEXT: v_readfirstlane_b32 s9, v1 5286; GFX7-NEXT: v_readfirstlane_b32 s10, v2 5287; GFX7-NEXT: v_readfirstlane_b32 s11, v3 5288; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 5289; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 5290; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 5291; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 5292; GFX7-NEXT: s_waitcnt vmcnt(0) 5293; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 5294; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 5295; GFX7-NEXT: s_cbranch_execnz .LBB15_4 5296; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 5297; GFX7-NEXT: s_mov_b64 exec, s[12:13] 5298; GFX7-NEXT: s_waitcnt vmcnt(0) 5299; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 5300; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5301; GFX7-NEXT: v_mov_b32_e32 v6, v4 5302; GFX7-NEXT: buffer_wbinvl1 5303; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 5304; GFX7-NEXT: s_cbranch_execnz .LBB15_3 5305; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end 5306; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 5307; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4 5308; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 5309; GFX7-NEXT: s_setpc_b64 s[30:31] 5310; 5311; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 5312; GFX6: ; %bb.0: 5313; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5314; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4 5315; GFX6-NEXT: v_and_b32_e32 v8, -4, v4 5316; GFX6-NEXT: v_and_b32_e32 v4, 3, v4 5317; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4 5318; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7 5319; GFX6-NEXT: v_not_b32_e32 v9, v4 5320; GFX6-NEXT: s_mov_b64 s[6:7], exec 5321; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 5322; GFX6-NEXT: v_readfirstlane_b32 s8, v0 5323; GFX6-NEXT: v_readfirstlane_b32 s9, v1 5324; GFX6-NEXT: v_readfirstlane_b32 s10, v2 5325; GFX6-NEXT: v_readfirstlane_b32 s11, v3 5326; 
GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 5327; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 5328; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 5329; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 5330; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen 5331; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 5332; GFX6-NEXT: s_cbranch_execnz .LBB15_1 5333; GFX6-NEXT: ; %bb.2: 5334; GFX6-NEXT: s_mov_b64 exec, s[6:7] 5335; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5 5336; GFX6-NEXT: s_mov_b64 s[6:7], 0 5337; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 5338; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start 5339; GFX6-NEXT: ; =>This Loop Header: Depth=1 5340; GFX6-NEXT: ; Child Loop BB15_4 Depth 2 5341; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5342; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6 5343; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 5344; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 5345; GFX6-NEXT: v_min_f32_e32 v4, v4, v10 5346; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 5347; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4 5348; GFX6-NEXT: v_and_b32_e32 v5, v6, v9 5349; GFX6-NEXT: v_or_b32_e32 v5, v5, v4 5350; GFX6-NEXT: v_mov_b32_e32 v4, v5 5351; GFX6-NEXT: s_mov_b64 s[12:13], exec 5352; GFX6-NEXT: v_mov_b32_e32 v5, v6 5353; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 5354; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 5355; GFX6-NEXT: v_readfirstlane_b32 s8, v0 5356; GFX6-NEXT: v_readfirstlane_b32 s9, v1 5357; GFX6-NEXT: v_readfirstlane_b32 s10, v2 5358; GFX6-NEXT: v_readfirstlane_b32 s11, v3 5359; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 5360; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 5361; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 5362; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 5363; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5364; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 5365; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 5366; GFX6-NEXT: s_cbranch_execnz .LBB15_4 5367; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 5368; GFX6-NEXT: s_mov_b64 
exec, s[12:13] 5369; GFX6-NEXT: s_waitcnt vmcnt(0) 5370; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 5371; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5372; GFX6-NEXT: v_mov_b32_e32 v6, v4 5373; GFX6-NEXT: buffer_wbinvl1 5374; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] 5375; GFX6-NEXT: s_cbranch_execnz .LBB15_3 5376; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end 5377; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 5378; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4 5379; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 5380; GFX6-NEXT: s_waitcnt expcnt(0) 5381; GFX6-NEXT: s_setpc_b64 s[30:31] 5382 %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256 5383 %result = atomicrmw fmin ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 5384 ret bfloat %result 5385} 5386 5387; -------------------------------------------------------------------- 5388; <2 x half> 5389; -------------------------------------------------------------------- 5390 5391define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { 5392; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 5393; GFX12: ; %bb.0: 5394; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5395; GFX12-NEXT: s_wait_expcnt 0x0 5396; GFX12-NEXT: s_wait_samplecnt 0x0 5397; GFX12-NEXT: s_wait_bvhcnt 0x0 5398; GFX12-NEXT: s_wait_kmcnt 0x0 5399; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 5400; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 5401; GFX12-NEXT: s_wait_alu 0xfffe 5402; GFX12-NEXT: v_mov_b32_e32 v3, s4 5403; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) 5404; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 5405; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 5406; GFX12-NEXT: s_mov_b32 s4, 0 5407; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start 5408; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 5409; GFX12-NEXT: s_wait_loadcnt 0x0 5410; GFX12-NEXT: v_mov_b32_e32 v5, v0 5411; 
GFX12-NEXT: s_wait_storecnt 0x0 5412; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5413; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5 5414; GFX12-NEXT: v_pk_min_num_f16 v4, v0, v2 5415; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 5416; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 5417; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN 5418; GFX12-NEXT: s_wait_loadcnt 0x0 5419; GFX12-NEXT: global_inv scope:SCOPE_DEV 5420; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 5421; GFX12-NEXT: s_wait_alu 0xfffe 5422; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 5423; GFX12-NEXT: s_wait_alu 0xfffe 5424; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 5425; GFX12-NEXT: s_cbranch_execnz .LBB16_1 5426; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 5427; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 5428; GFX12-NEXT: s_wait_alu 0xfffe 5429; GFX12-NEXT: s_setpc_b64 s[30:31] 5430; 5431; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 5432; GFX940: ; %bb.0: 5433; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5434; GFX940-NEXT: v_mov_b32_e32 v1, v0 5435; GFX940-NEXT: v_mov_b32_e32 v0, s16 5436; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 5437; GFX940-NEXT: s_add_i32 s6, s16, 0x400 5438; GFX940-NEXT: s_mov_b64 s[4:5], 0 5439; GFX940-NEXT: v_pk_max_f16 v2, v1, v1 5440; GFX940-NEXT: v_mov_b32_e32 v3, s6 5441; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start 5442; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 5443; GFX940-NEXT: s_waitcnt vmcnt(0) 5444; GFX940-NEXT: v_mov_b32_e32 v5, v0 5445; GFX940-NEXT: v_pk_max_f16 v0, v5, v5 5446; GFX940-NEXT: buffer_wbl2 sc1 5447; GFX940-NEXT: v_pk_min_f16 v4, v0, v2 5448; GFX940-NEXT: s_nop 0 5449; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] 5450; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 5451; GFX940-NEXT: s_waitcnt vmcnt(0) 5452; GFX940-NEXT: buffer_inv sc1 5453; GFX940-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v5 5454; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5455; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] 5456; GFX940-NEXT: s_cbranch_execnz .LBB16_1 5457; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 5458; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] 5459; GFX940-NEXT: s_setpc_b64 s[30:31] 5460; 5461; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 5462; GFX11: ; %bb.0: 5463; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5464; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 5465; GFX11-NEXT: s_add_i32 s4, s16, 0x400 5466; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) 5467; GFX11-NEXT: v_mov_b32_e32 v3, s4 5468; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 5469; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 5470; GFX11-NEXT: s_mov_b32 s4, 0 5471; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start 5472; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 5473; GFX11-NEXT: s_waitcnt vmcnt(0) 5474; GFX11-NEXT: v_mov_b32_e32 v5, v0 5475; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5476; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5477; GFX11-NEXT: v_pk_max_f16 v0, v5, v5 5478; GFX11-NEXT: v_pk_min_f16 v4, v0, v2 5479; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 5480; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 5481; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc 5482; GFX11-NEXT: s_waitcnt vmcnt(0) 5483; GFX11-NEXT: buffer_gl1_inv 5484; GFX11-NEXT: buffer_gl0_inv 5485; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 5486; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 5487; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5488; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 5489; GFX11-NEXT: s_cbranch_execnz .LBB16_1 5490; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 5491; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 5492; GFX11-NEXT: s_setpc_b64 s[30:31] 5493; 5494; GFX10-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 5495; GFX10: ; %bb.0: 5496; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5497; GFX10-NEXT: v_mov_b32_e32 v1, v0 5498; GFX10-NEXT: v_mov_b32_e32 v0, s20 5499; GFX10-NEXT: s_add_i32 s4, s20, 0x400 5500; GFX10-NEXT: v_mov_b32_e32 v3, s4 5501; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 5502; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 5503; GFX10-NEXT: s_mov_b32 s4, 0 5504; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start 5505; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 5506; GFX10-NEXT: s_waitcnt vmcnt(0) 5507; GFX10-NEXT: v_mov_b32_e32 v5, v0 5508; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5509; GFX10-NEXT: v_pk_max_f16 v0, v5, v5 5510; GFX10-NEXT: v_pk_min_f16 v4, v0, v2 5511; GFX10-NEXT: v_mov_b32_e32 v0, v4 5512; GFX10-NEXT: v_mov_b32_e32 v1, v5 5513; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 5514; GFX10-NEXT: s_waitcnt vmcnt(0) 5515; GFX10-NEXT: buffer_gl1_inv 5516; GFX10-NEXT: buffer_gl0_inv 5517; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 5518; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 5519; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 5520; GFX10-NEXT: s_cbranch_execnz .LBB16_1 5521; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 5522; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 5523; GFX10-NEXT: s_setpc_b64 s[30:31] 5524; 5525; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 5526; GFX90A: ; %bb.0: 5527; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5528; GFX90A-NEXT: v_mov_b32_e32 v1, v0 5529; GFX90A-NEXT: v_mov_b32_e32 v0, s20 5530; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 5531; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 5532; GFX90A-NEXT: s_mov_b64 s[4:5], 0 5533; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 5534; GFX90A-NEXT: v_mov_b32_e32 v3, s6 5535; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start 5536; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 5537; GFX90A-NEXT: s_waitcnt 
vmcnt(0) 5538; GFX90A-NEXT: v_mov_b32_e32 v5, v0 5539; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 5540; GFX90A-NEXT: v_pk_min_f16 v4, v0, v2 5541; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] 5542; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 5543; GFX90A-NEXT: s_waitcnt vmcnt(0) 5544; GFX90A-NEXT: buffer_wbinvl1 5545; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 5546; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5547; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 5548; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 5549; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 5550; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5551; GFX90A-NEXT: s_setpc_b64 s[30:31] 5552; 5553; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 5554; GFX908: ; %bb.0: 5555; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5556; GFX908-NEXT: v_mov_b32_e32 v1, v0 5557; GFX908-NEXT: v_mov_b32_e32 v0, s20 5558; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 5559; GFX908-NEXT: s_add_i32 s6, s20, 0x400 5560; GFX908-NEXT: s_mov_b64 s[4:5], 0 5561; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 5562; GFX908-NEXT: v_mov_b32_e32 v3, s6 5563; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start 5564; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 5565; GFX908-NEXT: s_waitcnt vmcnt(0) 5566; GFX908-NEXT: v_mov_b32_e32 v5, v0 5567; GFX908-NEXT: v_pk_max_f16 v0, v5, v5 5568; GFX908-NEXT: v_pk_min_f16 v4, v0, v2 5569; GFX908-NEXT: v_mov_b32_e32 v0, v4 5570; GFX908-NEXT: v_mov_b32_e32 v1, v5 5571; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc 5572; GFX908-NEXT: s_waitcnt vmcnt(0) 5573; GFX908-NEXT: buffer_wbinvl1 5574; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 5575; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5576; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 5577; GFX908-NEXT: s_cbranch_execnz .LBB16_1 5578; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 5579; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5580; GFX908-NEXT: s_setpc_b64 
s[30:31] 5581; 5582; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 5583; GFX8: ; %bb.0: 5584; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5585; GFX8-NEXT: v_mov_b32_e32 v1, v0 5586; GFX8-NEXT: v_mov_b32_e32 v0, s20 5587; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 5588; GFX8-NEXT: s_add_i32 s6, s20, 0x400 5589; GFX8-NEXT: s_mov_b64 s[4:5], 0 5590; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 5591; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 5592; GFX8-NEXT: v_mov_b32_e32 v4, s6 5593; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start 5594; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 5595; GFX8-NEXT: s_waitcnt vmcnt(0) 5596; GFX8-NEXT: v_mov_b32_e32 v6, v0 5597; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 5598; GFX8-NEXT: v_max_f16_e32 v1, v6, v6 5599; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5600; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 5601; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 5602; GFX8-NEXT: v_mov_b32_e32 v0, v5 5603; GFX8-NEXT: v_mov_b32_e32 v1, v6 5604; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 5605; GFX8-NEXT: s_waitcnt vmcnt(0) 5606; GFX8-NEXT: buffer_wbinvl1 5607; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 5608; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5609; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 5610; GFX8-NEXT: s_cbranch_execnz .LBB16_1 5611; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 5612; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5613; GFX8-NEXT: s_setpc_b64 s[30:31] 5614; 5615; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 5616; GFX7: ; %bb.0: 5617; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5618; GFX7-NEXT: v_mov_b32_e32 v2, s20 5619; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 5620; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 
5621; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 5622; GFX7-NEXT: s_add_i32 s6, s20, 0x400 5623; GFX7-NEXT: s_mov_b64 s[4:5], 0 5624; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 5625; GFX7-NEXT: s_waitcnt vmcnt(0) 5626; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 5627; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 5628; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 5629; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 5630; GFX7-NEXT: v_mov_b32_e32 v4, s6 5631; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start 5632; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 5633; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 5634; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 5635; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1 5636; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 5637; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 5638; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 5639; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 5640; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 5641; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 5642; GFX7-NEXT: v_or_b32_e32 v6, v0, v1 5643; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 5644; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 5645; GFX7-NEXT: v_mov_b32_e32 v8, v6 5646; GFX7-NEXT: v_mov_b32_e32 v7, v5 5647; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc 5648; GFX7-NEXT: s_waitcnt vmcnt(0) 5649; GFX7-NEXT: buffer_wbinvl1 5650; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 5651; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7 5652; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 5653; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 5654; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5655; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 5656; GFX7-NEXT: s_cbranch_execnz .LBB16_1 5657; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 5658; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 5659; GFX7-NEXT: s_setpc_b64 s[30:31] 5660; 5661; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no_fine_grained_memory: 5662; GFX6: ; %bb.0: 5663; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5664; GFX6-NEXT: v_mov_b32_e32 v2, s20 5665; GFX6-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 5666; GFX6-NEXT: 
v_cvt_f16_f32_e32 v1, v1 5667; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0 5668; GFX6-NEXT: s_add_i32 s6, s20, 0x400 5669; GFX6-NEXT: s_mov_b64 s[4:5], 0 5670; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1 5671; GFX6-NEXT: s_waitcnt vmcnt(0) 5672; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3 5673; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3 5674; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 5675; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4 5676; GFX6-NEXT: v_mov_b32_e32 v4, s6 5677; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start 5678; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 5679; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 5680; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 5681; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1 5682; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0 5683; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 5684; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 5685; GFX6-NEXT: v_min_f32_e32 v6, v6, v3 5686; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 5687; GFX6-NEXT: s_waitcnt expcnt(0) 5688; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 5689; GFX6-NEXT: v_or_b32_e32 v6, v0, v1 5690; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 5691; GFX6-NEXT: v_or_b32_e32 v5, v7, v0 5692; GFX6-NEXT: v_mov_b32_e32 v8, v6 5693; GFX6-NEXT: v_mov_b32_e32 v7, v5 5694; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc 5695; GFX6-NEXT: s_waitcnt vmcnt(0) 5696; GFX6-NEXT: buffer_wbinvl1 5697; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7 5698; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7 5699; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 5700; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 5701; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5702; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 5703; GFX6-NEXT: s_cbranch_execnz .LBB16_1 5704; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 5705; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 5706; GFX6-NEXT: s_waitcnt expcnt(0) 5707; GFX6-NEXT: s_setpc_b64 s[30:31] 5708 %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 5709 %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 5710 ret <2 x half> 
%result 5711} 5712
; No-return <2 x half> fmin test: the function returns void and the atomicrmw
; result (%unused) is never read. The ptr addrspace(7) argument is passed inreg,
; and the getelementptr of 256 <2 x half> elements matches the offset:1024 /
; 0x400 immediates that appear in the expected assembly below. For every check
; prefix listed for this function the expected code is a buffer_atomic_cmpswap
; retry loop (%atomicrmw.start ... %atomicrmw.end). The expected-output comments
; are autogenerated by utils/update_llc_test_checks.py, so regenerate them with
; that script instead of editing them by hand.
5713define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 { 5714; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 5715; GFX12: ; %bb.0: 5716; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5717; GFX12-NEXT: s_wait_expcnt 0x0 5718; GFX12-NEXT: s_wait_samplecnt 0x0 5719; GFX12-NEXT: s_wait_bvhcnt 0x0 5720; GFX12-NEXT: s_wait_kmcnt 0x0 5721; GFX12-NEXT: v_mov_b32_e32 v1, s16 5722; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 5723; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 5724; GFX12-NEXT: s_wait_alu 0xfffe 5725; GFX12-NEXT: v_mov_b32_e32 v3, s4 5726; GFX12-NEXT: s_mov_b32 s4, 0 5727; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 5728; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start 5729; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 5730; GFX12-NEXT: s_wait_loadcnt 0x0 5731; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1 5732; GFX12-NEXT: s_wait_storecnt 0x0 5733; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5734; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2 5735; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 5736; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN 5737; GFX12-NEXT: s_wait_loadcnt 0x0 5738; GFX12-NEXT: global_inv scope:SCOPE_DEV 5739; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 5740; GFX12-NEXT: v_mov_b32_e32 v1, v4 5741; GFX12-NEXT: s_wait_alu 0xfffe 5742; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 5743; GFX12-NEXT: s_wait_alu 0xfffe 5744; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 5745; GFX12-NEXT: s_cbranch_execnz .LBB17_1 5746; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 5747; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 5748; GFX12-NEXT: s_wait_alu 0xfffe 5749; GFX12-NEXT: s_setpc_b64 s[30:31] 5750; 5751; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 5752; GFX940: ; 
%bb.0: 5753; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5754; GFX940-NEXT: v_mov_b32_e32 v1, s16 5755; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 5756; GFX940-NEXT: s_add_i32 s6, s16, 0x400 5757; GFX940-NEXT: s_mov_b64 s[4:5], 0 5758; GFX940-NEXT: v_pk_max_f16 v2, v0, v0 5759; GFX940-NEXT: v_mov_b32_e32 v3, s6 5760; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start 5761; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 5762; GFX940-NEXT: s_waitcnt vmcnt(0) 5763; GFX940-NEXT: v_pk_max_f16 v0, v1, v1 5764; GFX940-NEXT: buffer_wbl2 sc1 5765; GFX940-NEXT: v_pk_min_f16 v0, v0, v2 5766; GFX940-NEXT: s_nop 0 5767; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] 5768; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 5769; GFX940-NEXT: s_waitcnt vmcnt(0) 5770; GFX940-NEXT: buffer_inv sc1 5771; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 5772; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5773; GFX940-NEXT: v_mov_b32_e32 v1, v4 5774; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] 5775; GFX940-NEXT: s_cbranch_execnz .LBB17_1 5776; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 5777; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] 5778; GFX940-NEXT: s_setpc_b64 s[30:31] 5779; 5780; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 5781; GFX11: ; %bb.0: 5782; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5783; GFX11-NEXT: v_mov_b32_e32 v1, s16 5784; GFX11-NEXT: s_add_i32 s4, s16, 0x400 5785; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 5786; GFX11-NEXT: v_mov_b32_e32 v3, s4 5787; GFX11-NEXT: s_mov_b32 s4, 0 5788; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 5789; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start 5790; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 5791; GFX11-NEXT: s_waitcnt vmcnt(0) 5792; GFX11-NEXT: v_pk_max_f16 v0, v1, v1 5793; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5794; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5795; GFX11-NEXT: v_pk_min_f16 v0, 
v0, v2 5796; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 5797; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc 5798; GFX11-NEXT: s_waitcnt vmcnt(0) 5799; GFX11-NEXT: buffer_gl1_inv 5800; GFX11-NEXT: buffer_gl0_inv 5801; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 5802; GFX11-NEXT: v_mov_b32_e32 v1, v4 5803; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 5804; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5805; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 5806; GFX11-NEXT: s_cbranch_execnz .LBB17_1 5807; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 5808; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 5809; GFX11-NEXT: s_setpc_b64 s[30:31] 5810; 5811; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 5812; GFX10: ; %bb.0: 5813; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5814; GFX10-NEXT: v_mov_b32_e32 v1, s20 5815; GFX10-NEXT: s_add_i32 s4, s20, 0x400 5816; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 5817; GFX10-NEXT: v_mov_b32_e32 v3, s4 5818; GFX10-NEXT: s_mov_b32 s4, 0 5819; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 5820; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start 5821; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 5822; GFX10-NEXT: s_waitcnt vmcnt(0) 5823; GFX10-NEXT: v_pk_max_f16 v0, v1, v1 5824; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5825; GFX10-NEXT: v_pk_min_f16 v0, v0, v2 5826; GFX10-NEXT: v_mov_b32_e32 v5, v1 5827; GFX10-NEXT: v_mov_b32_e32 v4, v0 5828; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 5829; GFX10-NEXT: s_waitcnt vmcnt(0) 5830; GFX10-NEXT: buffer_gl1_inv 5831; GFX10-NEXT: buffer_gl0_inv 5832; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 5833; GFX10-NEXT: v_mov_b32_e32 v1, v4 5834; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 5835; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 5836; GFX10-NEXT: s_cbranch_execnz .LBB17_1 5837; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 5838; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 5839; GFX10-NEXT: s_setpc_b64 
s[30:31] 5840; 5841; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 5842; GFX90A: ; %bb.0: 5843; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5844; GFX90A-NEXT: v_mov_b32_e32 v1, s20 5845; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 5846; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 5847; GFX90A-NEXT: s_mov_b64 s[4:5], 0 5848; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 5849; GFX90A-NEXT: v_mov_b32_e32 v3, s6 5850; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start 5851; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 5852; GFX90A-NEXT: s_waitcnt vmcnt(0) 5853; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 5854; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 5855; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] 5856; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 5857; GFX90A-NEXT: s_waitcnt vmcnt(0) 5858; GFX90A-NEXT: buffer_wbinvl1 5859; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 5860; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5861; GFX90A-NEXT: v_mov_b32_e32 v1, v4 5862; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 5863; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 5864; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 5865; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5866; GFX90A-NEXT: s_setpc_b64 s[30:31] 5867; 5868; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 5869; GFX908: ; %bb.0: 5870; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5871; GFX908-NEXT: v_mov_b32_e32 v1, s20 5872; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 5873; GFX908-NEXT: s_add_i32 s6, s20, 0x400 5874; GFX908-NEXT: s_mov_b64 s[4:5], 0 5875; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 5876; GFX908-NEXT: v_mov_b32_e32 v3, s6 5877; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start 5878; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 5879; GFX908-NEXT: s_waitcnt vmcnt(0) 5880; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 5881; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 5882; 
GFX908-NEXT: v_mov_b32_e32 v5, v1 5883; GFX908-NEXT: v_mov_b32_e32 v4, v0 5884; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc 5885; GFX908-NEXT: s_waitcnt vmcnt(0) 5886; GFX908-NEXT: buffer_wbinvl1 5887; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 5888; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5889; GFX908-NEXT: v_mov_b32_e32 v1, v4 5890; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 5891; GFX908-NEXT: s_cbranch_execnz .LBB17_1 5892; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 5893; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5894; GFX908-NEXT: s_setpc_b64 s[30:31] 5895; 5896; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 5897; GFX8: ; %bb.0: 5898; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5899; GFX8-NEXT: v_mov_b32_e32 v1, s20 5900; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 5901; GFX8-NEXT: s_add_i32 s6, s20, 0x400 5902; GFX8-NEXT: s_mov_b64 s[4:5], 0 5903; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 5904; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 5905; GFX8-NEXT: v_mov_b32_e32 v4, s6 5906; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start 5907; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 5908; GFX8-NEXT: s_waitcnt vmcnt(0) 5909; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 5910; GFX8-NEXT: v_max_f16_e32 v5, v1, v1 5911; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 5912; GFX8-NEXT: v_min_f16_e32 v5, v5, v3 5913; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 5914; GFX8-NEXT: v_mov_b32_e32 v6, v1 5915; GFX8-NEXT: v_mov_b32_e32 v5, v0 5916; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 5917; GFX8-NEXT: s_waitcnt vmcnt(0) 5918; GFX8-NEXT: buffer_wbinvl1 5919; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 5920; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 5921; GFX8-NEXT: v_mov_b32_e32 v1, v5 5922; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[4:5] 5923; GFX8-NEXT: s_cbranch_execnz .LBB17_1 5924; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 5925; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5926; GFX8-NEXT: s_setpc_b64 s[30:31] 5927; 5928; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 5929; GFX7: ; %bb.0: 5930; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5931; GFX7-NEXT: v_mov_b32_e32 v2, s20 5932; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 5933; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 5934; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 5935; GFX7-NEXT: s_add_i32 s6, s20, 0x400 5936; GFX7-NEXT: s_mov_b64 s[4:5], 0 5937; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 5938; GFX7-NEXT: s_waitcnt vmcnt(0) 5939; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 5940; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 5941; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 5942; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 5943; GFX7-NEXT: v_mov_b32_e32 v2, s6 5944; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start 5945; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 5946; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 5947; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 5948; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4 5949; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 5950; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 5951; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 5952; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 5953; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5 5954; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 5955; GFX7-NEXT: v_or_b32_e32 v5, v3, v4 5956; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7 5957; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 5958; GFX7-NEXT: v_mov_b32_e32 v7, v5 5959; GFX7-NEXT: v_mov_b32_e32 v6, v4 5960; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc 5961; GFX7-NEXT: s_waitcnt vmcnt(0) 5962; GFX7-NEXT: buffer_wbinvl1 5963; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 5964; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 5965; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 5966; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 5967; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 
5968; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 5969; GFX7-NEXT: s_cbranch_execnz .LBB17_1 5970; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 5971; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 5972; GFX7-NEXT: s_setpc_b64 s[30:31] 5973; 5974; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fine_grained_memory: 5975; GFX6: ; %bb.0: 5976; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5977; GFX6-NEXT: v_mov_b32_e32 v2, s20 5978; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 5979; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 5980; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0 5981; GFX6-NEXT: s_add_i32 s6, s20, 0x400 5982; GFX6-NEXT: s_mov_b64 s[4:5], 0 5983; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1 5984; GFX6-NEXT: s_waitcnt vmcnt(0) 5985; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2 5986; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 5987; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 5988; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5 5989; GFX6-NEXT: v_mov_b32_e32 v2, s6 5990; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start 5991; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 5992; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 5993; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 5994; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4 5995; GFX6-NEXT: s_waitcnt expcnt(0) 5996; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3 5997; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 5998; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 5999; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 6000; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5 6001; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 6002; GFX6-NEXT: v_or_b32_e32 v5, v3, v4 6003; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 6004; GFX6-NEXT: v_or_b32_e32 v4, v6, v3 6005; GFX6-NEXT: v_mov_b32_e32 v7, v5 6006; GFX6-NEXT: v_mov_b32_e32 v6, v4 6007; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc 6008; GFX6-NEXT: s_waitcnt vmcnt(0) 6009; GFX6-NEXT: buffer_wbinvl1 6010; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6 6011; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6 6012; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 6013; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 
v6, v5 6014; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6015; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 6016; GFX6-NEXT: s_cbranch_execnz .LBB17_1 6017; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 6018; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 6019; GFX6-NEXT: s_waitcnt expcnt(0) 6020; GFX6-NEXT: s_setpc_b64 s[30:31] 6021 %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 6022 %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 6023 ret void 6024} 6025 6026define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 { 6027; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 6028; GFX12: ; %bb.0: 6029; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6030; GFX12-NEXT: s_wait_expcnt 0x0 6031; GFX12-NEXT: s_wait_samplecnt 0x0 6032; GFX12-NEXT: s_wait_bvhcnt 0x0 6033; GFX12-NEXT: s_wait_kmcnt 0x0 6034; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 6035; GFX12-NEXT: s_mov_b32 s1, exec_lo 6036; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 6037; GFX12-NEXT: v_readfirstlane_b32 s4, v0 6038; GFX12-NEXT: v_readfirstlane_b32 s5, v1 6039; GFX12-NEXT: v_readfirstlane_b32 s6, v2 6040; GFX12-NEXT: v_readfirstlane_b32 s7, v3 6041; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 6042; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 6043; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 6044; GFX12-NEXT: s_wait_alu 0xfffe 6045; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 6046; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 6047; GFX12-NEXT: s_wait_alu 0xfffe 6048; GFX12-NEXT: s_and_saveexec_b32 s0, s0 6049; GFX12-NEXT: s_wait_loadcnt 0x0 6050; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 6051; GFX12-NEXT: ; implicit-def: $vgpr4 6052; GFX12-NEXT: s_wait_alu 0xfffe 6053; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 
6054; GFX12-NEXT: s_cbranch_execnz .LBB18_1 6055; GFX12-NEXT: ; %bb.2: 6056; GFX12-NEXT: s_mov_b32 exec_lo, s1 6057; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5 6058; GFX12-NEXT: s_mov_b32 s1, 0 6059; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start 6060; GFX12-NEXT: ; =>This Loop Header: Depth=1 6061; GFX12-NEXT: ; Child Loop BB18_4 Depth 2 6062; GFX12-NEXT: s_wait_loadcnt 0x0 6063; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6 6064; GFX12-NEXT: s_mov_b32 s2, exec_lo 6065; GFX12-NEXT: s_wait_storecnt 0x0 6066; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6067; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8 6068; GFX12-NEXT: v_mov_b32_e32 v4, v5 6069; GFX12-NEXT: v_mov_b32_e32 v5, v6 6070; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 6071; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 6072; GFX12-NEXT: v_readfirstlane_b32 s4, v0 6073; GFX12-NEXT: v_readfirstlane_b32 s5, v1 6074; GFX12-NEXT: v_readfirstlane_b32 s6, v2 6075; GFX12-NEXT: v_readfirstlane_b32 s7, v3 6076; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 6077; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 6078; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 6079; GFX12-NEXT: s_wait_alu 0xfffe 6080; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 6081; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 6082; GFX12-NEXT: s_wait_alu 0xfffe 6083; GFX12-NEXT: s_and_saveexec_b32 s0, s0 6084; GFX12-NEXT: s_wait_loadcnt 0x0 6085; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN 6086; GFX12-NEXT: s_wait_alu 0xfffe 6087; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 6088; GFX12-NEXT: s_cbranch_execnz .LBB18_4 6089; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 6090; GFX12-NEXT: s_mov_b32 exec_lo, s2 6091; GFX12-NEXT: s_wait_loadcnt 0x0 6092; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 6093; GFX12-NEXT: v_mov_b32_e32 v6, v4 6094; GFX12-NEXT: global_inv scope:SCOPE_DEV 6095; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 6096; GFX12-NEXT: 
s_wait_alu 0xfffe 6097; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 6098; GFX12-NEXT: s_cbranch_execnz .LBB18_3 6099; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end 6100; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 6101; GFX12-NEXT: v_mov_b32_e32 v0, v4 6102; GFX12-NEXT: s_wait_alu 0xfffe 6103; GFX12-NEXT: s_setpc_b64 s[30:31] 6104; 6105; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 6106; GFX940: ; %bb.0: 6107; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6108; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 6109; GFX940-NEXT: s_mov_b64 s[2:3], exec 6110; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 6111; GFX940-NEXT: v_readfirstlane_b32 s4, v0 6112; GFX940-NEXT: v_readfirstlane_b32 s5, v1 6113; GFX940-NEXT: v_readfirstlane_b32 s6, v2 6114; GFX940-NEXT: v_readfirstlane_b32 s7, v3 6115; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 6116; GFX940-NEXT: s_nop 0 6117; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 6118; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 6119; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 6120; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 6121; GFX940-NEXT: ; implicit-def: $vgpr4 6122; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 6123; GFX940-NEXT: s_cbranch_execnz .LBB18_1 6124; GFX940-NEXT: ; %bb.2: 6125; GFX940-NEXT: s_mov_b64 exec, s[2:3] 6126; GFX940-NEXT: s_mov_b64 s[2:3], 0 6127; GFX940-NEXT: v_pk_max_f16 v9, v5, v5 6128; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start 6129; GFX940-NEXT: ; =>This Loop Header: Depth=1 6130; GFX940-NEXT: ; Child Loop BB18_4 Depth 2 6131; GFX940-NEXT: s_waitcnt vmcnt(0) 6132; GFX940-NEXT: v_pk_max_f16 v4, v7, v7 6133; GFX940-NEXT: s_mov_b64 s[8:9], exec 6134; GFX940-NEXT: v_pk_min_f16 v6, v4, v9 6135; GFX940-NEXT: buffer_wbl2 sc1 6136; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] 6137; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 6138; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 6139; GFX940-NEXT: 
v_readfirstlane_b32 s4, v0 6140; GFX940-NEXT: v_readfirstlane_b32 s5, v1 6141; GFX940-NEXT: v_readfirstlane_b32 s6, v2 6142; GFX940-NEXT: v_readfirstlane_b32 s7, v3 6143; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 6144; GFX940-NEXT: s_nop 0 6145; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 6146; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 6147; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 6148; GFX940-NEXT: s_waitcnt vmcnt(0) 6149; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 6150; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 6151; GFX940-NEXT: s_cbranch_execnz .LBB18_4 6152; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 6153; GFX940-NEXT: s_mov_b64 exec, s[8:9] 6154; GFX940-NEXT: s_waitcnt vmcnt(0) 6155; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 6156; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 6157; GFX940-NEXT: v_mov_b32_e32 v7, v4 6158; GFX940-NEXT: buffer_inv sc1 6159; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 6160; GFX940-NEXT: s_cbranch_execnz .LBB18_3 6161; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end 6162; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 6163; GFX940-NEXT: v_mov_b32_e32 v0, v4 6164; GFX940-NEXT: s_setpc_b64 s[30:31] 6165; 6166; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 6167; GFX11: ; %bb.0: 6168; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6169; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 6170; GFX11-NEXT: s_mov_b32 s1, 0 6171; GFX11-NEXT: s_mov_b32 s2, exec_lo 6172; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 6173; GFX11-NEXT: v_readfirstlane_b32 s4, v0 6174; GFX11-NEXT: v_readfirstlane_b32 s5, v1 6175; GFX11-NEXT: v_readfirstlane_b32 s6, v2 6176; GFX11-NEXT: v_readfirstlane_b32 s7, v3 6177; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 6178; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 6179; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 6180; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 6181; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 6182; GFX11-NEXT: s_and_saveexec_b32 s0, s0 6183; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 6184; GFX11-NEXT: ; implicit-def: $vgpr4 6185; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 6186; GFX11-NEXT: s_cbranch_execnz .LBB18_1 6187; GFX11-NEXT: ; %bb.2: 6188; GFX11-NEXT: s_mov_b32 exec_lo, s2 6189; GFX11-NEXT: v_pk_max_f16 v8, v5, v5 6190; GFX11-NEXT: .p2align 6 6191; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start 6192; GFX11-NEXT: ; =>This Loop Header: Depth=1 6193; GFX11-NEXT: ; Child Loop BB18_4 Depth 2 6194; GFX11-NEXT: s_waitcnt vmcnt(0) 6195; GFX11-NEXT: v_pk_max_f16 v4, v6, v6 6196; GFX11-NEXT: s_mov_b32 s2, exec_lo 6197; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 6198; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6199; GFX11-NEXT: v_pk_min_f16 v5, v4, v8 6200; GFX11-NEXT: v_mov_b32_e32 v4, v5 6201; GFX11-NEXT: v_mov_b32_e32 v5, v6 6202; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 6203; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 6204; GFX11-NEXT: v_readfirstlane_b32 s4, v0 6205; GFX11-NEXT: v_readfirstlane_b32 s5, v1 6206; GFX11-NEXT: v_readfirstlane_b32 s6, v2 6207; GFX11-NEXT: v_readfirstlane_b32 s7, v3 6208; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 6209; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 6210; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 6211; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 6212; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 6213; GFX11-NEXT: s_and_saveexec_b32 s0, s0 6214; GFX11-NEXT: s_waitcnt vmcnt(0) 6215; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc 6216; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 6217; GFX11-NEXT: s_cbranch_execnz .LBB18_4 6218; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 6219; GFX11-NEXT: s_mov_b32 exec_lo, s2 6220; GFX11-NEXT: 
s_waitcnt vmcnt(0) 6221; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 6222; GFX11-NEXT: v_mov_b32_e32 v6, v4 6223; GFX11-NEXT: buffer_gl1_inv 6224; GFX11-NEXT: buffer_gl0_inv 6225; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 6226; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6227; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 6228; GFX11-NEXT: s_cbranch_execnz .LBB18_3 6229; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end 6230; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 6231; GFX11-NEXT: v_mov_b32_e32 v0, v4 6232; GFX11-NEXT: s_setpc_b64 s[30:31] 6233; 6234; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 6235; GFX10: ; %bb.0: 6236; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6237; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 6238; GFX10-NEXT: s_mov_b32 s5, 0 6239; GFX10-NEXT: s_mov_b32 s6, exec_lo 6240; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 6241; GFX10-NEXT: v_readfirstlane_b32 s8, v0 6242; GFX10-NEXT: v_readfirstlane_b32 s9, v1 6243; GFX10-NEXT: v_readfirstlane_b32 s10, v2 6244; GFX10-NEXT: v_readfirstlane_b32 s11, v3 6245; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 6246; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 6247; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 6248; GFX10-NEXT: s_and_saveexec_b32 s4, s4 6249; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 6250; GFX10-NEXT: ; implicit-def: $vgpr4 6251; GFX10-NEXT: s_waitcnt_depctr 0xffe3 6252; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 6253; GFX10-NEXT: s_cbranch_execnz .LBB18_1 6254; GFX10-NEXT: ; %bb.2: 6255; GFX10-NEXT: s_mov_b32 exec_lo, s6 6256; GFX10-NEXT: v_pk_max_f16 v8, v5, v5 6257; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start 6258; GFX10-NEXT: ; =>This Loop Header: Depth=1 6259; GFX10-NEXT: ; Child Loop BB18_4 Depth 2 6260; GFX10-NEXT: s_waitcnt vmcnt(0) 6261; GFX10-NEXT: v_pk_max_f16 v4, v6, v6 6262; GFX10-NEXT: s_mov_b32 s6, exec_lo 6263; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 6264; GFX10-NEXT: v_pk_min_f16 
v5, v4, v8 6265; GFX10-NEXT: v_mov_b32_e32 v4, v5 6266; GFX10-NEXT: v_mov_b32_e32 v5, v6 6267; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 6268; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 6269; GFX10-NEXT: v_readfirstlane_b32 s8, v0 6270; GFX10-NEXT: v_readfirstlane_b32 s9, v1 6271; GFX10-NEXT: v_readfirstlane_b32 s10, v2 6272; GFX10-NEXT: v_readfirstlane_b32 s11, v3 6273; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 6274; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 6275; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 6276; GFX10-NEXT: s_and_saveexec_b32 s4, s4 6277; GFX10-NEXT: s_waitcnt vmcnt(0) 6278; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc 6279; GFX10-NEXT: s_waitcnt_depctr 0xffe3 6280; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 6281; GFX10-NEXT: s_cbranch_execnz .LBB18_4 6282; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 6283; GFX10-NEXT: s_mov_b32 exec_lo, s6 6284; GFX10-NEXT: s_waitcnt vmcnt(0) 6285; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 6286; GFX10-NEXT: v_mov_b32_e32 v6, v4 6287; GFX10-NEXT: buffer_gl1_inv 6288; GFX10-NEXT: buffer_gl0_inv 6289; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 6290; GFX10-NEXT: s_waitcnt_depctr 0xffe3 6291; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 6292; GFX10-NEXT: s_cbranch_execnz .LBB18_3 6293; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end 6294; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 6295; GFX10-NEXT: v_mov_b32_e32 v0, v4 6296; GFX10-NEXT: s_setpc_b64 s[30:31] 6297; 6298; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 6299; GFX90A: ; %bb.0: 6300; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6301; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 6302; GFX90A-NEXT: s_mov_b64 s[6:7], exec 6303; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 6304; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 6305; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 6306; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 6307; GFX90A-NEXT: 
v_readfirstlane_b32 s11, v3 6308; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6309; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6310; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6311; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6312; GFX90A-NEXT: s_nop 0 6313; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 6314; GFX90A-NEXT: ; implicit-def: $vgpr4 6315; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 6316; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 6317; GFX90A-NEXT: ; %bb.2: 6318; GFX90A-NEXT: s_mov_b64 exec, s[6:7] 6319; GFX90A-NEXT: s_mov_b64 s[6:7], 0 6320; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5 6321; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start 6322; GFX90A-NEXT: ; =>This Loop Header: Depth=1 6323; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2 6324; GFX90A-NEXT: s_waitcnt vmcnt(0) 6325; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7 6326; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9 6327; GFX90A-NEXT: s_mov_b64 s[12:13], exec 6328; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] 6329; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 6330; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 6331; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 6332; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 6333; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 6334; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 6335; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6336; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6337; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6338; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6339; GFX90A-NEXT: s_waitcnt vmcnt(0) 6340; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 6341; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 6342; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 6343; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 6344; GFX90A-NEXT: s_mov_b64 exec, s[12:13] 6345; GFX90A-NEXT: s_waitcnt vmcnt(0) 6346; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 6347; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6348; GFX90A-NEXT: 
v_mov_b32_e32 v7, v4 6349; GFX90A-NEXT: buffer_wbinvl1 6350; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 6351; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 6352; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end 6353; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 6354; GFX90A-NEXT: v_mov_b32_e32 v0, v4 6355; GFX90A-NEXT: s_setpc_b64 s[30:31] 6356; 6357; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 6358; GFX908: ; %bb.0: 6359; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6360; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 6361; GFX908-NEXT: s_mov_b64 s[6:7], exec 6362; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 6363; GFX908-NEXT: v_readfirstlane_b32 s8, v0 6364; GFX908-NEXT: v_readfirstlane_b32 s9, v1 6365; GFX908-NEXT: v_readfirstlane_b32 s10, v2 6366; GFX908-NEXT: v_readfirstlane_b32 s11, v3 6367; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6368; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6369; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6370; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6371; GFX908-NEXT: s_nop 0 6372; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 6373; GFX908-NEXT: ; implicit-def: $vgpr4 6374; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 6375; GFX908-NEXT: s_cbranch_execnz .LBB18_1 6376; GFX908-NEXT: ; %bb.2: 6377; GFX908-NEXT: s_mov_b64 exec, s[6:7] 6378; GFX908-NEXT: s_mov_b64 s[6:7], 0 6379; GFX908-NEXT: v_pk_max_f16 v8, v5, v5 6380; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start 6381; GFX908-NEXT: ; =>This Loop Header: Depth=1 6382; GFX908-NEXT: ; Child Loop BB18_4 Depth 2 6383; GFX908-NEXT: s_waitcnt vmcnt(0) 6384; GFX908-NEXT: v_pk_max_f16 v4, v6, v6 6385; GFX908-NEXT: v_pk_min_f16 v5, v4, v8 6386; GFX908-NEXT: v_mov_b32_e32 v4, v5 6387; GFX908-NEXT: s_mov_b64 s[12:13], exec 6388; GFX908-NEXT: v_mov_b32_e32 v5, v6 6389; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 6390; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 6391; GFX908-NEXT: 
v_readfirstlane_b32 s8, v0 6392; GFX908-NEXT: v_readfirstlane_b32 s9, v1 6393; GFX908-NEXT: v_readfirstlane_b32 s10, v2 6394; GFX908-NEXT: v_readfirstlane_b32 s11, v3 6395; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6396; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6397; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6398; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6399; GFX908-NEXT: s_waitcnt vmcnt(0) 6400; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc 6401; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 6402; GFX908-NEXT: s_cbranch_execnz .LBB18_4 6403; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 6404; GFX908-NEXT: s_mov_b64 exec, s[12:13] 6405; GFX908-NEXT: s_waitcnt vmcnt(0) 6406; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 6407; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6408; GFX908-NEXT: v_mov_b32_e32 v6, v4 6409; GFX908-NEXT: buffer_wbinvl1 6410; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 6411; GFX908-NEXT: s_cbranch_execnz .LBB18_3 6412; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end 6413; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 6414; GFX908-NEXT: v_mov_b32_e32 v0, v4 6415; GFX908-NEXT: s_setpc_b64 s[30:31] 6416; 6417; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 6418; GFX8: ; %bb.0: 6419; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6420; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 6421; GFX8-NEXT: s_mov_b64 s[6:7], exec 6422; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 6423; GFX8-NEXT: v_readfirstlane_b32 s8, v0 6424; GFX8-NEXT: v_readfirstlane_b32 s9, v1 6425; GFX8-NEXT: v_readfirstlane_b32 s10, v2 6426; GFX8-NEXT: v_readfirstlane_b32 s11, v3 6427; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6428; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6429; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6430; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6431; GFX8-NEXT: s_nop 0 6432; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen 
offset:1024 6433; GFX8-NEXT: ; implicit-def: $vgpr4 6434; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 6435; GFX8-NEXT: s_cbranch_execnz .LBB18_1 6436; GFX8-NEXT: ; %bb.2: 6437; GFX8-NEXT: s_mov_b64 exec, s[6:7] 6438; GFX8-NEXT: s_mov_b64 s[6:7], 0 6439; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 6440; GFX8-NEXT: v_max_f16_e32 v9, v5, v5 6441; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start 6442; GFX8-NEXT: ; =>This Loop Header: Depth=1 6443; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 6444; GFX8-NEXT: s_waitcnt vmcnt(0) 6445; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 6446; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 6447; GFX8-NEXT: v_min_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 6448; GFX8-NEXT: v_min_f16_e32 v5, v5, v9 6449; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 6450; GFX8-NEXT: v_mov_b32_e32 v4, v5 6451; GFX8-NEXT: s_mov_b64 s[12:13], exec 6452; GFX8-NEXT: v_mov_b32_e32 v5, v6 6453; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 6454; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 6455; GFX8-NEXT: v_readfirstlane_b32 s8, v0 6456; GFX8-NEXT: v_readfirstlane_b32 s9, v1 6457; GFX8-NEXT: v_readfirstlane_b32 s10, v2 6458; GFX8-NEXT: v_readfirstlane_b32 s11, v3 6459; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6460; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6461; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6462; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6463; GFX8-NEXT: s_waitcnt vmcnt(0) 6464; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc 6465; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 6466; GFX8-NEXT: s_cbranch_execnz .LBB18_4 6467; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 6468; GFX8-NEXT: s_mov_b64 exec, s[12:13] 6469; GFX8-NEXT: s_waitcnt vmcnt(0) 6470; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 6471; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6472; GFX8-NEXT: v_mov_b32_e32 v6, v4 
6473; GFX8-NEXT: buffer_wbinvl1 6474; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 6475; GFX8-NEXT: s_cbranch_execnz .LBB18_3 6476; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end 6477; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 6478; GFX8-NEXT: v_mov_b32_e32 v0, v4 6479; GFX8-NEXT: s_setpc_b64 s[30:31] 6480; 6481; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 6482; GFX7: ; %bb.0: 6483; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6484; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 6485; GFX7-NEXT: s_mov_b64 s[6:7], exec 6486; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 6487; GFX7-NEXT: v_readfirstlane_b32 s8, v0 6488; GFX7-NEXT: v_readfirstlane_b32 s9, v1 6489; GFX7-NEXT: v_readfirstlane_b32 s10, v2 6490; GFX7-NEXT: v_readfirstlane_b32 s11, v3 6491; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6492; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6493; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6494; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6495; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 6496; GFX7-NEXT: ; implicit-def: $vgpr4 6497; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 6498; GFX7-NEXT: s_cbranch_execnz .LBB18_1 6499; GFX7-NEXT: ; %bb.2: 6500; GFX7-NEXT: s_mov_b64 exec, s[6:7] 6501; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 6502; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5 6503; GFX7-NEXT: s_waitcnt vmcnt(0) 6504; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 6505; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 6506; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 6507; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 6508; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 6509; GFX7-NEXT: s_mov_b64 s[6:7], 0 6510; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start 6511; GFX7-NEXT: ; =>This Loop Header: Depth=1 6512; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 6513; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 6514; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 6515; GFX7-NEXT: s_mov_b64 s[12:13], exec 6516; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 6517; GFX7-NEXT: 
v_cvt_f32_f16_e32 v7, v4 6518; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 6519; GFX7-NEXT: v_min_f32_e32 v6, v6, v10 6520; GFX7-NEXT: v_min_f32_e32 v7, v7, v11 6521; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 6522; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 6523; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 6524; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 6525; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 6526; GFX7-NEXT: v_mov_b32_e32 v8, v6 6527; GFX7-NEXT: v_mov_b32_e32 v7, v5 6528; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 6529; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 6530; GFX7-NEXT: v_readfirstlane_b32 s8, v0 6531; GFX7-NEXT: v_readfirstlane_b32 s9, v1 6532; GFX7-NEXT: v_readfirstlane_b32 s10, v2 6533; GFX7-NEXT: v_readfirstlane_b32 s11, v3 6534; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6535; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6536; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6537; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6538; GFX7-NEXT: s_waitcnt vmcnt(0) 6539; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc 6540; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 6541; GFX7-NEXT: s_cbranch_execnz .LBB18_4 6542; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 6543; GFX7-NEXT: s_mov_b64 exec, s[12:13] 6544; GFX7-NEXT: s_waitcnt vmcnt(0) 6545; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 6546; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 6547; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 6548; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 6549; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6550; GFX7-NEXT: buffer_wbinvl1 6551; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 6552; GFX7-NEXT: s_cbranch_execnz .LBB18_3 6553; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end 6554; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 6555; GFX7-NEXT: v_mov_b32_e32 v0, v4 6556; GFX7-NEXT: v_mov_b32_e32 v1, v5 6557; GFX7-NEXT: s_setpc_b64 s[30:31] 6558; 6559; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: 6560; GFX6: ; %bb.0: 6561; GFX6-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 6562; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 6563; GFX6-NEXT: s_mov_b64 s[6:7], exec 6564; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 6565; GFX6-NEXT: v_readfirstlane_b32 s8, v0 6566; GFX6-NEXT: v_readfirstlane_b32 s9, v1 6567; GFX6-NEXT: v_readfirstlane_b32 s10, v2 6568; GFX6-NEXT: v_readfirstlane_b32 s11, v3 6569; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6570; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6571; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6572; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6573; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 6574; GFX6-NEXT: ; implicit-def: $vgpr4 6575; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 6576; GFX6-NEXT: s_cbranch_execnz .LBB18_1 6577; GFX6-NEXT: ; %bb.2: 6578; GFX6-NEXT: s_mov_b64 exec, s[6:7] 6579; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6 6580; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5 6581; GFX6-NEXT: s_waitcnt vmcnt(0) 6582; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 6583; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 6584; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 6585; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6 6586; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8 6587; GFX6-NEXT: s_mov_b64 s[6:7], 0 6588; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start 6589; GFX6-NEXT: ; =>This Loop Header: Depth=1 6590; GFX6-NEXT: ; Child Loop BB18_4 Depth 2 6591; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5 6592; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 6593; GFX6-NEXT: s_mov_b64 s[12:13], exec 6594; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5 6595; GFX6-NEXT: s_waitcnt expcnt(0) 6596; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4 6597; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 6598; GFX6-NEXT: v_min_f32_e32 v6, v6, v10 6599; GFX6-NEXT: v_min_f32_e32 v7, v7, v11 6600; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6 6601; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7 6602; GFX6-NEXT: v_or_b32_e32 v6, v4, v5 6603; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8 6604; GFX6-NEXT: v_or_b32_e32 v5, v7, v4 6605; GFX6-NEXT: v_mov_b32_e32 v8, v6 6606; GFX6-NEXT: 
v_mov_b32_e32 v7, v5 6607; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 6608; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 6609; GFX6-NEXT: v_readfirstlane_b32 s8, v0 6610; GFX6-NEXT: v_readfirstlane_b32 s9, v1 6611; GFX6-NEXT: v_readfirstlane_b32 s10, v2 6612; GFX6-NEXT: v_readfirstlane_b32 s11, v3 6613; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 6614; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 6615; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 6616; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 6617; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 6618; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc 6619; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 6620; GFX6-NEXT: s_cbranch_execnz .LBB18_4 6621; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 6622; GFX6-NEXT: s_mov_b64 exec, s[12:13] 6623; GFX6-NEXT: s_waitcnt vmcnt(0) 6624; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7 6625; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7 6626; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 6627; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 6628; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6629; GFX6-NEXT: buffer_wbinvl1 6630; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] 6631; GFX6-NEXT: s_cbranch_execnz .LBB18_3 6632; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end 6633; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 6634; GFX6-NEXT: v_mov_b32_e32 v0, v4 6635; GFX6-NEXT: v_mov_b32_e32 v1, v5 6636; GFX6-NEXT: s_waitcnt expcnt(0) 6637; GFX6-NEXT: s_setpc_b64 s[30:31] 6638 %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 6639 %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 6640 ret <2 x half> %result 6641} 6642 6643; -------------------------------------------------------------------- 6644; <2 x bfloat> 6645; -------------------------------------------------------------------- 6646; Returning form: agent-scope seq_cst atomicrmw fmin on <2 x bfloat>, 256 elements past %ptr (the checks address it as offset:1024 / 0x400). Every prefix below expects a buffer_atomic_cmpswap retry loop (%atomicrmw.start). Check lines are autogenerated by utils/update_llc_test_checks.py. Regenerate them instead of hand-editing. 6647define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, 
<2 x bfloat> %val) #0 { 6648; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 6649; GFX12: ; %bb.0: 6650; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6651; GFX12-NEXT: s_wait_expcnt 0x0 6652; GFX12-NEXT: s_wait_samplecnt 0x0 6653; GFX12-NEXT: s_wait_bvhcnt 0x0 6654; GFX12-NEXT: s_wait_kmcnt 0x0 6655; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 6656; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 6657; GFX12-NEXT: s_mov_b32 s5, 0 6658; GFX12-NEXT: s_wait_alu 0xfffe 6659; GFX12-NEXT: v_mov_b32_e32 v4, s4 6660; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 6661; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 6662; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 6663; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start 6664; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 6665; GFX12-NEXT: s_wait_loadcnt 0x0 6666; GFX12-NEXT: v_mov_b32_e32 v6, v0 6667; GFX12-NEXT: s_wait_storecnt 0x0 6668; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6669; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 6670; GFX12-NEXT: v_min_num_f32_e32 v1, v1, v3 6671; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 6672; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1 6673; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1 6674; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 6675; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff 6676; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6677; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 6678; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v2 6679; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 6680; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1 6681; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 6682; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 6683; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 6684; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6685; 
GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 6686; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 6687; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 6688; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 6689; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN 6690; GFX12-NEXT: s_wait_loadcnt 0x0 6691; GFX12-NEXT: global_inv scope:SCOPE_DEV 6692; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 6693; GFX12-NEXT: s_wait_alu 0xfffe 6694; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 6695; GFX12-NEXT: s_wait_alu 0xfffe 6696; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 6697; GFX12-NEXT: s_cbranch_execnz .LBB19_1 6698; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 6699; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 6700; GFX12-NEXT: s_wait_alu 0xfffe 6701; GFX12-NEXT: s_setpc_b64 s[30:31] 6702; 6703; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 6704; GFX940: ; %bb.0: 6705; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6706; GFX940-NEXT: v_mov_b32_e32 v1, v0 6707; GFX940-NEXT: v_mov_b32_e32 v0, s16 6708; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 6709; GFX940-NEXT: s_add_i32 s4, s16, 0x400 6710; GFX940-NEXT: s_mov_b64 s[6:7], 0 6711; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1 6712; GFX940-NEXT: s_movk_i32 s8, 0x7fff 6713; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 6714; GFX940-NEXT: s_mov_b32 s9, 0x7060302 6715; GFX940-NEXT: v_mov_b32_e32 v4, s4 6716; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start 6717; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 6718; GFX940-NEXT: s_waitcnt vmcnt(0) 6719; GFX940-NEXT: v_mov_b32_e32 v7, v0 6720; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 6721; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 6722; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 6723; GFX940-NEXT: v_min_f32_e32 v1, v1, v3 6724; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1 6725; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1 6726; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0 6727; GFX940-NEXT: 
v_or_b32_e32 v9, 0x400000, v1 6728; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8 6729; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8 6730; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 6731; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 6732; GFX940-NEXT: buffer_wbl2 sc1 6733; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc 6734; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] 6735; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9 6736; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7] 6737; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 6738; GFX940-NEXT: s_waitcnt vmcnt(0) 6739; GFX940-NEXT: buffer_inv sc1 6740; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 6741; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6742; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] 6743; GFX940-NEXT: s_cbranch_execnz .LBB19_1 6744; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 6745; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] 6746; GFX940-NEXT: s_setpc_b64 s[30:31] 6747; 6748; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 6749; GFX11: ; %bb.0: 6750; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6751; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 6752; GFX11-NEXT: s_add_i32 s4, s16, 0x400 6753; GFX11-NEXT: s_mov_b32 s5, 0 6754; GFX11-NEXT: v_mov_b32_e32 v4, s4 6755; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 6756; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 6757; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 6758; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 6759; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 6760; GFX11-NEXT: .p2align 6 6761; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start 6762; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 6763; GFX11-NEXT: s_waitcnt vmcnt(0) 6764; GFX11-NEXT: v_mov_b32_e32 v6, v0 6765; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 6766; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6767; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 6768; GFX11-NEXT: v_min_f32_e32 v1, v1, v3 
6769; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 6770; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 6771; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 6772; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 6773; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff 6774; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6775; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6 6776; GFX11-NEXT: v_min_f32_e32 v0, v0, v2 6777; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 6778; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 6779; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 6780; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 6781; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 6782; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6783; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 6784; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 6785; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 6786; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 6787; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc 6788; GFX11-NEXT: s_waitcnt vmcnt(0) 6789; GFX11-NEXT: buffer_gl1_inv 6790; GFX11-NEXT: buffer_gl0_inv 6791; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 6792; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 6793; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6794; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 6795; GFX11-NEXT: s_cbranch_execnz .LBB19_1 6796; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 6797; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 6798; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 6799; GFX11-NEXT: s_setpc_b64 s[30:31] 6800; 6801; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 6802; GFX10: ; %bb.0: 6803; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6804; GFX10-NEXT: v_mov_b32_e32 v1, v0 6805; GFX10-NEXT: v_mov_b32_e32 v0, s20 6806; GFX10-NEXT: s_add_i32 s4, s20, 0x400 6807; 
GFX10-NEXT: s_mov_b32 s5, 0 6808; GFX10-NEXT: v_mov_b32_e32 v4, s4 6809; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 6810; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 6811; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 6812; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start 6813; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 6814; GFX10-NEXT: s_waitcnt vmcnt(0) 6815; GFX10-NEXT: v_mov_b32_e32 v6, v0 6816; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 6817; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 6818; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 6819; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 6820; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 6821; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 6822; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 6823; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 6824; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1 6825; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 6826; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff 6827; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff 6828; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 6829; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo 6830; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4 6831; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 6832; GFX10-NEXT: v_mov_b32_e32 v0, v5 6833; GFX10-NEXT: v_mov_b32_e32 v1, v6 6834; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 6835; GFX10-NEXT: s_waitcnt vmcnt(0) 6836; GFX10-NEXT: buffer_gl1_inv 6837; GFX10-NEXT: buffer_gl0_inv 6838; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 6839; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 6840; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 6841; GFX10-NEXT: s_cbranch_execnz .LBB19_1 6842; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 6843; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 6844; GFX10-NEXT: s_setpc_b64 s[30:31] 6845; 6846; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 6847; GFX90A: ; %bb.0: 6848; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6849; GFX90A-NEXT: v_mov_b32_e32 v1, v0 6850; GFX90A-NEXT: 
v_mov_b32_e32 v0, s20 6851; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 6852; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 6853; GFX90A-NEXT: s_mov_b64 s[6:7], 0 6854; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 6855; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 6856; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 6857; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 6858; GFX90A-NEXT: v_mov_b32_e32 v4, s4 6859; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start 6860; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 6861; GFX90A-NEXT: s_waitcnt vmcnt(0) 6862; GFX90A-NEXT: v_mov_b32_e32 v7, v0 6863; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 6864; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 6865; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 6866; GFX90A-NEXT: v_min_f32_e32 v1, v1, v3 6867; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1 6868; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1 6869; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0 6870; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1 6871; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s8 6872; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s8 6873; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 6874; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 6875; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] 6876; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc 6877; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 6878; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] 6879; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 6880; GFX90A-NEXT: s_waitcnt vmcnt(0) 6881; GFX90A-NEXT: buffer_wbinvl1 6882; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 6883; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6884; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 6885; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 6886; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 6887; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 6888; GFX90A-NEXT: s_setpc_b64 s[30:31] 6889; 6890; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 6891; GFX908: ; %bb.0: 6892; GFX908-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 6893; GFX908-NEXT: v_mov_b32_e32 v1, v0 6894; GFX908-NEXT: v_mov_b32_e32 v0, s20 6895; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 6896; GFX908-NEXT: s_add_i32 s4, s20, 0x400 6897; GFX908-NEXT: s_mov_b64 s[6:7], 0 6898; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 6899; GFX908-NEXT: s_movk_i32 s8, 0x7fff 6900; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 6901; GFX908-NEXT: s_mov_b32 s9, 0x7060302 6902; GFX908-NEXT: v_mov_b32_e32 v4, s4 6903; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start 6904; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 6905; GFX908-NEXT: s_waitcnt vmcnt(0) 6906; GFX908-NEXT: v_mov_b32_e32 v6, v0 6907; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 6908; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 6909; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 6910; GFX908-NEXT: v_min_f32_e32 v1, v1, v3 6911; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1 6912; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1 6913; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 6914; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1 6915; GFX908-NEXT: v_add3_u32 v5, v5, v0, s8 6916; GFX908-NEXT: v_add3_u32 v8, v8, v1, s8 6917; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 6918; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 6919; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] 6920; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc 6921; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 6922; GFX908-NEXT: v_mov_b32_e32 v0, v5 6923; GFX908-NEXT: v_mov_b32_e32 v1, v6 6924; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 6925; GFX908-NEXT: s_waitcnt vmcnt(0) 6926; GFX908-NEXT: buffer_wbinvl1 6927; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 6928; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6929; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 6930; GFX908-NEXT: s_cbranch_execnz .LBB19_1 6931; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 6932; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 6933; GFX908-NEXT: s_setpc_b64 s[30:31] 6934; 6935; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 6936; GFX8: ; %bb.0: 6937; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6938; GFX8-NEXT: v_mov_b32_e32 v1, v0 6939; GFX8-NEXT: v_mov_b32_e32 v0, s20 6940; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 6941; GFX8-NEXT: s_add_i32 s4, s20, 0x400 6942; GFX8-NEXT: s_mov_b64 s[6:7], 0 6943; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 6944; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 6945; GFX8-NEXT: v_mov_b32_e32 v4, s4 6946; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start 6947; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 6948; GFX8-NEXT: s_waitcnt vmcnt(0) 6949; GFX8-NEXT: v_mov_b32_e32 v6, v0 6950; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 6951; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 6952; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 6953; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 6954; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 6955; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1 6956; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 6957; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1 6958; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 6959; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 6960; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 6961; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 6962; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 6963; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 6964; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc 6965; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5] 6966; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 6967; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 6968; GFX8-NEXT: v_mov_b32_e32 v0, v5 6969; GFX8-NEXT: v_mov_b32_e32 v1, v6 6970; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc 6971; GFX8-NEXT: s_waitcnt vmcnt(0) 6972; GFX8-NEXT: buffer_wbinvl1 6973; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 6974; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 6975; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 6976; GFX8-NEXT: s_cbranch_execnz .LBB19_1 6977; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 6978; 
GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 6979; GFX8-NEXT: s_setpc_b64 s[30:31] 6980; 6981; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 6982; GFX7: ; %bb.0: 6983; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6984; GFX7-NEXT: v_mov_b32_e32 v2, s20 6985; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 6986; GFX7-NEXT: s_add_i32 s6, s20, 0x400 6987; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 6988; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 6989; GFX7-NEXT: s_mov_b64 s[4:5], 0 6990; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 6991; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 6992; GFX7-NEXT: s_waitcnt vmcnt(0) 6993; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 6994; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 6995; GFX7-NEXT: v_mov_b32_e32 v4, s6 6996; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start 6997; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 6998; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 6999; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 7000; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 7001; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 7002; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 7003; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 7004; GFX7-NEXT: v_min_f32_e32 v6, v6, v3 7005; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 7006; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5 7007; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 7008; GFX7-NEXT: v_mov_b32_e32 v6, v1 7009; GFX7-NEXT: v_mov_b32_e32 v5, v0 7010; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 7011; GFX7-NEXT: s_waitcnt vmcnt(0) 7012; GFX7-NEXT: buffer_wbinvl1 7013; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 7014; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 7015; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7016; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5 7017; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 7018; GFX7-NEXT: s_cbranch_execnz .LBB19_1 7019; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 7020; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 7021; GFX7-NEXT: s_setpc_b64 s[30:31] 7022; 7023; 
GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu_no_fine_grained_memory: 7024; GFX6: ; %bb.0: 7025; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7026; GFX6-NEXT: v_mov_b32_e32 v2, s20 7027; GFX6-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 7028; GFX6-NEXT: s_add_i32 s6, s20, 0x400 7029; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 7030; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 7031; GFX6-NEXT: s_mov_b64 s[4:5], 0 7032; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 7033; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 7034; GFX6-NEXT: s_waitcnt vmcnt(0) 7035; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 7036; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4 7037; GFX6-NEXT: v_mov_b32_e32 v4, s6 7038; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start 7039; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 7040; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 7041; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 7042; GFX6-NEXT: s_waitcnt expcnt(0) 7043; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 7044; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 7045; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 7046; GFX6-NEXT: v_min_f32_e32 v5, v5, v2 7047; GFX6-NEXT: v_min_f32_e32 v6, v6, v3 7048; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16 7049; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5 7050; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16 7051; GFX6-NEXT: v_mov_b32_e32 v6, v1 7052; GFX6-NEXT: v_mov_b32_e32 v5, v0 7053; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 7054; GFX6-NEXT: s_waitcnt vmcnt(0) 7055; GFX6-NEXT: buffer_wbinvl1 7056; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 7057; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 7058; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7059; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5 7060; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 7061; GFX6-NEXT: s_cbranch_execnz .LBB19_1 7062; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 7063; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 7064; GFX6-NEXT: s_waitcnt expcnt(0) 7065; GFX6-NEXT: s_setpc_b64 s[30:31] 7066 %gep = getelementptr 
<2 x bfloat>, ptr addrspace(7) %ptr, i32 256 7067 %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 7068 ret <2 x bfloat> %result 7069} 7070 7071define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 { 7072; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 7073; GFX12: ; %bb.0: 7074; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7075; GFX12-NEXT: s_wait_expcnt 0x0 7076; GFX12-NEXT: s_wait_samplecnt 0x0 7077; GFX12-NEXT: s_wait_bvhcnt 0x0 7078; GFX12-NEXT: s_wait_kmcnt 0x0 7079; GFX12-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 7080; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 7081; GFX12-NEXT: s_wait_alu 0xfffe 7082; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 7083; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 7084; GFX12-NEXT: s_mov_b32 s5, 0 7085; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start 7086; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 7087; GFX12-NEXT: s_wait_loadcnt 0x0 7088; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 7089; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1 7090; GFX12-NEXT: s_wait_storecnt 0x0 7091; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7092; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v3 :: v_dual_min_num_f32 v0, v0, v2 7093; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 7094; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) 7095; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1 7096; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0 7097; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5 7098; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 7099; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 7100; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff 7101; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0 7102; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 7103; 
GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo 7104; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 7105; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7106; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 7107; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 7108; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN 7109; GFX12-NEXT: s_wait_loadcnt 0x0 7110; GFX12-NEXT: global_inv scope:SCOPE_DEV 7111; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 7112; GFX12-NEXT: v_mov_b32_e32 v1, v5 7113; GFX12-NEXT: s_wait_alu 0xfffe 7114; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5 7115; GFX12-NEXT: s_wait_alu 0xfffe 7116; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 7117; GFX12-NEXT: s_cbranch_execnz .LBB20_1 7118; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 7119; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5 7120; GFX12-NEXT: s_wait_alu 0xfffe 7121; GFX12-NEXT: s_setpc_b64 s[30:31] 7122; 7123; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 7124; GFX940: ; %bb.0: 7125; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7126; GFX940-NEXT: v_mov_b32_e32 v1, s16 7127; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 7128; GFX940-NEXT: s_add_i32 s4, s16, 0x400 7129; GFX940-NEXT: s_mov_b64 s[6:7], 0 7130; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0 7131; GFX940-NEXT: s_movk_i32 s8, 0x7fff 7132; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 7133; GFX940-NEXT: s_mov_b32 s9, 0x7060302 7134; GFX940-NEXT: v_mov_b32_e32 v4, s4 7135; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start 7136; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 7137; GFX940-NEXT: s_waitcnt vmcnt(0) 7138; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1 7139; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 7140; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 7141; GFX940-NEXT: v_min_f32_e32 v5, v5, v3 7142; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 7143; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1 7144; GFX940-NEXT: 
v_or_b32_e32 v7, 0x400000, v0 7145; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5 7146; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8 7147; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8 7148; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 7149; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 7150; GFX940-NEXT: buffer_wbl2 sc1 7151; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 7152; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] 7153; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9 7154; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1] 7155; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 7156; GFX940-NEXT: s_waitcnt vmcnt(0) 7157; GFX940-NEXT: buffer_inv sc1 7158; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 7159; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7160; GFX940-NEXT: v_mov_b32_e32 v1, v6 7161; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7] 7162; GFX940-NEXT: s_cbranch_execnz .LBB20_1 7163; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 7164; GFX940-NEXT: s_or_b64 exec, exec, s[6:7] 7165; GFX940-NEXT: s_setpc_b64 s[30:31] 7166; 7167; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 7168; GFX11: ; %bb.0: 7169; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7170; GFX11-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 7171; GFX11-NEXT: s_add_i32 s4, s16, 0x400 7172; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7173; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 7174; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 7175; GFX11-NEXT: s_mov_b32 s5, 0 7176; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 7177; GFX11-NEXT: .p2align 6 7178; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start 7179; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 7180; GFX11-NEXT: s_waitcnt vmcnt(0) 7181; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 7182; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 7183; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 7184; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) 7185; GFX11-NEXT: v_dual_min_f32 v5, v5, v3 :: v_dual_min_f32 v0, v0, v2 7186; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 7187; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 7188; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 7189; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 7190; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5 7191; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 7192; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 7193; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff 7194; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0 7195; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 7196; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo 7197; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 7198; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7199; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 7200; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 7201; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc 7202; GFX11-NEXT: s_waitcnt vmcnt(0) 7203; GFX11-NEXT: buffer_gl1_inv 7204; GFX11-NEXT: buffer_gl0_inv 7205; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 7206; GFX11-NEXT: v_mov_b32_e32 v1, v5 7207; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5 7208; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7209; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 7210; GFX11-NEXT: s_cbranch_execnz .LBB20_1 7211; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 7212; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 7213; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5 7214; GFX11-NEXT: s_setpc_b64 s[30:31] 7215; 7216; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 7217; GFX10: ; %bb.0: 7218; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7219; GFX10-NEXT: v_mov_b32_e32 v1, s20 7220; GFX10-NEXT: s_add_i32 s4, s20, 0x400 7221; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 7222; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 7223; GFX10-NEXT: v_mov_b32_e32 v4, s4 7224; GFX10-NEXT: 
buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 7225; GFX10-NEXT: s_mov_b32 s5, 0 7226; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start 7227; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 7228; GFX10-NEXT: s_waitcnt vmcnt(0) 7229; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 7230; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 7231; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 7232; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 7233; GFX10-NEXT: v_min_f32_e32 v5, v5, v3 7234; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1 7235; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 7236; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0 7237; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 7238; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 7239; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff 7240; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 7241; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 7242; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo 7243; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 7244; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 7245; GFX10-NEXT: v_mov_b32_e32 v6, v1 7246; GFX10-NEXT: v_mov_b32_e32 v5, v0 7247; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 7248; GFX10-NEXT: s_waitcnt vmcnt(0) 7249; GFX10-NEXT: buffer_gl1_inv 7250; GFX10-NEXT: buffer_gl0_inv 7251; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 7252; GFX10-NEXT: v_mov_b32_e32 v1, v5 7253; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 7254; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 7255; GFX10-NEXT: s_cbranch_execnz .LBB20_1 7256; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 7257; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 7258; GFX10-NEXT: s_setpc_b64 s[30:31] 7259; 7260; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 7261; GFX90A: ; %bb.0: 7262; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7263; GFX90A-NEXT: v_mov_b32_e32 v1, s20 7264; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 7265; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 7266; GFX90A-NEXT: s_mov_b64 s[6:7], 0 7267; 
GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 7268; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 7269; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 7270; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 7271; GFX90A-NEXT: v_mov_b32_e32 v4, s4 7272; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start 7273; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 7274; GFX90A-NEXT: s_waitcnt vmcnt(0) 7275; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 7276; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 7277; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 7278; GFX90A-NEXT: v_min_f32_e32 v5, v5, v3 7279; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 7280; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 7281; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 7282; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 7283; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 7284; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 7285; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 7286; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 7287; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] 7288; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 7289; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 7290; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] 7291; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc 7292; GFX90A-NEXT: s_waitcnt vmcnt(0) 7293; GFX90A-NEXT: buffer_wbinvl1 7294; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 7295; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7296; GFX90A-NEXT: v_mov_b32_e32 v1, v6 7297; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 7298; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 7299; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 7300; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 7301; GFX90A-NEXT: s_setpc_b64 s[30:31] 7302; 7303; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 7304; GFX908: ; %bb.0: 7305; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7306; GFX908-NEXT: v_mov_b32_e32 v1, s20 7307; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 7308; GFX908-NEXT: s_add_i32 s4, s20, 
0x400 7309; GFX908-NEXT: s_mov_b64 s[6:7], 0 7310; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 7311; GFX908-NEXT: s_movk_i32 s8, 0x7fff 7312; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 7313; GFX908-NEXT: s_mov_b32 s9, 0x7060302 7314; GFX908-NEXT: v_mov_b32_e32 v4, s4 7315; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start 7316; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 7317; GFX908-NEXT: s_waitcnt vmcnt(0) 7318; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 7319; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 7320; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 7321; GFX908-NEXT: v_min_f32_e32 v5, v5, v3 7322; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1 7323; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1 7324; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0 7325; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5 7326; GFX908-NEXT: v_add3_u32 v6, v6, v0, s8 7327; GFX908-NEXT: v_add3_u32 v8, v8, v5, s8 7328; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 7329; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 7330; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] 7331; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 7332; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 7333; GFX908-NEXT: v_mov_b32_e32 v6, v1 7334; GFX908-NEXT: v_mov_b32_e32 v5, v0 7335; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 7336; GFX908-NEXT: s_waitcnt vmcnt(0) 7337; GFX908-NEXT: buffer_wbinvl1 7338; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 7339; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7340; GFX908-NEXT: v_mov_b32_e32 v1, v5 7341; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 7342; GFX908-NEXT: s_cbranch_execnz .LBB20_1 7343; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 7344; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 7345; GFX908-NEXT: s_setpc_b64 s[30:31] 7346; 7347; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 7348; GFX8: ; %bb.0: 7349; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7350; GFX8-NEXT: v_mov_b32_e32 v1, s20 7351; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 
offen offset:1024 7352; GFX8-NEXT: s_add_i32 s4, s20, 0x400 7353; GFX8-NEXT: s_mov_b64 s[6:7], 0 7354; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 7355; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 7356; GFX8-NEXT: v_mov_b32_e32 v4, s4 7357; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start 7358; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 7359; GFX8-NEXT: s_waitcnt vmcnt(0) 7360; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 7361; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 7362; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 7363; GFX8-NEXT: v_min_f32_e32 v5, v5, v3 7364; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 7365; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 7366; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 7367; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 7368; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 7369; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 7370; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 7371; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 7372; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 7373; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 7374; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 7375; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] 7376; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 7377; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 7378; GFX8-NEXT: v_mov_b32_e32 v6, v1 7379; GFX8-NEXT: v_mov_b32_e32 v5, v0 7380; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc 7381; GFX8-NEXT: s_waitcnt vmcnt(0) 7382; GFX8-NEXT: buffer_wbinvl1 7383; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 7384; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7385; GFX8-NEXT: v_mov_b32_e32 v1, v5 7386; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 7387; GFX8-NEXT: s_cbranch_execnz .LBB20_1 7388; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 7389; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 7390; GFX8-NEXT: s_setpc_b64 s[30:31] 7391; 7392; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 7393; GFX7: ; %bb.0: 7394; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7395; GFX7-NEXT: v_mov_b32_e32 v2, s20 
7396; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 7397; GFX7-NEXT: s_add_i32 s6, s20, 0x400 7398; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 7399; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 7400; GFX7-NEXT: s_mov_b64 s[4:5], 0 7401; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 7402; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 7403; GFX7-NEXT: s_waitcnt vmcnt(0) 7404; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 7405; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 7406; GFX7-NEXT: v_mov_b32_e32 v2, s6 7407; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start 7408; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 7409; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 7410; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 7411; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 7412; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 7413; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 7414; GFX7-NEXT: v_min_f32_e32 v5, v5, v0 7415; GFX7-NEXT: v_min_f32_e32 v6, v6, v1 7416; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16 7417; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 7418; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 7419; GFX7-NEXT: v_mov_b32_e32 v6, v4 7420; GFX7-NEXT: v_mov_b32_e32 v5, v3 7421; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc 7422; GFX7-NEXT: s_waitcnt vmcnt(0) 7423; GFX7-NEXT: buffer_wbinvl1 7424; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 7425; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 7426; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7427; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 7428; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 7429; GFX7-NEXT: s_cbranch_execnz .LBB20_1 7430; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 7431; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 7432; GFX7-NEXT: s_setpc_b64 s[30:31] 7433; 7434; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fine_grained_memory: 7435; GFX6: ; %bb.0: 7436; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7437; GFX6-NEXT: v_mov_b32_e32 v2, s20 7438; GFX6-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 7439; 
GFX6-NEXT: s_add_i32 s6, s20, 0x400 7440; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 7441; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0 7442; GFX6-NEXT: s_mov_b64 s[4:5], 0 7443; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 7444; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 7445; GFX6-NEXT: s_waitcnt vmcnt(0) 7446; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 7447; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2 7448; GFX6-NEXT: v_mov_b32_e32 v2, s6 7449; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start 7450; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 7451; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4 7452; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 7453; GFX6-NEXT: s_waitcnt expcnt(0) 7454; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 7455; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 7456; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 7457; GFX6-NEXT: v_min_f32_e32 v5, v5, v0 7458; GFX6-NEXT: v_min_f32_e32 v6, v6, v1 7459; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16 7460; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5 7461; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16 7462; GFX6-NEXT: v_mov_b32_e32 v6, v4 7463; GFX6-NEXT: v_mov_b32_e32 v5, v3 7464; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc 7465; GFX6-NEXT: s_waitcnt vmcnt(0) 7466; GFX6-NEXT: buffer_wbinvl1 7467; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 7468; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 7469; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7470; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5 7471; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] 7472; GFX6-NEXT: s_cbranch_execnz .LBB20_1 7473; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end 7474; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] 7475; GFX6-NEXT: s_waitcnt expcnt(0) 7476; GFX6-NEXT: s_setpc_b64 s[30:31] 7477 %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256 7478 %unused = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 7479 ret void 7480} 7481 7482define <2 x bfloat> 
@buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 { 7483; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 7484; GFX12: ; %bb.0: 7485; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7486; GFX12-NEXT: s_wait_expcnt 0x0 7487; GFX12-NEXT: s_wait_samplecnt 0x0 7488; GFX12-NEXT: s_wait_bvhcnt 0x0 7489; GFX12-NEXT: s_wait_kmcnt 0x0 7490; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 7491; GFX12-NEXT: s_mov_b32 s1, exec_lo 7492; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 7493; GFX12-NEXT: v_readfirstlane_b32 s4, v0 7494; GFX12-NEXT: v_readfirstlane_b32 s5, v1 7495; GFX12-NEXT: v_readfirstlane_b32 s6, v2 7496; GFX12-NEXT: v_readfirstlane_b32 s7, v3 7497; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 7498; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 7499; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 7500; GFX12-NEXT: s_wait_alu 0xfffe 7501; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 7502; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 7503; GFX12-NEXT: s_wait_alu 0xfffe 7504; GFX12-NEXT: s_and_saveexec_b32 s0, s0 7505; GFX12-NEXT: s_wait_loadcnt 0x0 7506; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 7507; GFX12-NEXT: ; implicit-def: $vgpr4 7508; GFX12-NEXT: s_wait_alu 0xfffe 7509; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 7510; GFX12-NEXT: s_cbranch_execnz .LBB21_1 7511; GFX12-NEXT: ; %bb.2: 7512; GFX12-NEXT: s_mov_b32 exec_lo, s1 7513; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5 7514; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 7515; GFX12-NEXT: s_mov_b32 s1, 0 7516; GFX12-NEXT: .LBB21_3: ; %atomicrmw.start 7517; GFX12-NEXT: ; =>This Loop Header: Depth=1 7518; GFX12-NEXT: ; Child Loop BB21_4 Depth 2 7519; GFX12-NEXT: s_wait_loadcnt 0x0 7520; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 7521; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6 7522; GFX12-NEXT: s_mov_b32 s2, 
exec_lo 7523; GFX12-NEXT: s_wait_storecnt 0x0 7524; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7525; GFX12-NEXT: v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8 7526; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1 7527; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) 7528; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1 7529; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4 7530; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 7531; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5 7532; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff 7533; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff 7534; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) 7535; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo 7536; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 7537; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo 7538; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7539; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 7540; GFX12-NEXT: v_mov_b32_e32 v4, v5 7541; GFX12-NEXT: v_mov_b32_e32 v5, v6 7542; GFX12-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 7543; GFX12-NEXT: ; => This Inner Loop Header: Depth=2 7544; GFX12-NEXT: v_readfirstlane_b32 s4, v0 7545; GFX12-NEXT: v_readfirstlane_b32 s5, v1 7546; GFX12-NEXT: v_readfirstlane_b32 s6, v2 7547; GFX12-NEXT: v_readfirstlane_b32 s7, v3 7548; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 7549; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 7550; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 7551; GFX12-NEXT: s_wait_alu 0xfffe 7552; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 7553; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 7554; GFX12-NEXT: s_wait_alu 0xfffe 7555; GFX12-NEXT: s_and_saveexec_b32 s0, s0 7556; GFX12-NEXT: s_wait_loadcnt 0x0 7557; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN 7558; GFX12-NEXT: s_wait_alu 0xfffe 7559; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 7560; 
GFX12-NEXT: s_cbranch_execnz .LBB21_4 7561; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 7562; GFX12-NEXT: s_mov_b32 exec_lo, s2 7563; GFX12-NEXT: s_wait_loadcnt 0x0 7564; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 7565; GFX12-NEXT: v_mov_b32_e32 v6, v4 7566; GFX12-NEXT: global_inv scope:SCOPE_DEV 7567; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 7568; GFX12-NEXT: s_wait_alu 0xfffe 7569; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 7570; GFX12-NEXT: s_cbranch_execnz .LBB21_3 7571; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end 7572; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 7573; GFX12-NEXT: v_mov_b32_e32 v0, v4 7574; GFX12-NEXT: s_wait_alu 0xfffe 7575; GFX12-NEXT: s_setpc_b64 s[30:31] 7576; 7577; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 7578; GFX940: ; %bb.0: 7579; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7580; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4 7581; GFX940-NEXT: s_mov_b64 s[2:3], exec 7582; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 7583; GFX940-NEXT: v_readfirstlane_b32 s4, v0 7584; GFX940-NEXT: v_readfirstlane_b32 s5, v1 7585; GFX940-NEXT: v_readfirstlane_b32 s6, v2 7586; GFX940-NEXT: v_readfirstlane_b32 s7, v3 7587; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 7588; GFX940-NEXT: s_nop 0 7589; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 7590; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 7591; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 7592; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 7593; GFX940-NEXT: ; implicit-def: $vgpr4 7594; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 7595; GFX940-NEXT: s_cbranch_execnz .LBB21_1 7596; GFX940-NEXT: ; %bb.2: 7597; GFX940-NEXT: s_mov_b64 exec, s[2:3] 7598; GFX940-NEXT: s_mov_b64 s[2:3], 0 7599; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5 7600; GFX940-NEXT: s_movk_i32 s10, 0x7fff 7601; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 7602; GFX940-NEXT: s_mov_b32 s11, 0x7060302 7603; GFX940-NEXT: 
.LBB21_3: ; %atomicrmw.start 7604; GFX940-NEXT: ; =>This Loop Header: Depth=1 7605; GFX940-NEXT: ; Child Loop BB21_4 Depth 2 7606; GFX940-NEXT: s_waitcnt vmcnt(0) 7607; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7 7608; GFX940-NEXT: v_min_f32_e32 v4, v4, v9 7609; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 7610; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10 7611; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4 7612; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 7613; GFX940-NEXT: s_mov_b64 s[8:9], exec 7614; GFX940-NEXT: buffer_wbl2 sc1 7615; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 7616; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 7617; GFX940-NEXT: v_min_f32_e32 v5, v5, v10 7618; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 7619; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10 7620; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5 7621; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 7622; GFX940-NEXT: s_nop 1 7623; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc 7624; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11 7625; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7] 7626; GFX940-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 7627; GFX940-NEXT: ; => This Inner Loop Header: Depth=2 7628; GFX940-NEXT: v_readfirstlane_b32 s4, v0 7629; GFX940-NEXT: v_readfirstlane_b32 s5, v1 7630; GFX940-NEXT: v_readfirstlane_b32 s6, v2 7631; GFX940-NEXT: v_readfirstlane_b32 s7, v3 7632; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 7633; GFX940-NEXT: s_nop 0 7634; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 7635; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 7636; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 7637; GFX940-NEXT: s_waitcnt vmcnt(0) 7638; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 7639; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1] 7640; GFX940-NEXT: s_cbranch_execnz .LBB21_4 7641; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 7642; GFX940-NEXT: s_mov_b64 exec, s[8:9] 7643; GFX940-NEXT: s_waitcnt vmcnt(0) 7644; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 7645; GFX940-NEXT: s_or_b64 s[2:3], 
vcc, s[2:3] 7646; GFX940-NEXT: v_mov_b32_e32 v7, v4 7647; GFX940-NEXT: buffer_inv sc1 7648; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 7649; GFX940-NEXT: s_cbranch_execnz .LBB21_3 7650; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end 7651; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 7652; GFX940-NEXT: v_mov_b32_e32 v0, v4 7653; GFX940-NEXT: s_setpc_b64 s[30:31] 7654; 7655; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 7656; GFX11: ; %bb.0: 7657; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7658; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 7659; GFX11-NEXT: s_mov_b32 s1, 0 7660; GFX11-NEXT: s_mov_b32 s2, exec_lo 7661; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 7662; GFX11-NEXT: v_readfirstlane_b32 s4, v0 7663; GFX11-NEXT: v_readfirstlane_b32 s5, v1 7664; GFX11-NEXT: v_readfirstlane_b32 s6, v2 7665; GFX11-NEXT: v_readfirstlane_b32 s7, v3 7666; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 7667; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 7668; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 7669; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 7670; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 7671; GFX11-NEXT: s_and_saveexec_b32 s0, s0 7672; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 7673; GFX11-NEXT: ; implicit-def: $vgpr4 7674; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 7675; GFX11-NEXT: s_cbranch_execnz .LBB21_1 7676; GFX11-NEXT: ; %bb.2: 7677; GFX11-NEXT: s_mov_b32 exec_lo, s2 7678; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5 7679; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 7680; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 7681; GFX11-NEXT: .p2align 6 7682; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start 7683; GFX11-NEXT: ; =>This Loop Header: Depth=1 7684; GFX11-NEXT: ; Child Loop BB21_4 Depth 2 7685; GFX11-NEXT: s_waitcnt vmcnt(0) 7686; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 7687; GFX11-NEXT: 
v_lshlrev_b32_e32 v4, 16, v6 7688; GFX11-NEXT: s_mov_b32 s2, exec_lo 7689; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 7690; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7691; GFX11-NEXT: v_dual_min_f32 v5, v5, v9 :: v_dual_min_f32 v4, v4, v8 7692; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 7693; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 7694; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 7695; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4 7696; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 7697; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 7698; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff 7699; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff 7700; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) 7701; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo 7702; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 7703; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo 7704; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7705; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 7706; GFX11-NEXT: v_mov_b32_e32 v4, v5 7707; GFX11-NEXT: v_mov_b32_e32 v5, v6 7708; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 7709; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 7710; GFX11-NEXT: v_readfirstlane_b32 s4, v0 7711; GFX11-NEXT: v_readfirstlane_b32 s5, v1 7712; GFX11-NEXT: v_readfirstlane_b32 s6, v2 7713; GFX11-NEXT: v_readfirstlane_b32 s7, v3 7714; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 7715; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 7716; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 7717; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 7718; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 7719; GFX11-NEXT: s_and_saveexec_b32 s0, s0 7720; GFX11-NEXT: s_waitcnt vmcnt(0) 7721; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc 7722; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 7723; GFX11-NEXT: s_cbranch_execnz 
.LBB21_4 7724; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 7725; GFX11-NEXT: s_mov_b32 exec_lo, s2 7726; GFX11-NEXT: s_waitcnt vmcnt(0) 7727; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 7728; GFX11-NEXT: v_mov_b32_e32 v6, v4 7729; GFX11-NEXT: buffer_gl1_inv 7730; GFX11-NEXT: buffer_gl0_inv 7731; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 7732; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7733; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 7734; GFX11-NEXT: s_cbranch_execnz .LBB21_3 7735; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end 7736; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 7737; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 7738; GFX11-NEXT: v_mov_b32_e32 v0, v4 7739; GFX11-NEXT: s_setpc_b64 s[30:31] 7740; 7741; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 7742; GFX10: ; %bb.0: 7743; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7744; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 7745; GFX10-NEXT: s_mov_b32 s5, 0 7746; GFX10-NEXT: s_mov_b32 s6, exec_lo 7747; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 7748; GFX10-NEXT: v_readfirstlane_b32 s8, v0 7749; GFX10-NEXT: v_readfirstlane_b32 s9, v1 7750; GFX10-NEXT: v_readfirstlane_b32 s10, v2 7751; GFX10-NEXT: v_readfirstlane_b32 s11, v3 7752; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 7753; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 7754; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 7755; GFX10-NEXT: s_and_saveexec_b32 s4, s4 7756; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 7757; GFX10-NEXT: ; implicit-def: $vgpr4 7758; GFX10-NEXT: s_waitcnt_depctr 0xffe3 7759; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 7760; GFX10-NEXT: s_cbranch_execnz .LBB21_1 7761; GFX10-NEXT: ; %bb.2: 7762; GFX10-NEXT: s_mov_b32 exec_lo, s6 7763; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5 7764; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 7765; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start 7766; GFX10-NEXT: ; =>This Loop Header: Depth=1 7767; 
GFX10-NEXT: ; Child Loop BB21_4 Depth 2 7768; GFX10-NEXT: s_waitcnt vmcnt(0) 7769; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 7770; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 7771; GFX10-NEXT: s_mov_b32 s6, exec_lo 7772; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 7773; GFX10-NEXT: v_min_f32_e32 v4, v4, v8 7774; GFX10-NEXT: v_min_f32_e32 v5, v5, v9 7775; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 7776; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 7777; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 7778; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 7779; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 7780; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff 7781; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff 7782; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo 7783; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 7784; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo 7785; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 7786; GFX10-NEXT: v_mov_b32_e32 v4, v5 7787; GFX10-NEXT: v_mov_b32_e32 v5, v6 7788; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 7789; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 7790; GFX10-NEXT: v_readfirstlane_b32 s8, v0 7791; GFX10-NEXT: v_readfirstlane_b32 s9, v1 7792; GFX10-NEXT: v_readfirstlane_b32 s10, v2 7793; GFX10-NEXT: v_readfirstlane_b32 s11, v3 7794; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1] 7795; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] 7796; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 7797; GFX10-NEXT: s_and_saveexec_b32 s4, s4 7798; GFX10-NEXT: s_waitcnt vmcnt(0) 7799; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc 7800; GFX10-NEXT: s_waitcnt_depctr 0xffe3 7801; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 7802; GFX10-NEXT: s_cbranch_execnz .LBB21_4 7803; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 7804; GFX10-NEXT: s_mov_b32 exec_lo, s6 7805; GFX10-NEXT: s_waitcnt vmcnt(0) 7806; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 7807; GFX10-NEXT: v_mov_b32_e32 v6, v4 7808; GFX10-NEXT: buffer_gl1_inv 7809; GFX10-NEXT: 
buffer_gl0_inv 7810; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 7811; GFX10-NEXT: s_waitcnt_depctr 0xffe3 7812; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 7813; GFX10-NEXT: s_cbranch_execnz .LBB21_3 7814; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end 7815; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 7816; GFX10-NEXT: v_mov_b32_e32 v0, v4 7817; GFX10-NEXT: s_setpc_b64 s[30:31] 7818; 7819; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 7820; GFX90A: ; %bb.0: 7821; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7822; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 7823; GFX90A-NEXT: s_mov_b64 s[6:7], exec 7824; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 7825; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 7826; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 7827; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 7828; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 7829; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 7830; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 7831; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 7832; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 7833; GFX90A-NEXT: s_nop 0 7834; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 7835; GFX90A-NEXT: ; implicit-def: $vgpr4 7836; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 7837; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 7838; GFX90A-NEXT: ; %bb.2: 7839; GFX90A-NEXT: s_mov_b64 exec, s[6:7] 7840; GFX90A-NEXT: s_mov_b64 s[6:7], 0 7841; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 7842; GFX90A-NEXT: s_movk_i32 s14, 0x7fff 7843; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 7844; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 7845; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start 7846; GFX90A-NEXT: ; =>This Loop Header: Depth=1 7847; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 7848; GFX90A-NEXT: s_waitcnt vmcnt(0) 7849; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 7850; GFX90A-NEXT: v_min_f32_e32 v4, v4, v9 7851; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 7852; GFX90A-NEXT: 
v_add3_u32 v5, v5, v4, s14 7853; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 7854; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 7855; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 7856; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 7857; GFX90A-NEXT: v_min_f32_e32 v5, v5, v10 7858; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 7859; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 7860; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 7861; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 7862; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc 7863; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 7864; GFX90A-NEXT: s_mov_b64 s[12:13], exec 7865; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] 7866; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 7867; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 7868; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 7869; GFX90A-NEXT: v_readfirstlane_b32 s9, v1 7870; GFX90A-NEXT: v_readfirstlane_b32 s10, v2 7871; GFX90A-NEXT: v_readfirstlane_b32 s11, v3 7872; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 7873; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 7874; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 7875; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 7876; GFX90A-NEXT: s_waitcnt vmcnt(0) 7877; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc 7878; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] 7879; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 7880; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 7881; GFX90A-NEXT: s_mov_b64 exec, s[12:13] 7882; GFX90A-NEXT: s_waitcnt vmcnt(0) 7883; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 7884; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7885; GFX90A-NEXT: v_mov_b32_e32 v7, v4 7886; GFX90A-NEXT: buffer_wbinvl1 7887; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 7888; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 7889; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end 7890; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 7891; GFX90A-NEXT: v_mov_b32_e32 v0, v4 7892; GFX90A-NEXT: s_setpc_b64 s[30:31] 7893; 7894; 
GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 7895; GFX908: ; %bb.0: 7896; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7897; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 7898; GFX908-NEXT: s_mov_b64 s[6:7], exec 7899; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 7900; GFX908-NEXT: v_readfirstlane_b32 s8, v0 7901; GFX908-NEXT: v_readfirstlane_b32 s9, v1 7902; GFX908-NEXT: v_readfirstlane_b32 s10, v2 7903; GFX908-NEXT: v_readfirstlane_b32 s11, v3 7904; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 7905; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 7906; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 7907; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 7908; GFX908-NEXT: s_nop 0 7909; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 7910; GFX908-NEXT: ; implicit-def: $vgpr4 7911; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 7912; GFX908-NEXT: s_cbranch_execnz .LBB21_1 7913; GFX908-NEXT: ; %bb.2: 7914; GFX908-NEXT: s_mov_b64 exec, s[6:7] 7915; GFX908-NEXT: s_mov_b64 s[6:7], 0 7916; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5 7917; GFX908-NEXT: s_movk_i32 s14, 0x7fff 7918; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 7919; GFX908-NEXT: s_mov_b32 s15, 0x7060302 7920; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start 7921; GFX908-NEXT: ; =>This Loop Header: Depth=1 7922; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 7923; GFX908-NEXT: s_waitcnt vmcnt(0) 7924; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 7925; GFX908-NEXT: v_min_f32_e32 v4, v4, v8 7926; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 7927; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 7928; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 7929; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 7930; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc 7931; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 7932; GFX908-NEXT: v_min_f32_e32 v5, v5, v9 7933; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 7934; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 7935; 
GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 7936; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 7937; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc 7938; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 7939; GFX908-NEXT: v_mov_b32_e32 v4, v5 7940; GFX908-NEXT: s_mov_b64 s[12:13], exec 7941; GFX908-NEXT: v_mov_b32_e32 v5, v6 7942; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 7943; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 7944; GFX908-NEXT: v_readfirstlane_b32 s8, v0 7945; GFX908-NEXT: v_readfirstlane_b32 s9, v1 7946; GFX908-NEXT: v_readfirstlane_b32 s10, v2 7947; GFX908-NEXT: v_readfirstlane_b32 s11, v3 7948; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 7949; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 7950; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 7951; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 7952; GFX908-NEXT: s_waitcnt vmcnt(0) 7953; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc 7954; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] 7955; GFX908-NEXT: s_cbranch_execnz .LBB21_4 7956; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 7957; GFX908-NEXT: s_mov_b64 exec, s[12:13] 7958; GFX908-NEXT: s_waitcnt vmcnt(0) 7959; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 7960; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 7961; GFX908-NEXT: v_mov_b32_e32 v6, v4 7962; GFX908-NEXT: buffer_wbinvl1 7963; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 7964; GFX908-NEXT: s_cbranch_execnz .LBB21_3 7965; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end 7966; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 7967; GFX908-NEXT: v_mov_b32_e32 v0, v4 7968; GFX908-NEXT: s_setpc_b64 s[30:31] 7969; 7970; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 7971; GFX8: ; %bb.0: 7972; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7973; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 7974; GFX8-NEXT: s_mov_b64 s[6:7], exec 7975; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 7976; GFX8-NEXT: 
v_readfirstlane_b32 s8, v0 7977; GFX8-NEXT: v_readfirstlane_b32 s9, v1 7978; GFX8-NEXT: v_readfirstlane_b32 s10, v2 7979; GFX8-NEXT: v_readfirstlane_b32 s11, v3 7980; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 7981; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 7982; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 7983; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 7984; GFX8-NEXT: s_nop 0 7985; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 7986; GFX8-NEXT: ; implicit-def: $vgpr4 7987; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 7988; GFX8-NEXT: s_cbranch_execnz .LBB21_1 7989; GFX8-NEXT: ; %bb.2: 7990; GFX8-NEXT: s_mov_b64 exec, s[6:7] 7991; GFX8-NEXT: s_mov_b64 s[6:7], 0 7992; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5 7993; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 7994; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start 7995; GFX8-NEXT: ; =>This Loop Header: Depth=1 7996; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 7997; GFX8-NEXT: s_waitcnt vmcnt(0) 7998; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 7999; GFX8-NEXT: v_min_f32_e32 v4, v4, v8 8000; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 8001; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 8002; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 8003; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 8004; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 8005; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc 8006; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 8007; GFX8-NEXT: v_min_f32_e32 v5, v5, v9 8008; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 8009; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 8010; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 8011; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 8012; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 8013; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc 8014; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 8015; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 8016; GFX8-NEXT: v_mov_b32_e32 v4, v5 8017; GFX8-NEXT: s_mov_b64 s[12:13], exec 8018; GFX8-NEXT: v_mov_b32_e32 v5, v6 8019; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 8020; 
GFX8-NEXT: ; => This Inner Loop Header: Depth=2 8021; GFX8-NEXT: v_readfirstlane_b32 s8, v0 8022; GFX8-NEXT: v_readfirstlane_b32 s9, v1 8023; GFX8-NEXT: v_readfirstlane_b32 s10, v2 8024; GFX8-NEXT: v_readfirstlane_b32 s11, v3 8025; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 8026; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 8027; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 8028; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 8029; GFX8-NEXT: s_waitcnt vmcnt(0) 8030; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc 8031; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] 8032; GFX8-NEXT: s_cbranch_execnz .LBB21_4 8033; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 8034; GFX8-NEXT: s_mov_b64 exec, s[12:13] 8035; GFX8-NEXT: s_waitcnt vmcnt(0) 8036; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 8037; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 8038; GFX8-NEXT: v_mov_b32_e32 v6, v4 8039; GFX8-NEXT: buffer_wbinvl1 8040; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 8041; GFX8-NEXT: s_cbranch_execnz .LBB21_3 8042; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end 8043; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 8044; GFX8-NEXT: v_mov_b32_e32 v0, v4 8045; GFX8-NEXT: s_setpc_b64 s[30:31] 8046; 8047; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 8048; GFX7: ; %bb.0: 8049; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8050; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 8051; GFX7-NEXT: s_mov_b64 s[6:7], exec 8052; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 8053; GFX7-NEXT: v_readfirstlane_b32 s8, v0 8054; GFX7-NEXT: v_readfirstlane_b32 s9, v1 8055; GFX7-NEXT: v_readfirstlane_b32 s10, v2 8056; GFX7-NEXT: v_readfirstlane_b32 s11, v3 8057; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 8058; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 8059; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 8060; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 8061; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 
8062; GFX7-NEXT: ; implicit-def: $vgpr4 8063; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 8064; GFX7-NEXT: s_cbranch_execnz .LBB21_1 8065; GFX7-NEXT: ; %bb.2: 8066; GFX7-NEXT: s_mov_b64 exec, s[6:7] 8067; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 8068; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 8069; GFX7-NEXT: s_waitcnt vmcnt(0) 8070; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 8071; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 8072; GFX7-NEXT: s_mov_b64 s[6:7], 0 8073; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 8074; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 8075; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start 8076; GFX7-NEXT: ; =>This Loop Header: Depth=1 8077; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 8078; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4 8079; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 8080; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 8081; GFX7-NEXT: v_min_f32_e32 v4, v4, v9 8082; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 8083; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 8084; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 8085; GFX7-NEXT: v_min_f32_e32 v7, v7, v10 8086; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16 8087; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 8088; GFX7-NEXT: v_mov_b32_e32 v7, v5 8089; GFX7-NEXT: s_mov_b64 s[12:13], exec 8090; GFX7-NEXT: v_mov_b32_e32 v6, v4 8091; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 8092; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 8093; GFX7-NEXT: v_readfirstlane_b32 s8, v0 8094; GFX7-NEXT: v_readfirstlane_b32 s9, v1 8095; GFX7-NEXT: v_readfirstlane_b32 s10, v2 8096; GFX7-NEXT: v_readfirstlane_b32 s11, v3 8097; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 8098; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 8099; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 8100; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 8101; GFX7-NEXT: s_waitcnt vmcnt(0) 8102; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc 8103; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] 8104; GFX7-NEXT: s_cbranch_execnz .LBB21_4 8105; GFX7-NEXT: ; %bb.5: ; in Loop: 
Header=BB21_3 Depth=1 8106; GFX7-NEXT: s_mov_b64 exec, s[12:13] 8107; GFX7-NEXT: s_waitcnt vmcnt(0) 8108; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 8109; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 8110; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 8111; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 8112; GFX7-NEXT: buffer_wbinvl1 8113; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 8114; GFX7-NEXT: s_cbranch_execnz .LBB21_3 8115; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end 8116; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 8117; GFX7-NEXT: v_mov_b32_e32 v0, v7 8118; GFX7-NEXT: v_mov_b32_e32 v1, v4 8119; GFX7-NEXT: s_setpc_b64 s[30:31] 8120; 8121; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: 8122; GFX6: ; %bb.0: 8123; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8124; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 8125; GFX6-NEXT: s_mov_b64 s[6:7], exec 8126; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 8127; GFX6-NEXT: v_readfirstlane_b32 s8, v0 8128; GFX6-NEXT: v_readfirstlane_b32 s9, v1 8129; GFX6-NEXT: v_readfirstlane_b32 s10, v2 8130; GFX6-NEXT: v_readfirstlane_b32 s11, v3 8131; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 8132; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 8133; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 8134; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 8135; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 8136; GFX6-NEXT: ; implicit-def: $vgpr4 8137; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 8138; GFX6-NEXT: s_cbranch_execnz .LBB21_1 8139; GFX6-NEXT: ; %bb.2: 8140; GFX6-NEXT: s_mov_b64 exec, s[6:7] 8141; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6 8142; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5 8143; GFX6-NEXT: s_waitcnt vmcnt(0) 8144; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 8145; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 8146; GFX6-NEXT: s_mov_b64 s[6:7], 0 8147; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 8148; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 8149; GFX6-NEXT: 
.LBB21_3: ; %atomicrmw.start 8150; GFX6-NEXT: ; =>This Loop Header: Depth=1 8151; GFX6-NEXT: ; Child Loop BB21_4 Depth 2 8152; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4 8153; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 8154; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7 8155; GFX6-NEXT: v_min_f32_e32 v4, v4, v9 8156; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 8157; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 8158; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4 8159; GFX6-NEXT: v_min_f32_e32 v7, v7, v10 8160; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16 8161; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16 8162; GFX6-NEXT: v_mov_b32_e32 v7, v5 8163; GFX6-NEXT: s_mov_b64 s[12:13], exec 8164; GFX6-NEXT: v_mov_b32_e32 v6, v4 8165; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 8166; GFX6-NEXT: ; => This Inner Loop Header: Depth=2 8167; GFX6-NEXT: v_readfirstlane_b32 s8, v0 8168; GFX6-NEXT: v_readfirstlane_b32 s9, v1 8169; GFX6-NEXT: v_readfirstlane_b32 s10, v2 8170; GFX6-NEXT: v_readfirstlane_b32 s11, v3 8171; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] 8172; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] 8173; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5] 8174; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] 8175; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 8176; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc 8177; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5] 8178; GFX6-NEXT: s_cbranch_execnz .LBB21_4 8179; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 8180; GFX6-NEXT: s_mov_b64 exec, s[12:13] 8181; GFX6-NEXT: s_waitcnt vmcnt(0) 8182; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 8183; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 8184; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 8185; GFX6-NEXT: s_waitcnt expcnt(0) 8186; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6 8187; GFX6-NEXT: buffer_wbinvl1 8188; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7] 8189; GFX6-NEXT: s_cbranch_execnz .LBB21_3 8190; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end 8191; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] 8192; GFX6-NEXT: 
v_mov_b32_e32 v0, v7
; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
  %result = atomicrmw fmin ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret <2 x bfloat> %result
}

; --------------------------------------------------------------------
; misc
; --------------------------------------------------------------------

; Returning float fmin atomicrmw through a buffer fat pointer (addrspace(7))
; with seq_cst ordering and no syncscope operand, tagged with
; !amdgpu.no.fine.grained.memory. The pointer is first advanced by 256 floats,
; which shows up as a 1024 (0x400) byte offset in the expected assembly.
; Per the expectations below, the gfx1200, gfx1100, gfx1010, hawaii and tahiti
; runs expect a single returning buffer fmin atomic, while the gfx940, gfx90a,
; gfx908 and tonga runs expect a buffer_atomic_cmpswap retry loop.
; The assertion lines are autogenerated (see the NOTE at the top of the file).
define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v1, s16
; GFX12-NEXT: global_wb scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, v0
; GFX940-NEXT: v_mov_b32_e32 v0, s16
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_add_i32 s6, s16, 0x400
; GFX940-NEXT: s_mov_b64 s[4:5], 0
; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
; GFX940-NEXT: v_mov_b32_e32 v3, s6
; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v0
; GFX940-NEXT: v_max_f32_e32 v0, v5, v5
; GFX940-NEXT: v_min_f32_e32 v4, v0, v2
; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX940-NEXT: s_cbranch_execnz .LBB22_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s16
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
; GFX90A-NEXT: v_mov_b32_e32 v3, s6
; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX908-NEXT: s_add_i32 s6, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
; GFX908-NEXT: v_mov_b32_e32 v3, s6
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, v0
; GFX908-NEXT: v_max_f32_e32 v0, v5, v5
; GFX908-NEXT: v_min_f32_e32 v4, v0, v2
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB22_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX8-NEXT: s_add_i32 s6, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v0
; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5
; GFX8-NEXT: v_min_f32_e32 v4, v0, v2
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s20
; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s20
; GFX6-NEXT: buffer_atomic_fmin v0, v1, s[16:19], 0 offen offset:1024 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
  ; Step 256 float elements (256 * 4 = 1024 bytes) past %ptr.
  %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
  ; No syncscope operand here, unlike the syncscope("agent") variant directly
  ; above; %result is the value the atomicrmw returns.
  %result = atomicrmw fmin ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
  ret float %result
}

; Attribute group referenced as #0 on the define lines.
attributes #0 = { nounwind }

; Empty metadata node used as the operand of !amdgpu.no.fine.grained.memory.
!0 = !{}