; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s

; --------------------------------------------------------------------
; float
; --------------------------------------------------------------------

; Returning form, no address offset.
; Issues an agent-scope, seq_cst `atomicrmw fmin` of %val on %ptr, tagged with
; !amdgpu.no.fine.grained.memory, and returns the atomicrmw result.
; Per the generated assertions below, gfx1200, gfx1100, gfx1010 and hawaii
; select a single returning flat fmin atomic, while gfx940, gfx90a, gfx908 and
; tonga use a flat_atomic_cmpswap retry loop around v_min_f32.
; The assertion lines are tool-generated; regenerate them instead of editing.
define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB0_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB0_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4
; GFX8-NEXT: v_min_f32_e32 v3, v3, v2
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB0_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret float %result
}

; Returning form with a positive offset: the atomic address is %ptr advanced
; by 511 floats, which the assertions show as a 2044-byte (0x7fc) offset.
; gfx1200, gfx940, gfx1100, gfx90a and gfx908 fold it into the instruction's
; offset field; gfx1010, tonga and hawaii add 0x7fc to the address first.
define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
; GFX940-NEXT: v_min_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB1_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
; GFX908-NEXT: v_min_f32_e32 v3, v3, v2
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB1_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2
; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6
; GFX8-NEXT: v_min_f32_e32 v5, v0, v1
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB1_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr %ptr, i64 511
  %result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret float %result
}

; Returning form with a negative offset: the atomic address is %ptr moved back
; by 512 floats, shown in the assertions as -2048 bytes.
; Only gfx1200 folds it into the instruction (offset:-2048); every other
; target in the run lines computes the address with explicit adds of
; 0xfffff800 (low half) and -1 (high half) before the atomic.
define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v4, v0
; GFX940-NEXT: v_mov_b32_e32 v5, v1
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
; GFX940-NEXT: s_movk_i32 s0, 0xf800
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
; GFX940-NEXT: flat_load_dword v0, v[0:1]
; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: v_max_f32_e32 v1, v2, v2
; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: v_max_f32_e32 v0, v3, v3
; GFX940-NEXT: v_min_f32_e32 v2, v0, v1
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB2_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
; GFX90A-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3
; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1
; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v0
; GFX908-NEXT: v_max_f32_e32 v0, v6, v6
; GFX908-NEXT: v_min_f32_e32 v5, v0, v1
; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB2_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2
; GFX8-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6
; GFX8-NEXT: v_min_f32_e32 v5, v0, v1
; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr %ptr, i64 -512
  %result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret float %result
}

; Non-returning form, no address offset: the atomicrmw result (%unused) is
; never read and the function returns void.
; In the assertions, the targets that select a native flat fmin atomic emit it
; without the return modifier (no glc / TH_ATOMIC_RETURN); the cmpswap-loop
; targets keep the retry loop but do not copy a result into v0.
define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1]
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB3_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB3_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB3_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2
; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3
; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB3_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %unused = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret void
}

; Non-returning form with a positive offset: the atomic address is %ptr
; advanced by 511 floats (2044 bytes, 0x7fc) and the result is unused.
; gfx1200, gfx940, gfx1100, gfx90a and gfx908 fold the offset into the
; instruction; gfx1010, tonga and hawaii add 0x7fc to the address first.
define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
; GFX940-NEXT: v_min_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB4_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 offset:2044
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_fmin v[0:1], v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4
; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB4_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
; GFX908-NEXT: v_min_f32_e32 v2, v2, v4
; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB4_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2
; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3
; GFX8-NEXT: v_min_f32_e32 v2, v2, v4
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB4_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_atomic_fmin v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr float, ptr %ptr, i64 511
  %unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret void
}

755define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 { 756; GFX12-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: 757; GFX12: ; %bb.0: 758; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 759; GFX12-NEXT: s_wait_expcnt 0x0 760; GFX12-NEXT: s_wait_samplecnt 0x0 761; GFX12-NEXT: s_wait_bvhcnt 0x0 762; GFX12-NEXT: s_wait_kmcnt 0x0 763; GFX12-NEXT: s_wait_storecnt 0x0 764; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV 765; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 766; GFX12-NEXT: global_inv scope:SCOPE_DEV 767; GFX12-NEXT: s_setpc_b64 s[30:31] 768; 769; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: 770; GFX940: ; %bb.0: 771; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 772; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 773; GFX940-NEXT: s_movk_i32 s0, 0xf800 774; GFX940-NEXT: s_nop 0 775; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 776; GFX940-NEXT: flat_load_dword v3, v[4:5] 777; GFX940-NEXT: s_mov_b32 s1, -1 778; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] 779; GFX940-NEXT: s_mov_b64 s[0:1], 0 780; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 781; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start 782; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 783; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 784; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 785; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 786; GFX940-NEXT: buffer_wbl2 sc1 787; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 788; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 789; GFX940-NEXT: buffer_inv sc1 790; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 791; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 792; GFX940-NEXT: 
v_mov_b32_e32 v3, v2 793; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 794; GFX940-NEXT: s_cbranch_execnz .LBB5_1 795; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 796; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 797; GFX940-NEXT: s_setpc_b64 s[30:31] 798; 799; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: 800; GFX11: ; %bb.0: 801; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 802; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 803; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 804; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 805; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 806; GFX11-NEXT: s_waitcnt lgkmcnt(0) 807; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 808; GFX11-NEXT: buffer_gl1_inv 809; GFX11-NEXT: buffer_gl0_inv 810; GFX11-NEXT: s_setpc_b64 s[30:31] 811; 812; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: 813; GFX10: ; %bb.0: 814; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 815; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 816; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 817; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 818; GFX10-NEXT: flat_atomic_fmin v[0:1], v2 819; GFX10-NEXT: s_waitcnt lgkmcnt(0) 820; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 821; GFX10-NEXT: buffer_gl1_inv 822; GFX10-NEXT: buffer_gl0_inv 823; GFX10-NEXT: s_setpc_b64 s[30:31] 824; 825; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: 826; GFX90A: ; %bb.0: 827; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 828; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 829; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 830; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 831; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 832; GFX90A-NEXT: flat_load_dword v1, v[0:1] 833; GFX90A-NEXT: s_mov_b64 s[4:5], 0 834; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 835; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start 836; GFX90A-NEXT: ; 
=>This Inner Loop Header: Depth=1 837; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 838; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 839; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 840; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 841; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 842; GFX90A-NEXT: buffer_wbinvl1 843; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 844; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 845; GFX90A-NEXT: v_mov_b32_e32 v1, v0 846; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 847; GFX90A-NEXT: s_cbranch_execnz .LBB5_1 848; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 849; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 850; GFX90A-NEXT: s_setpc_b64 s[30:31] 851; 852; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: 853; GFX908: ; %bb.0: 854; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 855; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 856; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 857; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 858; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 859; GFX908-NEXT: flat_load_dword v1, v[0:1] 860; GFX908-NEXT: s_mov_b64 s[4:5], 0 861; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 862; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start 863; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 864; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 865; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 866; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 867; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 868; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 869; GFX908-NEXT: buffer_wbinvl1 870; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 871; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 872; GFX908-NEXT: v_mov_b32_e32 v1, v0 873; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 874; GFX908-NEXT: s_cbranch_execnz .LBB5_1 875; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 876; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 877; GFX908-NEXT: s_setpc_b64 s[30:31] 878; 879; GFX8-LABEL: 
flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: 880; GFX8: ; %bb.0: 881; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 882; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 883; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 884; GFX8-NEXT: flat_load_dword v3, v[0:1] 885; GFX8-NEXT: s_mov_b64 s[4:5], 0 886; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 887; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start 888; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 889; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 890; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 891; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 892; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 893; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 894; GFX8-NEXT: buffer_wbinvl1 895; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 896; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 897; GFX8-NEXT: v_mov_b32_e32 v3, v2 898; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 899; GFX8-NEXT: s_cbranch_execnz .LBB5_1 900; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 901; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 902; GFX8-NEXT: s_setpc_b64 s[30:31] 903; 904; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: 905; GFX7: ; %bb.0: 906; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 907; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 908; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 909; GFX7-NEXT: flat_atomic_fmin v[0:1], v2 910; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 911; GFX7-NEXT: buffer_wbinvl1 912; GFX7-NEXT: s_setpc_b64 s[30:31] 913 %gep = getelementptr float, ptr %ptr, i64 -512 914 %unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 915 ret void 916} 917 918define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 { 919; GFX12-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 920; GFX12: ; %bb.0: 921; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 922; 
GFX12-NEXT: s_wait_expcnt 0x0 923; GFX12-NEXT: s_wait_samplecnt 0x0 924; GFX12-NEXT: s_wait_bvhcnt 0x0 925; GFX12-NEXT: s_wait_kmcnt 0x0 926; GFX12-NEXT: global_wb scope:SCOPE_SYS 927; GFX12-NEXT: s_wait_storecnt 0x0 928; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 929; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 930; GFX12-NEXT: global_inv scope:SCOPE_SYS 931; GFX12-NEXT: s_setpc_b64 s[30:31] 932; 933; GFX940-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 934; GFX940: ; %bb.0: 935; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 936; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 937; GFX940-NEXT: s_mov_b64 s[0:1], 0 938; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 939; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start 940; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 941; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 942; GFX940-NEXT: v_mov_b32_e32 v5, v3 943; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 944; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 945; GFX940-NEXT: buffer_wbl2 sc0 sc1 946; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 947; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 948; GFX940-NEXT: buffer_inv sc0 sc1 949; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 950; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 951; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 952; GFX940-NEXT: s_cbranch_execnz .LBB6_1 953; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 954; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 955; GFX940-NEXT: v_mov_b32_e32 v0, v3 956; GFX940-NEXT: s_setpc_b64 s[30:31] 957; 958; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 959; GFX11: ; %bb.0: 960; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 961; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 962; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc 963; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 964; GFX11-NEXT: buffer_gl1_inv 965; GFX11-NEXT: buffer_gl0_inv 966; 
GFX11-NEXT: s_setpc_b64 s[30:31] 967; 968; GFX10-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 969; GFX10: ; %bb.0: 970; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 971; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 972; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 973; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 974; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc 975; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 976; GFX10-NEXT: buffer_gl1_inv 977; GFX10-NEXT: buffer_gl0_inv 978; GFX10-NEXT: s_setpc_b64 s[30:31] 979; 980; GFX90A-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 981; GFX90A: ; %bb.0: 982; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 983; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 984; GFX90A-NEXT: s_mov_b64 s[4:5], 0 985; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 986; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start 987; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 988; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 989; GFX90A-NEXT: v_mov_b32_e32 v5, v3 990; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 991; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 992; GFX90A-NEXT: buffer_wbl2 993; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc 994; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 995; GFX90A-NEXT: buffer_invl2 996; GFX90A-NEXT: buffer_wbinvl1 997; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 998; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 999; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1000; GFX90A-NEXT: s_cbranch_execnz .LBB6_1 1001; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1002; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1003; GFX90A-NEXT: v_mov_b32_e32 v0, v3 1004; GFX90A-NEXT: s_setpc_b64 s[30:31] 1005; 1006; GFX908-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 1007; GFX908: ; %bb.0: 1008; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1009; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 1010; 
GFX908-NEXT: s_mov_b64 s[4:5], 0 1011; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 1012; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start 1013; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1014; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1015; GFX908-NEXT: v_mov_b32_e32 v4, v3 1016; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 1017; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 1018; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 1019; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1020; GFX908-NEXT: buffer_wbinvl1 1021; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1022; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1023; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1024; GFX908-NEXT: s_cbranch_execnz .LBB6_1 1025; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1026; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1027; GFX908-NEXT: v_mov_b32_e32 v0, v3 1028; GFX908-NEXT: s_setpc_b64 s[30:31] 1029; 1030; GFX8-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 1031; GFX8: ; %bb.0: 1032; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1033; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 1034; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 1035; GFX8-NEXT: flat_load_dword v0, v[3:4] 1036; GFX8-NEXT: s_mov_b64 s[4:5], 0 1037; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 1038; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start 1039; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1040; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1041; GFX8-NEXT: v_mov_b32_e32 v6, v0 1042; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 1043; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 1044; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 1045; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1046; GFX8-NEXT: buffer_wbinvl1 1047; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 1048; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1049; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1050; GFX8-NEXT: s_cbranch_execnz .LBB6_1 1051; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1052; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1053; GFX8-NEXT: s_setpc_b64 s[30:31] 
1054; 1055; GFX7-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 1056; GFX7: ; %bb.0: 1057; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1058; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 1059; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1060; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc 1061; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1062; GFX7-NEXT: buffer_wbinvl1 1063; GFX7-NEXT: s_setpc_b64 s[30:31] 1064 %gep = getelementptr float, ptr %ptr, i64 511 1065 %result = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 1066 ret float %result 1067} 1068 1069define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 { 1070; GFX12-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 1071; GFX12: ; %bb.0: 1072; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1073; GFX12-NEXT: s_wait_expcnt 0x0 1074; GFX12-NEXT: s_wait_samplecnt 0x0 1075; GFX12-NEXT: s_wait_bvhcnt 0x0 1076; GFX12-NEXT: s_wait_kmcnt 0x0 1077; GFX12-NEXT: global_wb scope:SCOPE_SYS 1078; GFX12-NEXT: s_wait_storecnt 0x0 1079; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS 1080; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 1081; GFX12-NEXT: global_inv scope:SCOPE_SYS 1082; GFX12-NEXT: s_setpc_b64 s[30:31] 1083; 1084; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 1085; GFX940: ; %bb.0: 1086; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1087; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 1088; GFX940-NEXT: s_mov_b64 s[0:1], 0 1089; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 1090; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start 1091; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 1092; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1093; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 1094; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 1095; GFX940-NEXT: buffer_wbl2 sc0 sc1 1096; GFX940-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 1097; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1098; GFX940-NEXT: buffer_inv sc0 sc1 1099; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 1100; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1101; GFX940-NEXT: v_mov_b32_e32 v3, v2 1102; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 1103; GFX940-NEXT: s_cbranch_execnz .LBB7_1 1104; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 1105; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 1106; GFX940-NEXT: s_setpc_b64 s[30:31] 1107; 1108; GFX11-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 1109; GFX11: ; %bb.0: 1110; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1111; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1112; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 offset:2044 1113; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1114; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1115; GFX11-NEXT: buffer_gl1_inv 1116; GFX11-NEXT: buffer_gl0_inv 1117; GFX11-NEXT: s_setpc_b64 s[30:31] 1118; 1119; GFX10-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 1120; GFX10: ; %bb.0: 1121; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1122; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 1123; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1124; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1125; GFX10-NEXT: flat_atomic_fmin v[0:1], v2 1126; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1127; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1128; GFX10-NEXT: buffer_gl1_inv 1129; GFX10-NEXT: buffer_gl0_inv 1130; GFX10-NEXT: s_setpc_b64 s[30:31] 1131; 1132; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 1133; GFX90A: ; %bb.0: 1134; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1135; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 1136; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1137; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 1138; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start 1139; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 
1140; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1141; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 1142; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 1143; GFX90A-NEXT: buffer_wbl2 1144; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 1145; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1146; GFX90A-NEXT: buffer_invl2 1147; GFX90A-NEXT: buffer_wbinvl1 1148; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 1149; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1150; GFX90A-NEXT: v_mov_b32_e32 v3, v2 1151; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1152; GFX90A-NEXT: s_cbranch_execnz .LBB7_1 1153; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1154; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1155; GFX90A-NEXT: s_setpc_b64 s[30:31] 1156; 1157; GFX908-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 1158; GFX908: ; %bb.0: 1159; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1160; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 1161; GFX908-NEXT: s_mov_b64 s[4:5], 0 1162; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 1163; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start 1164; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1165; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1166; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 1167; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 1168; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 1169; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1170; GFX908-NEXT: buffer_wbinvl1 1171; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 1172; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1173; GFX908-NEXT: v_mov_b32_e32 v3, v2 1174; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1175; GFX908-NEXT: s_cbranch_execnz .LBB7_1 1176; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1177; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1178; GFX908-NEXT: s_setpc_b64 s[30:31] 1179; 1180; GFX8-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 1181; GFX8: ; %bb.0: 1182; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1183; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 1184; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1185; GFX8-NEXT: flat_load_dword v3, v[0:1] 1186; GFX8-NEXT: s_mov_b64 s[4:5], 0 1187; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 1188; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start 1189; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1190; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1191; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 1192; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 1193; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 1194; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1195; GFX8-NEXT: buffer_wbinvl1 1196; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 1197; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1198; GFX8-NEXT: v_mov_b32_e32 v3, v2 1199; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1200; GFX8-NEXT: s_cbranch_execnz .LBB7_1 1201; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1202; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1203; GFX8-NEXT: s_setpc_b64 s[30:31] 1204; 1205; GFX7-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: 1206; GFX7: ; %bb.0: 1207; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1208; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 1209; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1210; GFX7-NEXT: flat_atomic_fmin v[0:1], v2 1211; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1212; GFX7-NEXT: buffer_wbinvl1 1213; GFX7-NEXT: s_setpc_b64 s[30:31] 1214 %gep = getelementptr float, ptr %ptr, i64 511 1215 %unused = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 1216 ret void 1217} 1218 1219define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory(ptr %ptr, float %val) #0 { 1220; GFX12-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: 1221; GFX12: ; %bb.0: 1222; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1223; GFX12-NEXT: s_wait_expcnt 0x0 1224; GFX12-NEXT: s_wait_samplecnt 0x0 1225; GFX12-NEXT: s_wait_bvhcnt 0x0 1226; GFX12-NEXT: s_wait_kmcnt 0x0 1227; GFX12-NEXT: s_wait_storecnt 0x0 1228; GFX12-NEXT: 
flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 1229; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1230; GFX12-NEXT: global_inv scope:SCOPE_DEV 1231; GFX12-NEXT: s_setpc_b64 s[30:31] 1232; 1233; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: 1234; GFX940: ; %bb.0: 1235; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1236; GFX940-NEXT: flat_load_dword v3, v[0:1] 1237; GFX940-NEXT: s_mov_b64 s[0:1], 0 1238; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 1239; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start 1240; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 1241; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1242; GFX940-NEXT: v_mov_b32_e32 v5, v3 1243; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 1244; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 1245; GFX940-NEXT: buffer_wbl2 sc1 1246; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 1247; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1248; GFX940-NEXT: buffer_inv sc1 1249; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1250; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1251; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 1252; GFX940-NEXT: s_cbranch_execnz .LBB8_1 1253; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 1254; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 1255; GFX940-NEXT: v_mov_b32_e32 v0, v3 1256; GFX940-NEXT: s_setpc_b64 s[30:31] 1257; 1258; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: 1259; GFX11: ; %bb.0: 1260; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1261; GFX11-NEXT: flat_load_b32 v3, v[0:1] 1262; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 1263; GFX11-NEXT: s_mov_b32 s0, 0 1264; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start 1265; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1266; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1267; GFX11-NEXT: v_mov_b32_e32 v4, v3 1268; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1269; GFX11-NEXT: v_max_f32_e32 v3, v4, v4 1270; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 1271; GFX11-NEXT: s_waitcnt_vscnt 
null, 0x0 1272; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 1273; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1274; GFX11-NEXT: buffer_gl1_inv 1275; GFX11-NEXT: buffer_gl0_inv 1276; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 1277; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 1278; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1279; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 1280; GFX11-NEXT: s_cbranch_execnz .LBB8_1 1281; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1282; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 1283; GFX11-NEXT: v_mov_b32_e32 v0, v3 1284; GFX11-NEXT: s_setpc_b64 s[30:31] 1285; 1286; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: 1287; GFX10: ; %bb.0: 1288; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1289; GFX10-NEXT: flat_load_dword v3, v[0:1] 1290; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 1291; GFX10-NEXT: s_mov_b32 s4, 0 1292; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start 1293; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 1294; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1295; GFX10-NEXT: v_mov_b32_e32 v4, v3 1296; GFX10-NEXT: v_max_f32_e32 v3, v4, v4 1297; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 1298; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1299; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1300; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1301; GFX10-NEXT: buffer_gl1_inv 1302; GFX10-NEXT: buffer_gl0_inv 1303; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 1304; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 1305; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 1306; GFX10-NEXT: s_cbranch_execnz .LBB8_1 1307; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 1308; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 1309; GFX10-NEXT: v_mov_b32_e32 v0, v3 1310; GFX10-NEXT: s_setpc_b64 s[30:31] 1311; 1312; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: 1313; GFX90A: ; %bb.0: 1314; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1315; GFX90A-NEXT: flat_load_dword v3, v[0:1] 1316; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1317; 
GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 1318; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start 1319; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1320; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1321; GFX90A-NEXT: v_mov_b32_e32 v5, v3 1322; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 1323; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 1324; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 1325; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1326; GFX90A-NEXT: buffer_wbinvl1 1327; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1328; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1329; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1330; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 1331; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1332; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1333; GFX90A-NEXT: v_mov_b32_e32 v0, v3 1334; GFX90A-NEXT: s_setpc_b64 s[30:31] 1335; 1336; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: 1337; GFX908: ; %bb.0: 1338; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1339; GFX908-NEXT: flat_load_dword v3, v[0:1] 1340; GFX908-NEXT: s_mov_b64 s[4:5], 0 1341; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 1342; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start 1343; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1344; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1345; GFX908-NEXT: v_mov_b32_e32 v4, v3 1346; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 1347; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 1348; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1349; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1350; GFX908-NEXT: buffer_wbinvl1 1351; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1352; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1353; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1354; GFX908-NEXT: s_cbranch_execnz .LBB8_1 1355; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1356; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1357; GFX908-NEXT: v_mov_b32_e32 v0, v3 1358; GFX908-NEXT: s_setpc_b64 s[30:31] 1359; 1360; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: 1361; GFX8: ; 
%bb.0: 1362; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1363; GFX8-NEXT: flat_load_dword v3, v[0:1] 1364; GFX8-NEXT: s_mov_b64 s[4:5], 0 1365; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 1366; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start 1367; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1368; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1369; GFX8-NEXT: v_mov_b32_e32 v4, v3 1370; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 1371; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 1372; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1373; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1374; GFX8-NEXT: buffer_wbinvl1 1375; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1376; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1377; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1378; GFX8-NEXT: s_cbranch_execnz .LBB8_1 1379; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1380; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1381; GFX8-NEXT: v_mov_b32_e32 v0, v3 1382; GFX8-NEXT: s_setpc_b64 s[30:31] 1383; 1384; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_remote_memory: 1385; GFX7: ; %bb.0: 1386; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1387; GFX7-NEXT: flat_load_dword v3, v[0:1] 1388; GFX7-NEXT: s_mov_b64 s[4:5], 0 1389; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 1390; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start 1391; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 1392; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1393; GFX7-NEXT: v_mov_b32_e32 v4, v3 1394; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v4 1395; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 1396; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1397; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1398; GFX7-NEXT: buffer_wbinvl1 1399; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1400; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1401; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 1402; GFX7-NEXT: s_cbranch_execnz .LBB8_1 1403; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 1404; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 1405; GFX7-NEXT: v_mov_b32_e32 v0, v3 1406; GFX7-NEXT: s_setpc_b64 s[30:31] 1407 %result = 
atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 1408 ret float %result 1409} 1410 1411define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr %ptr, float %val) #0 { 1412; GFX12-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1413; GFX12: ; %bb.0: 1414; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1415; GFX12-NEXT: s_wait_expcnt 0x0 1416; GFX12-NEXT: s_wait_samplecnt 0x0 1417; GFX12-NEXT: s_wait_bvhcnt 0x0 1418; GFX12-NEXT: s_wait_kmcnt 0x0 1419; GFX12-NEXT: s_wait_storecnt 0x0 1420; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 1421; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1422; GFX12-NEXT: global_inv scope:SCOPE_DEV 1423; GFX12-NEXT: s_setpc_b64 s[30:31] 1424; 1425; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1426; GFX940: ; %bb.0: 1427; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1428; GFX940-NEXT: flat_load_dword v3, v[0:1] 1429; GFX940-NEXT: s_mov_b64 s[0:1], 0 1430; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 1431; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start 1432; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 1433; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1434; GFX940-NEXT: v_mov_b32_e32 v5, v3 1435; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 1436; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 1437; GFX940-NEXT: buffer_wbl2 sc1 1438; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 1439; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1440; GFX940-NEXT: buffer_inv sc1 1441; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1442; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1443; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 1444; GFX940-NEXT: s_cbranch_execnz .LBB9_1 1445; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 1446; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 1447; GFX940-NEXT: v_mov_b32_e32 v0, v3 1448; GFX940-NEXT: s_setpc_b64 s[30:31] 1449; 1450; 
GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1451; GFX11: ; %bb.0: 1452; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1453; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1454; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc 1455; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1456; GFX11-NEXT: buffer_gl1_inv 1457; GFX11-NEXT: buffer_gl0_inv 1458; GFX11-NEXT: s_setpc_b64 s[30:31] 1459; 1460; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1461; GFX10: ; %bb.0: 1462; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1463; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1464; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc 1465; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1466; GFX10-NEXT: buffer_gl1_inv 1467; GFX10-NEXT: buffer_gl0_inv 1468; GFX10-NEXT: s_setpc_b64 s[30:31] 1469; 1470; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1471; GFX90A: ; %bb.0: 1472; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1473; GFX90A-NEXT: flat_load_dword v3, v[0:1] 1474; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1475; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 1476; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start 1477; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1478; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1479; GFX90A-NEXT: v_mov_b32_e32 v5, v3 1480; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 1481; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 1482; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 1483; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1484; GFX90A-NEXT: buffer_wbinvl1 1485; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1486; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1487; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1488; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 1489; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1490; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1491; GFX90A-NEXT: v_mov_b32_e32 v0, v3 1492; GFX90A-NEXT: s_setpc_b64 s[30:31] 1493; 
1494; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1495; GFX908: ; %bb.0: 1496; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1497; GFX908-NEXT: flat_load_dword v3, v[0:1] 1498; GFX908-NEXT: s_mov_b64 s[4:5], 0 1499; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 1500; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start 1501; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1502; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1503; GFX908-NEXT: v_mov_b32_e32 v4, v3 1504; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 1505; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 1506; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1507; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1508; GFX908-NEXT: buffer_wbinvl1 1509; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1510; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1511; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1512; GFX908-NEXT: s_cbranch_execnz .LBB9_1 1513; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1514; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1515; GFX908-NEXT: v_mov_b32_e32 v0, v3 1516; GFX908-NEXT: s_setpc_b64 s[30:31] 1517; 1518; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1519; GFX8: ; %bb.0: 1520; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1521; GFX8-NEXT: flat_load_dword v3, v[0:1] 1522; GFX8-NEXT: s_mov_b64 s[4:5], 0 1523; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 1524; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start 1525; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1526; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1527; GFX8-NEXT: v_mov_b32_e32 v4, v3 1528; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 1529; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 1530; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1531; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1532; GFX8-NEXT: buffer_wbinvl1 1533; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1534; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1535; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1536; GFX8-NEXT: s_cbranch_execnz 
.LBB9_1 1537; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1538; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1539; GFX8-NEXT: v_mov_b32_e32 v0, v3 1540; GFX8-NEXT: s_setpc_b64 s[30:31] 1541; 1542; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 1543; GFX7: ; %bb.0: 1544; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1545; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc 1546; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1547; GFX7-NEXT: buffer_wbinvl1 1548; GFX7-NEXT: s_setpc_b64 s[30:31] 1549 %result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 1550 ret float %result 1551} 1552 1553; -------------------------------------------------------------------- 1554; float with ftz/daz 1555; -------------------------------------------------------------------- 1556 1557define float @flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 { 1558; GFX12-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: 1559; GFX12: ; %bb.0: 1560; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1561; GFX12-NEXT: s_wait_expcnt 0x0 1562; GFX12-NEXT: s_wait_samplecnt 0x0 1563; GFX12-NEXT: s_wait_bvhcnt 0x0 1564; GFX12-NEXT: s_wait_kmcnt 0x0 1565; GFX12-NEXT: s_wait_storecnt 0x0 1566; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 1567; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1568; GFX12-NEXT: global_inv scope:SCOPE_DEV 1569; GFX12-NEXT: s_setpc_b64 s[30:31] 1570; 1571; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: 1572; GFX940: ; %bb.0: 1573; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1574; GFX940-NEXT: flat_load_dword v3, v[0:1] 1575; GFX940-NEXT: s_mov_b64 s[0:1], 0 1576; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 1577; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start 1578; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 1579; GFX940-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 1580; GFX940-NEXT: v_mov_b32_e32 v5, v3 1581; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 1582; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 1583; GFX940-NEXT: buffer_wbl2 sc1 1584; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 1585; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1586; GFX940-NEXT: buffer_inv sc1 1587; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1588; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1589; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 1590; GFX940-NEXT: s_cbranch_execnz .LBB10_1 1591; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 1592; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 1593; GFX940-NEXT: v_mov_b32_e32 v0, v3 1594; GFX940-NEXT: s_setpc_b64 s[30:31] 1595; 1596; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: 1597; GFX11: ; %bb.0: 1598; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1599; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1600; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc 1601; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1602; GFX11-NEXT: buffer_gl1_inv 1603; GFX11-NEXT: buffer_gl0_inv 1604; GFX11-NEXT: s_setpc_b64 s[30:31] 1605; 1606; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: 1607; GFX10: ; %bb.0: 1608; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1609; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1610; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc 1611; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1612; GFX10-NEXT: buffer_gl1_inv 1613; GFX10-NEXT: buffer_gl0_inv 1614; GFX10-NEXT: s_setpc_b64 s[30:31] 1615; 1616; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: 1617; GFX90A: ; %bb.0: 1618; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1619; GFX90A-NEXT: flat_load_dword v3, v[0:1] 1620; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1621; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 1622; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start 1623; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1624; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
1625; GFX90A-NEXT: v_mov_b32_e32 v5, v3 1626; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 1627; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 1628; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 1629; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1630; GFX90A-NEXT: buffer_wbinvl1 1631; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1632; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1633; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1634; GFX90A-NEXT: s_cbranch_execnz .LBB10_1 1635; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1636; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1637; GFX90A-NEXT: v_mov_b32_e32 v0, v3 1638; GFX90A-NEXT: s_setpc_b64 s[30:31] 1639; 1640; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: 1641; GFX908: ; %bb.0: 1642; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1643; GFX908-NEXT: flat_load_dword v3, v[0:1] 1644; GFX908-NEXT: s_mov_b64 s[4:5], 0 1645; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 1646; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start 1647; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1648; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1649; GFX908-NEXT: v_mov_b32_e32 v4, v3 1650; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 1651; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 1652; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1653; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1654; GFX908-NEXT: buffer_wbinvl1 1655; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1656; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1657; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1658; GFX908-NEXT: s_cbranch_execnz .LBB10_1 1659; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1660; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1661; GFX908-NEXT: v_mov_b32_e32 v0, v3 1662; GFX908-NEXT: s_setpc_b64 s[30:31] 1663; 1664; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: 1665; GFX8: ; %bb.0: 1666; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1667; GFX8-NEXT: flat_load_dword v3, v[0:1] 1668; GFX8-NEXT: s_mov_b64 s[4:5], 0 1669; GFX8-NEXT: 
v_mul_f32_e32 v2, 1.0, v2 1670; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start 1671; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1672; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1673; GFX8-NEXT: v_mov_b32_e32 v4, v3 1674; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 1675; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 1676; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1677; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1678; GFX8-NEXT: buffer_wbinvl1 1679; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1680; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1681; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1682; GFX8-NEXT: s_cbranch_execnz .LBB10_1 1683; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1684; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1685; GFX8-NEXT: v_mov_b32_e32 v0, v3 1686; GFX8-NEXT: s_setpc_b64 s[30:31] 1687; 1688; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__ftz__amdgpu_no_fine_grained_memory: 1689; GFX7: ; %bb.0: 1690; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1691; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc 1692; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1693; GFX7-NEXT: buffer_wbinvl1 1694; GFX7-NEXT: s_setpc_b64 s[30:31] 1695 %result = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 1696 ret float %result 1697} 1698 1699define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 { 1700; GFX12-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 1701; GFX12: ; %bb.0: 1702; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1703; GFX12-NEXT: s_wait_expcnt 0x0 1704; GFX12-NEXT: s_wait_samplecnt 0x0 1705; GFX12-NEXT: s_wait_bvhcnt 0x0 1706; GFX12-NEXT: s_wait_kmcnt 0x0 1707; GFX12-NEXT: s_wait_storecnt 0x0 1708; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 1709; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1710; GFX12-NEXT: global_inv scope:SCOPE_DEV 1711; GFX12-NEXT: s_setpc_b64 s[30:31] 1712; 1713; 
GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 1714; GFX940: ; %bb.0: 1715; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1716; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 1717; GFX940-NEXT: s_mov_b64 s[0:1], 0 1718; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 1719; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start 1720; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 1721; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1722; GFX940-NEXT: v_mov_b32_e32 v5, v3 1723; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 1724; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 1725; GFX940-NEXT: buffer_wbl2 sc1 1726; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 1727; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1728; GFX940-NEXT: buffer_inv sc1 1729; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1730; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1731; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 1732; GFX940-NEXT: s_cbranch_execnz .LBB11_1 1733; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 1734; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 1735; GFX940-NEXT: v_mov_b32_e32 v0, v3 1736; GFX940-NEXT: s_setpc_b64 s[30:31] 1737; 1738; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 1739; GFX11: ; %bb.0: 1740; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1741; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1742; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc 1743; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1744; GFX11-NEXT: buffer_gl1_inv 1745; GFX11-NEXT: buffer_gl0_inv 1746; GFX11-NEXT: s_setpc_b64 s[30:31] 1747; 1748; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 1749; GFX10: ; %bb.0: 1750; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1751; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 1752; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1753; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1754; GFX10-NEXT: flat_atomic_fmin v0, 
v[0:1], v2 glc 1755; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1756; GFX10-NEXT: buffer_gl1_inv 1757; GFX10-NEXT: buffer_gl0_inv 1758; GFX10-NEXT: s_setpc_b64 s[30:31] 1759; 1760; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 1761; GFX90A: ; %bb.0: 1762; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1763; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 1764; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1765; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 1766; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start 1767; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1768; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1769; GFX90A-NEXT: v_mov_b32_e32 v5, v3 1770; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 1771; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 1772; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc 1773; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1774; GFX90A-NEXT: buffer_wbinvl1 1775; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1776; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1777; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1778; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 1779; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1780; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1781; GFX90A-NEXT: v_mov_b32_e32 v0, v3 1782; GFX90A-NEXT: s_setpc_b64 s[30:31] 1783; 1784; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 1785; GFX908: ; %bb.0: 1786; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1787; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 1788; GFX908-NEXT: s_mov_b64 s[4:5], 0 1789; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 1790; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start 1791; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1792; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1793; GFX908-NEXT: v_mov_b32_e32 v4, v3 1794; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 1795; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 1796; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 1797; 
GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1798; GFX908-NEXT: buffer_wbinvl1 1799; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1800; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1801; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1802; GFX908-NEXT: s_cbranch_execnz .LBB11_1 1803; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1804; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1805; GFX908-NEXT: v_mov_b32_e32 v0, v3 1806; GFX908-NEXT: s_setpc_b64 s[30:31] 1807; 1808; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 1809; GFX8: ; %bb.0: 1810; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1811; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 1812; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 1813; GFX8-NEXT: flat_load_dword v0, v[3:4] 1814; GFX8-NEXT: s_mov_b64 s[4:5], 0 1815; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 1816; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start 1817; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1818; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1819; GFX8-NEXT: v_mov_b32_e32 v6, v0 1820; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 1821; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 1822; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 1823; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1824; GFX8-NEXT: buffer_wbinvl1 1825; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 1826; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1827; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1828; GFX8-NEXT: s_cbranch_execnz .LBB11_1 1829; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1830; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1831; GFX8-NEXT: s_setpc_b64 s[30:31] 1832; 1833; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 1834; GFX7: ; %bb.0: 1835; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1836; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 1837; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1838; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc 1839; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1840; GFX7-NEXT: buffer_wbinvl1 1841; 
GFX7-NEXT: s_setpc_b64 s[30:31] 1842 %gep = getelementptr float, ptr %ptr, i64 511 1843 %result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 1844 ret float %result 1845} 1846 1847define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 { 1848; GFX12-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 1849; GFX12: ; %bb.0: 1850; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1851; GFX12-NEXT: s_wait_expcnt 0x0 1852; GFX12-NEXT: s_wait_samplecnt 0x0 1853; GFX12-NEXT: s_wait_bvhcnt 0x0 1854; GFX12-NEXT: s_wait_kmcnt 0x0 1855; GFX12-NEXT: s_wait_storecnt 0x0 1856; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 1857; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1858; GFX12-NEXT: global_inv scope:SCOPE_DEV 1859; GFX12-NEXT: s_setpc_b64 s[30:31] 1860; 1861; GFX940-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 1862; GFX940: ; %bb.0: 1863; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1864; GFX940-NEXT: v_mov_b32_e32 v4, v0 1865; GFX940-NEXT: v_mov_b32_e32 v5, v1 1866; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 1867; GFX940-NEXT: s_movk_i32 s0, 0xf800 1868; GFX940-NEXT: s_nop 0 1869; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc 1870; GFX940-NEXT: flat_load_dword v0, v[0:1] 1871; GFX940-NEXT: s_mov_b32 s1, -1 1872; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] 1873; GFX940-NEXT: s_mov_b64 s[0:1], 0 1874; GFX940-NEXT: v_max_f32_e32 v1, v2, v2 1875; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start 1876; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 1877; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1878; GFX940-NEXT: v_mov_b32_e32 v3, v0 1879; GFX940-NEXT: v_max_f32_e32 v0, v3, v3 1880; GFX940-NEXT: v_min_f32_e32 v2, v0, v1 1881; GFX940-NEXT: buffer_wbl2 sc1 1882; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 
1883; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1884; GFX940-NEXT: buffer_inv sc1 1885; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 1886; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1887; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 1888; GFX940-NEXT: s_cbranch_execnz .LBB12_1 1889; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 1890; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 1891; GFX940-NEXT: s_setpc_b64 s[30:31] 1892; 1893; GFX11-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 1894; GFX11: ; %bb.0: 1895; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1896; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 1897; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 1898; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1899; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 glc 1900; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1901; GFX11-NEXT: buffer_gl1_inv 1902; GFX11-NEXT: buffer_gl0_inv 1903; GFX11-NEXT: s_setpc_b64 s[30:31] 1904; 1905; GFX10-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 1906; GFX10: ; %bb.0: 1907; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1908; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 1909; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 1910; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1911; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc 1912; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1913; GFX10-NEXT: buffer_gl1_inv 1914; GFX10-NEXT: buffer_gl0_inv 1915; GFX10-NEXT: s_setpc_b64 s[30:31] 1916; 1917; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 1918; GFX90A: ; %bb.0: 1919; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1920; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 1921; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 1922; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 1923; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 1924; GFX90A-NEXT: flat_load_dword v0, 
v[0:1] 1925; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1926; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2 1927; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start 1928; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1929; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1930; GFX90A-NEXT: v_mov_b32_e32 v3, v0 1931; GFX90A-NEXT: v_max_f32_e32 v0, v3, v3 1932; GFX90A-NEXT: v_min_f32_e32 v2, v0, v1 1933; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc 1934; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1935; GFX90A-NEXT: buffer_wbinvl1 1936; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 1937; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1938; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1939; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 1940; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1941; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1942; GFX90A-NEXT: s_setpc_b64 s[30:31] 1943; 1944; GFX908-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 1945; GFX908: ; %bb.0: 1946; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1947; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 1948; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 1949; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 1950; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 1951; GFX908-NEXT: flat_load_dword v0, v[0:1] 1952; GFX908-NEXT: s_mov_b64 s[4:5], 0 1953; GFX908-NEXT: v_max_f32_e32 v1, v2, v2 1954; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start 1955; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1956; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1957; GFX908-NEXT: v_mov_b32_e32 v6, v0 1958; GFX908-NEXT: v_max_f32_e32 v0, v6, v6 1959; GFX908-NEXT: v_min_f32_e32 v5, v0, v1 1960; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 1961; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1962; GFX908-NEXT: buffer_wbinvl1 1963; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 1964; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1965; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1966; GFX908-NEXT: 
s_cbranch_execnz .LBB12_1 1967; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1968; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1969; GFX908-NEXT: s_setpc_b64 s[30:31] 1970; 1971; GFX8-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 1972; GFX8: ; %bb.0: 1973; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1974; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 1975; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc 1976; GFX8-NEXT: flat_load_dword v0, v[3:4] 1977; GFX8-NEXT: s_mov_b64 s[4:5], 0 1978; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 1979; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start 1980; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1981; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1982; GFX8-NEXT: v_mov_b32_e32 v6, v0 1983; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 1984; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 1985; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 1986; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1987; GFX8-NEXT: buffer_wbinvl1 1988; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 1989; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1990; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1991; GFX8-NEXT: s_cbranch_execnz .LBB12_1 1992; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1993; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1994; GFX8-NEXT: s_setpc_b64 s[30:31] 1995; 1996; GFX7-LABEL: flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 1997; GFX7: ; %bb.0: 1998; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1999; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 2000; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 2001; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc 2002; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2003; GFX7-NEXT: buffer_wbinvl1 2004; GFX7-NEXT: s_setpc_b64 s[30:31] 2005 %gep = getelementptr float, ptr %ptr, i64 -512 2006 %result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 2007 ret float %result 2008} 2009 2010define void 
@flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 { 2011; GFX12-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: 2012; GFX12: ; %bb.0: 2013; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2014; GFX12-NEXT: s_wait_expcnt 0x0 2015; GFX12-NEXT: s_wait_samplecnt 0x0 2016; GFX12-NEXT: s_wait_bvhcnt 0x0 2017; GFX12-NEXT: s_wait_kmcnt 0x0 2018; GFX12-NEXT: s_wait_storecnt 0x0 2019; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV 2020; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 2021; GFX12-NEXT: global_inv scope:SCOPE_DEV 2022; GFX12-NEXT: s_setpc_b64 s[30:31] 2023; 2024; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: 2025; GFX940: ; %bb.0: 2026; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2027; GFX940-NEXT: flat_load_dword v3, v[0:1] 2028; GFX940-NEXT: s_mov_b64 s[0:1], 0 2029; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 2030; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start 2031; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 2032; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2033; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 2034; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 2035; GFX940-NEXT: buffer_wbl2 sc1 2036; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 2037; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2038; GFX940-NEXT: buffer_inv sc1 2039; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2040; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2041; GFX940-NEXT: v_mov_b32_e32 v3, v2 2042; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 2043; GFX940-NEXT: s_cbranch_execnz .LBB13_1 2044; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 2045; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 2046; GFX940-NEXT: s_setpc_b64 s[30:31] 2047; 2048; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: 2049; GFX11: ; %bb.0: 2050; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2051; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2052; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 2053; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) 2054; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2055; GFX11-NEXT: buffer_gl1_inv 2056; GFX11-NEXT: buffer_gl0_inv 2057; GFX11-NEXT: s_setpc_b64 s[30:31] 2058; 2059; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: 2060; GFX10: ; %bb.0: 2061; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2062; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2063; GFX10-NEXT: flat_atomic_fmin v[0:1], v2 2064; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2065; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2066; GFX10-NEXT: buffer_gl1_inv 2067; GFX10-NEXT: buffer_gl0_inv 2068; GFX10-NEXT: s_setpc_b64 s[30:31] 2069; 2070; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: 2071; GFX90A: ; %bb.0: 2072; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2073; GFX90A-NEXT: flat_load_dword v3, v[0:1] 2074; GFX90A-NEXT: s_mov_b64 s[4:5], 0 2075; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 2076; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start 2077; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 2078; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2079; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 2080; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 2081; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2082; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2083; GFX90A-NEXT: buffer_wbinvl1 2084; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2085; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2086; GFX90A-NEXT: v_mov_b32_e32 v3, v2 2087; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 2088; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 2089; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 2090; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2091; GFX90A-NEXT: s_setpc_b64 s[30:31] 2092; 2093; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: 2094; GFX908: ; %bb.0: 2095; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2096; GFX908-NEXT: flat_load_dword v3, v[0:1] 2097; GFX908-NEXT: s_mov_b64 s[4:5], 0 2098; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 
2099; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start 2100; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2101; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2102; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 2103; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 2104; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2105; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2106; GFX908-NEXT: buffer_wbinvl1 2107; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2108; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2109; GFX908-NEXT: v_mov_b32_e32 v3, v2 2110; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2111; GFX908-NEXT: s_cbranch_execnz .LBB13_1 2112; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2113; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2114; GFX908-NEXT: s_setpc_b64 s[30:31] 2115; 2116; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: 2117; GFX8: ; %bb.0: 2118; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2119; GFX8-NEXT: flat_load_dword v3, v[0:1] 2120; GFX8-NEXT: s_mov_b64 s[4:5], 0 2121; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 2122; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start 2123; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2124; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2125; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 2126; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 2127; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2128; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2129; GFX8-NEXT: buffer_wbinvl1 2130; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2131; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2132; GFX8-NEXT: v_mov_b32_e32 v3, v2 2133; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2134; GFX8-NEXT: s_cbranch_execnz .LBB13_1 2135; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2136; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2137; GFX8-NEXT: s_setpc_b64 s[30:31] 2138; 2139; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__ftz__amdgpu_no_fine_grained_memory: 2140; GFX7: ; %bb.0: 2141; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2142; GFX7-NEXT: flat_atomic_fmin v[0:1], v2 2143; GFX7-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 2144; GFX7-NEXT: buffer_wbinvl1 2145; GFX7-NEXT: s_setpc_b64 s[30:31] 2146 %unused = atomicrmw fmin ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 2147 ret void 2148} 2149 2150define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 { 2151; GFX12-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2152; GFX12: ; %bb.0: 2153; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2154; GFX12-NEXT: s_wait_expcnt 0x0 2155; GFX12-NEXT: s_wait_samplecnt 0x0 2156; GFX12-NEXT: s_wait_bvhcnt 0x0 2157; GFX12-NEXT: s_wait_kmcnt 0x0 2158; GFX12-NEXT: s_wait_storecnt 0x0 2159; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_DEV 2160; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 2161; GFX12-NEXT: global_inv scope:SCOPE_DEV 2162; GFX12-NEXT: s_setpc_b64 s[30:31] 2163; 2164; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2165; GFX940: ; %bb.0: 2166; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2167; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 2168; GFX940-NEXT: s_mov_b64 s[0:1], 0 2169; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 2170; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start 2171; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 2172; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2173; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 2174; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 2175; GFX940-NEXT: buffer_wbl2 sc1 2176; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 2177; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2178; GFX940-NEXT: buffer_inv sc1 2179; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2180; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2181; GFX940-NEXT: v_mov_b32_e32 v3, v2 2182; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 2183; GFX940-NEXT: s_cbranch_execnz .LBB14_1 2184; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 2185; GFX940-NEXT: s_or_b64 
exec, exec, s[0:1] 2186; GFX940-NEXT: s_setpc_b64 s[30:31] 2187; 2188; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2189; GFX11: ; %bb.0: 2190; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2191; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2192; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 offset:2044 2193; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2194; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2195; GFX11-NEXT: buffer_gl1_inv 2196; GFX11-NEXT: buffer_gl0_inv 2197; GFX11-NEXT: s_setpc_b64 s[30:31] 2198; 2199; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2200; GFX10: ; %bb.0: 2201; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2202; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 2203; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 2204; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2205; GFX10-NEXT: flat_atomic_fmin v[0:1], v2 2206; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2207; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2208; GFX10-NEXT: buffer_gl1_inv 2209; GFX10-NEXT: buffer_gl0_inv 2210; GFX10-NEXT: s_setpc_b64 s[30:31] 2211; 2212; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2213; GFX90A: ; %bb.0: 2214; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2215; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 2216; GFX90A-NEXT: s_mov_b64 s[4:5], 0 2217; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 2218; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start 2219; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 2220; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2221; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 2222; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 2223; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 2224; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2225; GFX90A-NEXT: buffer_wbinvl1 2226; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2227; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2228; GFX90A-NEXT: v_mov_b32_e32 v3, v2 2229; 
GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 2230; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 2231; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 2232; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2233; GFX90A-NEXT: s_setpc_b64 s[30:31] 2234; 2235; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2236; GFX908: ; %bb.0: 2237; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2238; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 2239; GFX908-NEXT: s_mov_b64 s[4:5], 0 2240; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 2241; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start 2242; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2243; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2244; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 2245; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 2246; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 2247; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2248; GFX908-NEXT: buffer_wbinvl1 2249; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2250; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2251; GFX908-NEXT: v_mov_b32_e32 v3, v2 2252; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2253; GFX908-NEXT: s_cbranch_execnz .LBB14_1 2254; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2255; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2256; GFX908-NEXT: s_setpc_b64 s[30:31] 2257; 2258; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2259; GFX8: ; %bb.0: 2260; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2261; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 2262; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2263; GFX8-NEXT: flat_load_dword v3, v[0:1] 2264; GFX8-NEXT: s_mov_b64 s[4:5], 0 2265; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 2266; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start 2267; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2268; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2269; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 2270; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 2271; GFX8-NEXT: flat_atomic_cmpswap v2, 
v[0:1], v[2:3] glc 2272; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2273; GFX8-NEXT: buffer_wbinvl1 2274; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2275; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2276; GFX8-NEXT: v_mov_b32_e32 v3, v2 2277; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2278; GFX8-NEXT: s_cbranch_execnz .LBB14_1 2279; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2280; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2281; GFX8-NEXT: s_setpc_b64 s[30:31] 2282; 2283; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2284; GFX7: ; %bb.0: 2285; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2286; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 2287; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2288; GFX7-NEXT: flat_atomic_fmin v[0:1], v2 2289; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2290; GFX7-NEXT: buffer_wbinvl1 2291; GFX7-NEXT: s_setpc_b64 s[30:31] 2292 %gep = getelementptr float, ptr %ptr, i64 511 2293 %unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 2294 ret void 2295} 2296 2297define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 { 2298; GFX12-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 2299; GFX12: ; %bb.0: 2300; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2301; GFX12-NEXT: s_wait_expcnt 0x0 2302; GFX12-NEXT: s_wait_samplecnt 0x0 2303; GFX12-NEXT: s_wait_bvhcnt 0x0 2304; GFX12-NEXT: s_wait_kmcnt 0x0 2305; GFX12-NEXT: s_wait_storecnt 0x0 2306; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:-2048 scope:SCOPE_DEV 2307; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 2308; GFX12-NEXT: global_inv scope:SCOPE_DEV 2309; GFX12-NEXT: s_setpc_b64 s[30:31] 2310; 2311; GFX940-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 2312; GFX940: ; %bb.0: 2313; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2314; GFX940-NEXT: 
v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 2315; GFX940-NEXT: s_movk_i32 s0, 0xf800 2316; GFX940-NEXT: s_nop 0 2317; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 2318; GFX940-NEXT: flat_load_dword v3, v[4:5] 2319; GFX940-NEXT: s_mov_b32 s1, -1 2320; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] 2321; GFX940-NEXT: s_mov_b64 s[0:1], 0 2322; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 2323; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start 2324; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 2325; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2326; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 2327; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 2328; GFX940-NEXT: buffer_wbl2 sc1 2329; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 2330; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2331; GFX940-NEXT: buffer_inv sc1 2332; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2333; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2334; GFX940-NEXT: v_mov_b32_e32 v3, v2 2335; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 2336; GFX940-NEXT: s_cbranch_execnz .LBB15_1 2337; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 2338; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 2339; GFX940-NEXT: s_setpc_b64 s[30:31] 2340; 2341; GFX11-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 2342; GFX11: ; %bb.0: 2343; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2344; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 2345; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 2346; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2347; GFX11-NEXT: flat_atomic_min_f32 v[0:1], v2 2348; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2349; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2350; GFX11-NEXT: buffer_gl1_inv 2351; GFX11-NEXT: buffer_gl0_inv 2352; GFX11-NEXT: s_setpc_b64 s[30:31] 2353; 2354; GFX10-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 2355; GFX10: ; %bb.0: 2356; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2357; GFX10-NEXT: v_add_co_u32 v0, 
vcc_lo, 0xfffff800, v0 2358; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 2359; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2360; GFX10-NEXT: flat_atomic_fmin v[0:1], v2 2361; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2362; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2363; GFX10-NEXT: buffer_gl1_inv 2364; GFX10-NEXT: buffer_gl0_inv 2365; GFX10-NEXT: s_setpc_b64 s[30:31] 2366; 2367; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 2368; GFX90A: ; %bb.0: 2369; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2370; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 2371; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 2372; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 2373; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 2374; GFX90A-NEXT: flat_load_dword v1, v[0:1] 2375; GFX90A-NEXT: s_mov_b64 s[4:5], 0 2376; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 2377; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start 2378; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 2379; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2380; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 2381; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 2382; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 2383; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2384; GFX90A-NEXT: buffer_wbinvl1 2385; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2386; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2387; GFX90A-NEXT: v_mov_b32_e32 v1, v0 2388; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 2389; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 2390; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 2391; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2392; GFX90A-NEXT: s_setpc_b64 s[30:31] 2393; 2394; GFX908-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 2395; GFX908: ; %bb.0: 2396; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2397; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 2398; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 2399; 
GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 2400; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 2401; GFX908-NEXT: flat_load_dword v1, v[0:1] 2402; GFX908-NEXT: s_mov_b64 s[4:5], 0 2403; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 2404; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start 2405; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2406; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2407; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 2408; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 2409; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 2410; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2411; GFX908-NEXT: buffer_wbinvl1 2412; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 2413; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2414; GFX908-NEXT: v_mov_b32_e32 v1, v0 2415; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2416; GFX908-NEXT: s_cbranch_execnz .LBB15_1 2417; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2418; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2419; GFX908-NEXT: s_setpc_b64 s[30:31] 2420; 2421; GFX8-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 2422; GFX8: ; %bb.0: 2423; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2424; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 2425; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 2426; GFX8-NEXT: flat_load_dword v3, v[0:1] 2427; GFX8-NEXT: s_mov_b64 s[4:5], 0 2428; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 2429; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start 2430; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2431; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2432; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 2433; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 2434; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2435; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2436; GFX8-NEXT: buffer_wbinvl1 2437; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2438; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2439; GFX8-NEXT: v_mov_b32_e32 v3, v2 2440; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2441; GFX8-NEXT: s_cbranch_execnz .LBB15_1 
2442; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2443; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2444; GFX8-NEXT: s_setpc_b64 s[30:31] 2445; 2446; GFX7-LABEL: flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: 2447; GFX7: ; %bb.0: 2448; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2449; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 2450; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 2451; GFX7-NEXT: flat_atomic_fmin v[0:1], v2 2452; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2453; GFX7-NEXT: buffer_wbinvl1 2454; GFX7-NEXT: s_setpc_b64 s[30:31] 2455 %gep = getelementptr float, ptr %ptr, i64 -512 2456 %unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 2457 ret void 2458} 2459 2460define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 { 2461; GFX12-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2462; GFX12: ; %bb.0: 2463; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2464; GFX12-NEXT: s_wait_expcnt 0x0 2465; GFX12-NEXT: s_wait_samplecnt 0x0 2466; GFX12-NEXT: s_wait_bvhcnt 0x0 2467; GFX12-NEXT: s_wait_kmcnt 0x0 2468; GFX12-NEXT: global_wb scope:SCOPE_SYS 2469; GFX12-NEXT: s_wait_storecnt 0x0 2470; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 2471; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2472; GFX12-NEXT: global_inv scope:SCOPE_SYS 2473; GFX12-NEXT: s_setpc_b64 s[30:31] 2474; 2475; GFX940-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2476; GFX940: ; %bb.0: 2477; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2478; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 2479; GFX940-NEXT: s_mov_b64 s[0:1], 0 2480; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 2481; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start 2482; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 2483; GFX940-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 2484; GFX940-NEXT: v_mov_b32_e32 v5, v3 2485; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 2486; GFX940-NEXT: v_min_f32_e32 v4, v3, v2 2487; GFX940-NEXT: buffer_wbl2 sc0 sc1 2488; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 2489; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2490; GFX940-NEXT: buffer_inv sc0 sc1 2491; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 2492; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2493; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 2494; GFX940-NEXT: s_cbranch_execnz .LBB16_1 2495; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 2496; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 2497; GFX940-NEXT: v_mov_b32_e32 v0, v3 2498; GFX940-NEXT: s_setpc_b64 s[30:31] 2499; 2500; GFX11-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2501; GFX11: ; %bb.0: 2502; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2503; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2504; GFX11-NEXT: flat_atomic_min_f32 v0, v[0:1], v2 offset:2044 glc 2505; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2506; GFX11-NEXT: buffer_gl1_inv 2507; GFX11-NEXT: buffer_gl0_inv 2508; GFX11-NEXT: s_setpc_b64 s[30:31] 2509; 2510; GFX10-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2511; GFX10: ; %bb.0: 2512; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2513; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 2514; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 2515; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2516; GFX10-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc 2517; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2518; GFX10-NEXT: buffer_gl1_inv 2519; GFX10-NEXT: buffer_gl0_inv 2520; GFX10-NEXT: s_setpc_b64 s[30:31] 2521; 2522; GFX90A-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2523; GFX90A: ; %bb.0: 2524; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2525; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 2526; GFX90A-NEXT: 
s_mov_b64 s[4:5], 0 2527; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 2528; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start 2529; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 2530; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2531; GFX90A-NEXT: v_mov_b32_e32 v5, v3 2532; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 2533; GFX90A-NEXT: v_min_f32_e32 v4, v3, v2 2534; GFX90A-NEXT: buffer_wbl2 2535; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc 2536; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2537; GFX90A-NEXT: buffer_invl2 2538; GFX90A-NEXT: buffer_wbinvl1 2539; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 2540; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2541; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 2542; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 2543; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 2544; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2545; GFX90A-NEXT: v_mov_b32_e32 v0, v3 2546; GFX90A-NEXT: s_setpc_b64 s[30:31] 2547; 2548; GFX908-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2549; GFX908: ; %bb.0: 2550; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2551; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 2552; GFX908-NEXT: s_mov_b64 s[4:5], 0 2553; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 2554; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start 2555; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2556; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2557; GFX908-NEXT: v_mov_b32_e32 v4, v3 2558; GFX908-NEXT: v_max_f32_e32 v3, v4, v4 2559; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 2560; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 2561; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2562; GFX908-NEXT: buffer_wbinvl1 2563; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 2564; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2565; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2566; GFX908-NEXT: s_cbranch_execnz .LBB16_1 2567; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2568; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2569; GFX908-NEXT: 
v_mov_b32_e32 v0, v3 2570; GFX908-NEXT: s_setpc_b64 s[30:31] 2571; 2572; GFX8-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2573; GFX8: ; %bb.0: 2574; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2575; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 2576; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 2577; GFX8-NEXT: flat_load_dword v0, v[3:4] 2578; GFX8-NEXT: s_mov_b64 s[4:5], 0 2579; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 2580; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start 2581; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2582; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2583; GFX8-NEXT: v_mov_b32_e32 v6, v0 2584; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v6 2585; GFX8-NEXT: v_min_f32_e32 v5, v0, v1 2586; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 2587; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2588; GFX8-NEXT: buffer_wbinvl1 2589; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 2590; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2591; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2592; GFX8-NEXT: s_cbranch_execnz .LBB16_1 2593; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2594; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2595; GFX8-NEXT: s_setpc_b64 s[30:31] 2596; 2597; GFX7-LABEL: flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2598; GFX7: ; %bb.0: 2599; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2600; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 2601; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2602; GFX7-NEXT: flat_atomic_fmin v0, v[0:1], v2 glc 2603; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2604; GFX7-NEXT: buffer_wbinvl1 2605; GFX7-NEXT: s_setpc_b64 s[30:31] 2606 %gep = getelementptr float, ptr %ptr, i64 511 2607 %result = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 2608 ret float %result 2609} 2610 2611define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 { 2612; GFX12-LABEL: 
flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2613; GFX12: ; %bb.0: 2614; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2615; GFX12-NEXT: s_wait_expcnt 0x0 2616; GFX12-NEXT: s_wait_samplecnt 0x0 2617; GFX12-NEXT: s_wait_bvhcnt 0x0 2618; GFX12-NEXT: s_wait_kmcnt 0x0 2619; GFX12-NEXT: global_wb scope:SCOPE_SYS 2620; GFX12-NEXT: s_wait_storecnt 0x0 2621; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 offset:2044 scope:SCOPE_SYS 2622; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 2623; GFX12-NEXT: global_inv scope:SCOPE_SYS 2624; GFX12-NEXT: s_setpc_b64 s[30:31] 2625; 2626; GFX940-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2627; GFX940: ; %bb.0: 2628; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2629; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 2630; GFX940-NEXT: s_mov_b64 s[0:1], 0 2631; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 2632; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start 2633; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 2634; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2635; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 2636; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 2637; GFX940-NEXT: buffer_wbl2 sc0 sc1 2638; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 2639; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2640; GFX940-NEXT: buffer_inv sc0 sc1 2641; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2642; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 2643; GFX940-NEXT: v_mov_b32_e32 v3, v2 2644; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 2645; GFX940-NEXT: s_cbranch_execnz .LBB17_1 2646; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 2647; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 2648; GFX940-NEXT: s_setpc_b64 s[30:31] 2649; 2650; GFX11-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2651; GFX11: ; %bb.0: 2652; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2653; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2654; GFX11-NEXT: 
flat_atomic_min_f32 v[0:1], v2 offset:2044 2655; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2656; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2657; GFX11-NEXT: buffer_gl1_inv 2658; GFX11-NEXT: buffer_gl0_inv 2659; GFX11-NEXT: s_setpc_b64 s[30:31] 2660; 2661; GFX10-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2662; GFX10: ; %bb.0: 2663; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2664; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 2665; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 2666; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2667; GFX10-NEXT: flat_atomic_fmin v[0:1], v2 2668; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2669; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2670; GFX10-NEXT: buffer_gl1_inv 2671; GFX10-NEXT: buffer_gl0_inv 2672; GFX10-NEXT: s_setpc_b64 s[30:31] 2673; 2674; GFX90A-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2675; GFX90A: ; %bb.0: 2676; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2677; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 2678; GFX90A-NEXT: s_mov_b64 s[4:5], 0 2679; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 2680; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start 2681; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 2682; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2683; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 2684; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 2685; GFX90A-NEXT: buffer_wbl2 2686; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 2687; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2688; GFX90A-NEXT: buffer_invl2 2689; GFX90A-NEXT: buffer_wbinvl1 2690; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2691; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2692; GFX90A-NEXT: v_mov_b32_e32 v3, v2 2693; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 2694; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 2695; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 2696; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2697; GFX90A-NEXT: s_setpc_b64 s[30:31] 2698; 2699; 
GFX908-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2700; GFX908: ; %bb.0: 2701; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2702; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 2703; GFX908-NEXT: s_mov_b64 s[4:5], 0 2704; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 2705; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start 2706; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2707; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2708; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 2709; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 2710; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 2711; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2712; GFX908-NEXT: buffer_wbinvl1 2713; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2714; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2715; GFX908-NEXT: v_mov_b32_e32 v3, v2 2716; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2717; GFX908-NEXT: s_cbranch_execnz .LBB17_1 2718; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2719; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2720; GFX908-NEXT: s_setpc_b64 s[30:31] 2721; 2722; GFX8-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2723; GFX8: ; %bb.0: 2724; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2725; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 2726; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2727; GFX8-NEXT: flat_load_dword v3, v[0:1] 2728; GFX8-NEXT: s_mov_b64 s[4:5], 0 2729; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 2730; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start 2731; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2732; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2733; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 2734; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 2735; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 2736; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2737; GFX8-NEXT: buffer_wbinvl1 2738; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 2739; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2740; GFX8-NEXT: v_mov_b32_e32 v3, v2 
2741; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2742; GFX8-NEXT: s_cbranch_execnz .LBB17_1 2743; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2744; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2745; GFX8-NEXT: s_setpc_b64 s[30:31] 2746; 2747; GFX7-LABEL: flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: 2748; GFX7: ; %bb.0: 2749; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2750; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 2751; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2752; GFX7-NEXT: flat_atomic_fmin v[0:1], v2 2753; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2754; GFX7-NEXT: buffer_wbinvl1 2755; GFX7-NEXT: s_setpc_b64 s[30:31] 2756 %gep = getelementptr float, ptr %ptr, i64 511 2757 %unused = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 2758 ret void 2759} 2760 2761; -------------------------------------------------------------------- 2762; double 2763; -------------------------------------------------------------------- 2764 2765define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 { 2766; GFX12-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: 2767; GFX12: ; %bb.0: 2768; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2769; GFX12-NEXT: s_wait_expcnt 0x0 2770; GFX12-NEXT: s_wait_samplecnt 0x0 2771; GFX12-NEXT: s_wait_bvhcnt 0x0 2772; GFX12-NEXT: s_wait_kmcnt 0x0 2773; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] 2774; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 2775; GFX12-NEXT: s_mov_b32 s0, exec_lo 2776; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 2777; GFX12-NEXT: s_wait_alu 0xfffe 2778; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 2779; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 2780; GFX12-NEXT: s_cbranch_execz .LBB18_4 2781; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global 2782; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] 2783; GFX12-NEXT: s_mov_b32 s1, 0 2784; GFX12-NEXT: .LBB18_2: ; %atomicrmw.start 2785; GFX12-NEXT: ; =>This Inner Loop Header: 
Depth=1 2786; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2787; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 2788; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2789; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] 2790; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] 2791; GFX12-NEXT: s_wait_storecnt 0x0 2792; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 2793; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2794; GFX12-NEXT: global_inv scope:SCOPE_DEV 2795; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] 2796; GFX12-NEXT: s_wait_alu 0xfffe 2797; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 2798; GFX12-NEXT: s_wait_alu 0xfffe 2799; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 2800; GFX12-NEXT: s_cbranch_execnz .LBB18_2 2801; GFX12-NEXT: ; %bb.3: ; %Flow 2802; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 2803; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 2804; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 2805; GFX12-NEXT: .LBB18_4: ; %Flow2 2806; GFX12-NEXT: s_wait_alu 0xfffe 2807; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 2808; GFX12-NEXT: s_cbranch_execz .LBB18_6 2809; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private 2810; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 2811; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo 2812; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off 2813; GFX12-NEXT: s_wait_loadcnt 0x0 2814; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] 2815; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2816; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] 2817; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off 2818; GFX12-NEXT: .LBB18_6: ; %atomicrmw.phi 2819; GFX12-NEXT: s_wait_alu 0xfffe 2820; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 2821; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 2822; GFX12-NEXT: s_wait_alu 0xfffe 2823; GFX12-NEXT: s_setpc_b64 s[30:31] 2824; 2825; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: 2826; GFX940: ; 
%bb.0: 2827; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2828; GFX940-NEXT: v_mov_b32_e32 v5, v1 2829; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 2830; GFX940-NEXT: v_mov_b32_e32 v4, v0 2831; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 2832; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 2833; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 2834; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 2835; GFX940-NEXT: s_cbranch_execnz .LBB18_3 2836; GFX940-NEXT: ; %bb.1: ; %Flow 2837; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 2838; GFX940-NEXT: s_cbranch_execnz .LBB18_4 2839; GFX940-NEXT: .LBB18_2: ; %atomicrmw.phi 2840; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 2841; GFX940-NEXT: s_setpc_b64 s[30:31] 2842; GFX940-NEXT: .LBB18_3: ; %atomicrmw.global 2843; GFX940-NEXT: buffer_wbl2 sc1 2844; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 2845; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2846; GFX940-NEXT: buffer_inv sc1 2847; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 2848; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 2849; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 2850; GFX940-NEXT: s_cbranch_execz .LBB18_2 2851; GFX940-NEXT: .LBB18_4: ; %atomicrmw.private 2852; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 2853; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 2854; GFX940-NEXT: s_nop 0 2855; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc 2856; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off 2857; GFX940-NEXT: s_waitcnt vmcnt(0) 2858; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 2859; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 2860; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 2861; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 2862; GFX940-NEXT: s_waitcnt vmcnt(0) 2863; GFX940-NEXT: s_setpc_b64 s[30:31] 2864; 2865; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: 2866; GFX11: ; %bb.0: 2867; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2868; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 2869; GFX11-NEXT: 
s_mov_b64 s[0:1], src_private_base 2870; GFX11-NEXT: s_mov_b32 s0, exec_lo 2871; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 2872; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 2873; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 2874; GFX11-NEXT: s_cbranch_execz .LBB18_4 2875; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global 2876; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] 2877; GFX11-NEXT: s_mov_b32 s1, 0 2878; GFX11-NEXT: .LBB18_2: ; %atomicrmw.start 2879; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2880; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2881; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 2882; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2883; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] 2884; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] 2885; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2886; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc 2887; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2888; GFX11-NEXT: buffer_gl1_inv 2889; GFX11-NEXT: buffer_gl0_inv 2890; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] 2891; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 2892; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2893; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 2894; GFX11-NEXT: s_cbranch_execnz .LBB18_2 2895; GFX11-NEXT: ; %bb.3: ; %Flow 2896; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 2897; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 2898; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 2899; GFX11-NEXT: .LBB18_4: ; %Flow2 2900; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 2901; GFX11-NEXT: s_cbranch_execz .LBB18_6 2902; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private 2903; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 2904; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo 2905; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off 2906; GFX11-NEXT: s_waitcnt vmcnt(0) 2907; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 2908; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2909; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 2910; GFX11-NEXT: scratch_store_b64 
v6, v[0:1], off 2911; GFX11-NEXT: .LBB18_6: ; %atomicrmw.phi 2912; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 2913; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 2914; GFX11-NEXT: s_setpc_b64 s[30:31] 2915; 2916; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: 2917; GFX10: ; %bb.0: 2918; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2919; GFX10-NEXT: v_mov_b32_e32 v5, v1 2920; GFX10-NEXT: v_mov_b32_e32 v4, v0 2921; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 2922; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 2923; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 2924; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 2925; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 2926; GFX10-NEXT: s_cbranch_execnz .LBB18_3 2927; GFX10-NEXT: ; %bb.1: ; %Flow 2928; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 2929; GFX10-NEXT: s_cbranch_execnz .LBB18_4 2930; GFX10-NEXT: .LBB18_2: ; %atomicrmw.phi 2931; GFX10-NEXT: s_waitcnt_depctr 0xffe3 2932; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 2933; GFX10-NEXT: s_setpc_b64 s[30:31] 2934; GFX10-NEXT: .LBB18_3: ; %atomicrmw.global 2935; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2936; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc 2937; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2938; GFX10-NEXT: buffer_gl1_inv 2939; GFX10-NEXT: buffer_gl0_inv 2940; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 2941; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 2942; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 2943; GFX10-NEXT: s_cbranch_execz .LBB18_2 2944; GFX10-NEXT: .LBB18_4: ; %atomicrmw.private 2945; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 2946; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 2947; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo 2948; GFX10-NEXT: s_clause 0x1 2949; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen 2950; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 2951; GFX10-NEXT: s_waitcnt vmcnt(0) 2952; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 2953; GFX10-NEXT: v_min_f64 v[2:3], v[4:5], 
v[2:3] 2954; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen 2955; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 2956; GFX10-NEXT: s_waitcnt_depctr 0xffe3 2957; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 2958; GFX10-NEXT: s_setpc_b64 s[30:31] 2959; 2960; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: 2961; GFX90A: ; %bb.0: 2962; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2963; GFX90A-NEXT: v_mov_b32_e32 v5, v1 2964; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 2965; GFX90A-NEXT: v_mov_b32_e32 v4, v0 2966; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 2967; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 2968; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 2969; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 2970; GFX90A-NEXT: s_cbranch_execnz .LBB18_3 2971; GFX90A-NEXT: ; %bb.1: ; %Flow 2972; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 2973; GFX90A-NEXT: s_cbranch_execnz .LBB18_4 2974; GFX90A-NEXT: .LBB18_2: ; %atomicrmw.phi 2975; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2976; GFX90A-NEXT: s_setpc_b64 s[30:31] 2977; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.global 2978; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc 2979; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 2980; GFX90A-NEXT: buffer_wbinvl1 2981; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 2982; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 2983; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 2984; GFX90A-NEXT: s_cbranch_execz .LBB18_2 2985; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.private 2986; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 2987; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc 2988; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen 2989; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 2990; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 2991; GFX90A-NEXT: s_waitcnt vmcnt(0) 2992; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 2993; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 2994; GFX90A-NEXT: buffer_store_dword v2, v6, 
s[0:3], 0 offen 2995; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 2996; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 2997; GFX90A-NEXT: s_waitcnt vmcnt(0) 2998; GFX90A-NEXT: s_setpc_b64 s[30:31] 2999; 3000; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: 3001; GFX908: ; %bb.0: 3002; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3003; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 3004; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 3005; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 3006; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 3007; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 3008; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3009; GFX908-NEXT: s_cbranch_execz .LBB18_4 3010; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global 3011; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 3012; GFX908-NEXT: s_mov_b64 s[6:7], 0 3013; GFX908-NEXT: .LBB18_2: ; %atomicrmw.start 3014; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 3015; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3016; GFX908-NEXT: v_mov_b32_e32 v9, v3 3017; GFX908-NEXT: v_mov_b32_e32 v8, v2 3018; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] 3019; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] 3020; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc 3021; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3022; GFX908-NEXT: buffer_wbinvl1 3023; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] 3024; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3025; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 3026; GFX908-NEXT: s_cbranch_execnz .LBB18_2 3027; GFX908-NEXT: ; %bb.3: ; %Flow 3028; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 3029; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 3030; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 3031; GFX908-NEXT: .LBB18_4: ; %Flow2 3032; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3033; GFX908-NEXT: s_cbranch_execz .LBB18_6 3034; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private 3035; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 3036; GFX908-NEXT: 
v_cndmask_b32_e32 v6, -1, v0, vcc 3037; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen 3038; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 3039; GFX908-NEXT: s_waitcnt vmcnt(0) 3040; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 3041; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 3042; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 3043; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 3044; GFX908-NEXT: .LBB18_6: ; %atomicrmw.phi 3045; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3046; GFX908-NEXT: v_mov_b32_e32 v0, v2 3047; GFX908-NEXT: v_mov_b32_e32 v1, v3 3048; GFX908-NEXT: s_waitcnt vmcnt(0) 3049; GFX908-NEXT: s_setpc_b64 s[30:31] 3050; 3051; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: 3052; GFX8: ; %bb.0: 3053; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3054; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 3055; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 3056; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 3057; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 3058; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3059; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 3060; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3061; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3062; GFX8-NEXT: s_cbranch_execz .LBB18_4 3063; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global 3064; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 3065; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 3066; GFX8-NEXT: flat_load_dword v3, v[2:3] 3067; GFX8-NEXT: flat_load_dword v2, v[0:1] 3068; GFX8-NEXT: s_mov_b64 s[6:7], 0 3069; GFX8-NEXT: .LBB18_2: ; %atomicrmw.start 3070; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 3071; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3072; GFX8-NEXT: v_mov_b32_e32 v9, v3 3073; GFX8-NEXT: v_mov_b32_e32 v8, v2 3074; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] 3075; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] 3076; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc 3077; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3078; GFX8-NEXT: buffer_wbinvl1 
3079; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] 3080; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3081; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 3082; GFX8-NEXT: s_cbranch_execnz .LBB18_2 3083; GFX8-NEXT: ; %bb.3: ; %Flow 3084; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 3085; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3086; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 3087; GFX8-NEXT: .LBB18_4: ; %Flow2 3088; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3089; GFX8-NEXT: s_cbranch_execz .LBB18_6 3090; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private 3091; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 3092; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc 3093; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 3094; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen 3095; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen 3096; GFX8-NEXT: s_waitcnt vmcnt(0) 3097; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 3098; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 3099; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 3100; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen 3101; GFX8-NEXT: .LBB18_6: ; %atomicrmw.phi 3102; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3103; GFX8-NEXT: v_mov_b32_e32 v0, v2 3104; GFX8-NEXT: v_mov_b32_e32 v1, v3 3105; GFX8-NEXT: s_waitcnt vmcnt(0) 3106; GFX8-NEXT: s_setpc_b64 s[30:31] 3107; 3108; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: 3109; GFX7: ; %bb.0: 3110; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3111; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 3112; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 3113; GFX7-NEXT: v_mov_b32_e32 v5, v1 3114; GFX7-NEXT: v_mov_b32_e32 v4, v0 3115; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 3116; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3117; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 3118; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 3119; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3120; GFX7-NEXT: s_cbranch_execnz .LBB18_3 3121; GFX7-NEXT: ; %bb.1: ; %Flow 3122; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3123; GFX7-NEXT: 
s_cbranch_execnz .LBB18_4 3124; GFX7-NEXT: .LBB18_2: ; %atomicrmw.phi 3125; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 3126; GFX7-NEXT: s_setpc_b64 s[30:31] 3127; GFX7-NEXT: .LBB18_3: ; %atomicrmw.global 3128; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc 3129; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3130; GFX7-NEXT: buffer_wbinvl1 3131; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 3132; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 3133; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3134; GFX7-NEXT: s_cbranch_execz .LBB18_2 3135; GFX7-NEXT: .LBB18_4: ; %atomicrmw.private 3136; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 3137; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 3138; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc 3139; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 3140; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen 3141; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen 3142; GFX7-NEXT: s_waitcnt vmcnt(0) 3143; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 3144; GFX7-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 3145; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen 3146; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen 3147; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 3148; GFX7-NEXT: s_waitcnt vmcnt(0) 3149; GFX7-NEXT: s_setpc_b64 s[30:31] 3150 %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 3151 ret double %result 3152} 3153 ; Returning f64 fmin through a flat pointer at element offset 255 (the checks add 0x7f8), agent scope, seq_cst, tagged !amdgpu.no.fine.grained.memory. Every prefix compares the pointer's high dword (v5) to choose between %atomicrmw.global and %atomicrmw.private (load, min, store back); GFX940/GFX10/GFX90A/GFX7 check a native f64 min atomic on the global path, GFX12/GFX11/GFX908/GFX8 check a cmpswap loop. 3154define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 { 3155; GFX12-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 3156; GFX12: ; %bb.0: 3157; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3158; GFX12-NEXT: s_wait_expcnt 0x0 3159; GFX12-NEXT: s_wait_samplecnt 0x0 3160; GFX12-NEXT: s_wait_bvhcnt 0x0 3161; GFX12-NEXT: s_wait_kmcnt 0x0 3162; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] 3163; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 3164; GFX12-NEXT: v_add_co_ci_u32_e32 
v5, vcc_lo, 0, v1, vcc_lo 3165; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 3166; GFX12-NEXT: s_mov_b32 s0, exec_lo 3167; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 3168; GFX12-NEXT: s_wait_alu 0xfffe 3169; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3170; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 3171; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 3172; GFX12-NEXT: s_cbranch_execnz .LBB19_3 3173; GFX12-NEXT: ; %bb.1: ; %Flow2 3174; GFX12-NEXT: s_wait_alu 0xfffe 3175; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 3176; GFX12-NEXT: s_cbranch_execnz .LBB19_6 3177; GFX12-NEXT: .LBB19_2: ; %atomicrmw.phi 3178; GFX12-NEXT: s_wait_alu 0xfffe 3179; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 3180; GFX12-NEXT: s_wait_alu 0xfffe 3181; GFX12-NEXT: s_setpc_b64 s[30:31] 3182; GFX12-NEXT: .LBB19_3: ; %atomicrmw.global 3183; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] 3184; GFX12-NEXT: s_mov_b32 s1, 0 3185; GFX12-NEXT: .LBB19_4: ; %atomicrmw.start 3186; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 3187; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3188; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 3189; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3190; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] 3191; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] 3192; GFX12-NEXT: s_wait_storecnt 0x0 3193; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 3194; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3195; GFX12-NEXT: global_inv scope:SCOPE_DEV 3196; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] 3197; GFX12-NEXT: s_wait_alu 0xfffe 3198; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 3199; GFX12-NEXT: s_wait_alu 0xfffe 3200; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 3201; GFX12-NEXT: s_cbranch_execnz .LBB19_4 3202; GFX12-NEXT: ; %bb.5: ; %Flow 3203; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 3204; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 3205; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 3206; GFX12-NEXT: 
s_and_not1_saveexec_b32 s0, s0 3207; GFX12-NEXT: s_cbranch_execz .LBB19_2 3208; GFX12-NEXT: .LBB19_6: ; %atomicrmw.private 3209; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 3210; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo 3211; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off 3212; GFX12-NEXT: s_wait_loadcnt 0x0 3213; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] 3214; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3215; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] 3216; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off 3217; GFX12-NEXT: s_wait_alu 0xfffe 3218; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 3219; GFX12-NEXT: s_wait_alu 0xfffe 3220; GFX12-NEXT: s_setpc_b64 s[30:31] 3221; 3222; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 3223; GFX940: ; %bb.0: 3224; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3225; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 3226; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 3227; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 3228; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 3229; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 3230; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 3231; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 3232; GFX940-NEXT: s_cbranch_execnz .LBB19_3 3233; GFX940-NEXT: ; %bb.1: ; %Flow 3234; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 3235; GFX940-NEXT: s_cbranch_execnz .LBB19_4 3236; GFX940-NEXT: .LBB19_2: ; %atomicrmw.phi 3237; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 3238; GFX940-NEXT: s_setpc_b64 s[30:31] 3239; GFX940-NEXT: .LBB19_3: ; %atomicrmw.global 3240; GFX940-NEXT: buffer_wbl2 sc1 3241; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 3242; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3243; GFX940-NEXT: buffer_inv sc1 3244; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 3245; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 3246; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 3247; GFX940-NEXT: s_cbranch_execz .LBB19_2 3248; GFX940-NEXT: 
.LBB19_4: ; %atomicrmw.private 3249; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 3250; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 3251; GFX940-NEXT: s_nop 0 3252; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc 3253; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off 3254; GFX940-NEXT: s_waitcnt vmcnt(0) 3255; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 3256; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 3257; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 3258; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 3259; GFX940-NEXT: s_waitcnt vmcnt(0) 3260; GFX940-NEXT: s_setpc_b64 s[30:31] 3261; 3262; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 3263; GFX11: ; %bb.0: 3264; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3265; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 3266; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 3267; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo 3268; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 3269; GFX11-NEXT: s_mov_b32 s0, exec_lo 3270; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 3271; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3272; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 3273; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 3274; GFX11-NEXT: s_cbranch_execnz .LBB19_3 3275; GFX11-NEXT: ; %bb.1: ; %Flow2 3276; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 3277; GFX11-NEXT: s_cbranch_execnz .LBB19_6 3278; GFX11-NEXT: .LBB19_2: ; %atomicrmw.phi 3279; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 3280; GFX11-NEXT: s_setpc_b64 s[30:31] 3281; GFX11-NEXT: .LBB19_3: ; %atomicrmw.global 3282; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] 3283; GFX11-NEXT: s_mov_b32 s1, 0 3284; GFX11-NEXT: .LBB19_4: ; %atomicrmw.start 3285; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 3286; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3287; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 3288; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3289; GFX11-NEXT: v_max_f64 
v[0:1], v[8:9], v[8:9] 3290; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] 3291; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 3292; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc 3293; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3294; GFX11-NEXT: buffer_gl1_inv 3295; GFX11-NEXT: buffer_gl0_inv 3296; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] 3297; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 3298; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3299; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 3300; GFX11-NEXT: s_cbranch_execnz .LBB19_4 3301; GFX11-NEXT: ; %bb.5: ; %Flow 3302; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 3303; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 3304; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 3305; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 3306; GFX11-NEXT: s_cbranch_execz .LBB19_2 3307; GFX11-NEXT: .LBB19_6: ; %atomicrmw.private 3308; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 3309; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo 3310; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off 3311; GFX11-NEXT: s_waitcnt vmcnt(0) 3312; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 3313; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3314; GFX11-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 3315; GFX11-NEXT: scratch_store_b64 v6, v[2:3], off 3316; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 3317; GFX11-NEXT: s_setpc_b64 s[30:31] 3318; 3319; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 3320; GFX10: ; %bb.0: 3321; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3322; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0 3323; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo 3324; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 3325; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 3326; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 3327; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 3328; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 3329; GFX10-NEXT: s_cbranch_execnz .LBB19_3 3330; GFX10-NEXT: ; %bb.1: ; %Flow 3331; GFX10-NEXT: 
s_andn2_saveexec_b32 s4, s4 3332; GFX10-NEXT: s_cbranch_execnz .LBB19_4 3333; GFX10-NEXT: .LBB19_2: ; %atomicrmw.phi 3334; GFX10-NEXT: s_waitcnt_depctr 0xffe3 3335; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 3336; GFX10-NEXT: s_setpc_b64 s[30:31] 3337; GFX10-NEXT: .LBB19_3: ; %atomicrmw.global 3338; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3339; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc 3340; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3341; GFX10-NEXT: buffer_gl1_inv 3342; GFX10-NEXT: buffer_gl0_inv 3343; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 3344; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 3345; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 3346; GFX10-NEXT: s_cbranch_execz .LBB19_2 3347; GFX10-NEXT: .LBB19_4: ; %atomicrmw.private 3348; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 3349; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 3350; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo 3351; GFX10-NEXT: s_clause 0x1 3352; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen 3353; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 3354; GFX10-NEXT: s_waitcnt vmcnt(0) 3355; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 3356; GFX10-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 3357; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen 3358; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 3359; GFX10-NEXT: s_waitcnt_depctr 0xffe3 3360; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 3361; GFX10-NEXT: s_setpc_b64 s[30:31] 3362; 3363; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 3364; GFX90A: ; %bb.0: 3365; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3366; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 3367; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 3368; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 3369; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 3370; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 3371; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 3372; GFX90A-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] 3373; GFX90A-NEXT: s_cbranch_execnz .LBB19_3 3374; GFX90A-NEXT: ; %bb.1: ; %Flow 3375; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3376; GFX90A-NEXT: s_cbranch_execnz .LBB19_4 3377; GFX90A-NEXT: .LBB19_2: ; %atomicrmw.phi 3378; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3379; GFX90A-NEXT: s_setpc_b64 s[30:31] 3380; GFX90A-NEXT: .LBB19_3: ; %atomicrmw.global 3381; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc 3382; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3383; GFX90A-NEXT: buffer_wbinvl1 3384; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 3385; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 3386; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3387; GFX90A-NEXT: s_cbranch_execz .LBB19_2 3388; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.private 3389; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 3390; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc 3391; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen 3392; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 3393; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 3394; GFX90A-NEXT: s_waitcnt vmcnt(0) 3395; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 3396; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 3397; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen 3398; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 3399; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3400; GFX90A-NEXT: s_waitcnt vmcnt(0) 3401; GFX90A-NEXT: s_setpc_b64 s[30:31] 3402; 3403; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 3404; GFX908: ; %bb.0: 3405; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3406; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 3407; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7f8, v0 3408; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 3409; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 3410; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 3411; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 3412; GFX908-NEXT: 
s_and_saveexec_b64 s[4:5], vcc 3413; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3414; GFX908-NEXT: s_cbranch_execnz .LBB19_3 3415; GFX908-NEXT: ; %bb.1: ; %Flow2 3416; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3417; GFX908-NEXT: s_cbranch_execnz .LBB19_6 3418; GFX908-NEXT: .LBB19_2: ; %atomicrmw.phi 3419; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3420; GFX908-NEXT: s_setpc_b64 s[30:31] 3421; GFX908-NEXT: .LBB19_3: ; %atomicrmw.global 3422; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 3423; GFX908-NEXT: s_mov_b64 s[6:7], 0 3424; GFX908-NEXT: .LBB19_4: ; %atomicrmw.start 3425; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 3426; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3427; GFX908-NEXT: v_mov_b32_e32 v9, v1 3428; GFX908-NEXT: v_mov_b32_e32 v8, v0 3429; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] 3430; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] 3431; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 3432; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3433; GFX908-NEXT: buffer_wbinvl1 3434; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 3435; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3436; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 3437; GFX908-NEXT: s_cbranch_execnz .LBB19_4 3438; GFX908-NEXT: ; %bb.5: ; %Flow 3439; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 3440; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 3441; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 3442; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3443; GFX908-NEXT: s_cbranch_execz .LBB19_2 3444; GFX908-NEXT: .LBB19_6: ; %atomicrmw.private 3445; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 3446; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc 3447; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen 3448; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 3449; GFX908-NEXT: s_waitcnt vmcnt(0) 3450; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 3451; GFX908-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 3452; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen 3453; 
GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 3454; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3455; GFX908-NEXT: s_waitcnt vmcnt(0) 3456; GFX908-NEXT: s_setpc_b64 s[30:31] 3457; 3458; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 3459; GFX8: ; %bb.0: 3460; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3461; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 3462; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 3463; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 3464; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0 3465; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 3466; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3467; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 3468; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3469; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3470; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3471; GFX8-NEXT: s_cbranch_execnz .LBB19_3 3472; GFX8-NEXT: ; %bb.1: ; %Flow2 3473; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3474; GFX8-NEXT: s_cbranch_execnz .LBB19_6 3475; GFX8-NEXT: .LBB19_2: ; %atomicrmw.phi 3476; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3477; GFX8-NEXT: s_setpc_b64 s[30:31] 3478; GFX8-NEXT: .LBB19_3: ; %atomicrmw.global 3479; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 3480; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc 3481; GFX8-NEXT: flat_load_dword v1, v[0:1] 3482; GFX8-NEXT: flat_load_dword v0, v[4:5] 3483; GFX8-NEXT: s_mov_b64 s[6:7], 0 3484; GFX8-NEXT: .LBB19_4: ; %atomicrmw.start 3485; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 3486; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3487; GFX8-NEXT: v_mov_b32_e32 v9, v1 3488; GFX8-NEXT: v_mov_b32_e32 v8, v0 3489; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] 3490; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] 3491; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 3492; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3493; GFX8-NEXT: buffer_wbinvl1 3494; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 3495; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3496; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[6:7] 3497; GFX8-NEXT: s_cbranch_execnz .LBB19_4 3498; GFX8-NEXT: ; %bb.5: ; %Flow 3499; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 3500; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 3501; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 3502; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3503; GFX8-NEXT: s_cbranch_execz .LBB19_2 3504; GFX8-NEXT: .LBB19_6: ; %atomicrmw.private 3505; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 3506; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc 3507; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 3508; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen 3509; GFX8-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen 3510; GFX8-NEXT: s_waitcnt vmcnt(0) 3511; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 3512; GFX8-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 3513; GFX8-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen 3514; GFX8-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen 3515; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3516; GFX8-NEXT: s_waitcnt vmcnt(0) 3517; GFX8-NEXT: s_setpc_b64 s[30:31] 3518; 3519; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 3520; GFX7: ; %bb.0: 3521; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3522; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 3523; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 3524; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0 3525; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 3526; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3527; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 3528; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 3529; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 3530; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3531; GFX7-NEXT: s_cbranch_execnz .LBB19_3 3532; GFX7-NEXT: ; %bb.1: ; %Flow 3533; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3534; GFX7-NEXT: s_cbranch_execnz .LBB19_4 3535; GFX7-NEXT: .LBB19_2: ; %atomicrmw.phi 3536; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 3537; GFX7-NEXT: s_setpc_b64 s[30:31] 3538; GFX7-NEXT: .LBB19_3: ; %atomicrmw.global 3539; GFX7-NEXT: 
flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc 3540; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3541; GFX7-NEXT: buffer_wbinvl1 3542; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 3543; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 3544; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3545; GFX7-NEXT: s_cbranch_execz .LBB19_2 3546; GFX7-NEXT: .LBB19_4: ; %atomicrmw.private 3547; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 3548; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 3549; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc 3550; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 3551; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen 3552; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen 3553; GFX7-NEXT: s_waitcnt vmcnt(0) 3554; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 3555; GFX7-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 3556; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen 3557; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen 3558; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 3559; GFX7-NEXT: s_waitcnt vmcnt(0) 3560; GFX7-NEXT: s_setpc_b64 s[30:31] 3561 %gep = getelementptr double, ptr %ptr, i64 255 ; element 255 of double = byte offset 0x7f8, the constant the checks above add to the pointer 3562 %result = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 3563 ret double %result 3564} 3565 3566define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 { 3567; GFX12-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 3568; GFX12: ; %bb.0: 3569; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3570; GFX12-NEXT: s_wait_expcnt 0x0 3571; GFX12-NEXT: s_wait_samplecnt 0x0 3572; GFX12-NEXT: s_wait_bvhcnt 0x0 3573; GFX12-NEXT: s_wait_kmcnt 0x0 3574; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] 3575; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 3576; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo 3577; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 3578; GFX12-NEXT: s_mov_b32 s0, exec_lo 3579; GFX12-NEXT: ; implicit-def: 
$vgpr0_vgpr1 3580; GFX12-NEXT: s_wait_alu 0xfffe 3581; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3582; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v5 3583; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 3584; GFX12-NEXT: s_cbranch_execnz .LBB20_3 3585; GFX12-NEXT: ; %bb.1: ; %Flow2 3586; GFX12-NEXT: s_wait_alu 0xfffe 3587; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 3588; GFX12-NEXT: s_cbranch_execnz .LBB20_6 3589; GFX12-NEXT: .LBB20_2: ; %atomicrmw.phi 3590; GFX12-NEXT: s_wait_alu 0xfffe 3591; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 3592; GFX12-NEXT: s_wait_alu 0xfffe 3593; GFX12-NEXT: s_setpc_b64 s[30:31] 3594; GFX12-NEXT: .LBB20_3: ; %atomicrmw.global 3595; GFX12-NEXT: flat_load_b64 v[0:1], v[4:5] 3596; GFX12-NEXT: s_mov_b32 s1, 0 3597; GFX12-NEXT: .LBB20_4: ; %atomicrmw.start 3598; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 3599; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3600; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 3601; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3602; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[8:9], v[8:9] 3603; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[0:1], v[2:3] 3604; GFX12-NEXT: s_wait_storecnt 0x0 3605; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 3606; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3607; GFX12-NEXT: global_inv scope:SCOPE_DEV 3608; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] 3609; GFX12-NEXT: s_wait_alu 0xfffe 3610; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 3611; GFX12-NEXT: s_wait_alu 0xfffe 3612; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 3613; GFX12-NEXT: s_cbranch_execnz .LBB20_4 3614; GFX12-NEXT: ; %bb.5: ; %Flow 3615; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 3616; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 3617; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 3618; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 3619; GFX12-NEXT: s_cbranch_execz .LBB20_2 3620; GFX12-NEXT: .LBB20_6: ; %atomicrmw.private 3621; GFX12-NEXT: v_cmp_ne_u64_e32 
vcc_lo, 0, v[4:5] 3622; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo 3623; GFX12-NEXT: scratch_load_b64 v[0:1], v6, off 3624; GFX12-NEXT: s_wait_loadcnt 0x0 3625; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] 3626; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 3627; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] 3628; GFX12-NEXT: scratch_store_b64 v6, v[2:3], off 3629; GFX12-NEXT: s_wait_alu 0xfffe 3630; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 3631; GFX12-NEXT: s_wait_alu 0xfffe 3632; GFX12-NEXT: s_setpc_b64 s[30:31] 3633; 3634; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 3635; GFX940: ; %bb.0: 3636; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3637; GFX940-NEXT: s_movk_i32 s0, 0xf800 3638; GFX940-NEXT: s_mov_b32 s1, -1 3639; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 3640; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 3641; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 3642; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 3643; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 3644; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 3645; GFX940-NEXT: s_cbranch_execnz .LBB20_3 3646; GFX940-NEXT: ; %bb.1: ; %Flow 3647; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 3648; GFX940-NEXT: s_cbranch_execnz .LBB20_4 3649; GFX940-NEXT: .LBB20_2: ; %atomicrmw.phi 3650; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 3651; GFX940-NEXT: s_setpc_b64 s[30:31] 3652; GFX940-NEXT: .LBB20_3: ; %atomicrmw.global 3653; GFX940-NEXT: buffer_wbl2 sc1 3654; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 3655; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3656; GFX940-NEXT: buffer_inv sc1 3657; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 3658; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 3659; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 3660; GFX940-NEXT: s_cbranch_execz .LBB20_2 3661; GFX940-NEXT: .LBB20_4: ; %atomicrmw.private 3662; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 3663; GFX940-NEXT: v_max_f64 v[2:3], 
v[2:3], v[2:3] 3664; GFX940-NEXT: s_nop 0 3665; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc 3666; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off 3667; GFX940-NEXT: s_waitcnt vmcnt(0) 3668; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 3669; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 3670; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 3671; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 3672; GFX940-NEXT: s_waitcnt vmcnt(0) 3673; GFX940-NEXT: s_setpc_b64 s[30:31] 3674; 3675; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 3676; GFX11: ; %bb.0: 3677; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3678; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 3679; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 3680; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo 3681; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 3682; GFX11-NEXT: s_mov_b32 s0, exec_lo 3683; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 3684; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3685; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v5 3686; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 3687; GFX11-NEXT: s_cbranch_execnz .LBB20_3 3688; GFX11-NEXT: ; %bb.1: ; %Flow2 3689; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 3690; GFX11-NEXT: s_cbranch_execnz .LBB20_6 3691; GFX11-NEXT: .LBB20_2: ; %atomicrmw.phi 3692; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 3693; GFX11-NEXT: s_setpc_b64 s[30:31] 3694; GFX11-NEXT: .LBB20_3: ; %atomicrmw.global 3695; GFX11-NEXT: flat_load_b64 v[0:1], v[4:5] 3696; GFX11-NEXT: s_mov_b32 s1, 0 3697; GFX11-NEXT: .LBB20_4: ; %atomicrmw.start 3698; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 3699; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3700; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 3701; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3702; GFX11-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] 3703; GFX11-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] 3704; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 
3705; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[6:9] glc 3706; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3707; GFX11-NEXT: buffer_gl1_inv 3708; GFX11-NEXT: buffer_gl0_inv 3709; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] 3710; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 3711; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 3712; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 3713; GFX11-NEXT: s_cbranch_execnz .LBB20_4 3714; GFX11-NEXT: ; %bb.5: ; %Flow 3715; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 3716; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 3717; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 3718; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 3719; GFX11-NEXT: s_cbranch_execz .LBB20_2 3720; GFX11-NEXT: .LBB20_6: ; %atomicrmw.private 3721; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 3722; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo 3723; GFX11-NEXT: scratch_load_b64 v[0:1], v6, off 3724; GFX11-NEXT: s_waitcnt vmcnt(0) 3725; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 3726; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 3727; GFX11-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 3728; GFX11-NEXT: scratch_store_b64 v6, v[2:3], off 3729; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 3730; GFX11-NEXT: s_setpc_b64 s[30:31] 3731; 3732; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 3733; GFX10: ; %bb.0: 3734; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3735; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 3736; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo 3737; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 3738; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 3739; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 3740; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 3741; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 3742; GFX10-NEXT: s_cbranch_execnz .LBB20_3 3743; GFX10-NEXT: ; %bb.1: ; %Flow 3744; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 3745; GFX10-NEXT: s_cbranch_execnz .LBB20_4 3746; GFX10-NEXT: .LBB20_2: ; %atomicrmw.phi 
3747; GFX10-NEXT: s_waitcnt_depctr 0xffe3 3748; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 3749; GFX10-NEXT: s_setpc_b64 s[30:31] 3750; GFX10-NEXT: .LBB20_3: ; %atomicrmw.global 3751; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3752; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc 3753; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3754; GFX10-NEXT: buffer_gl1_inv 3755; GFX10-NEXT: buffer_gl0_inv 3756; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 3757; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 3758; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 3759; GFX10-NEXT: s_cbranch_execz .LBB20_2 3760; GFX10-NEXT: .LBB20_4: ; %atomicrmw.private 3761; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 3762; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 3763; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo 3764; GFX10-NEXT: s_clause 0x1 3765; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen 3766; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 3767; GFX10-NEXT: s_waitcnt vmcnt(0) 3768; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 3769; GFX10-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 3770; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen 3771; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 3772; GFX10-NEXT: s_waitcnt_depctr 0xffe3 3773; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 3774; GFX10-NEXT: s_setpc_b64 s[30:31] 3775; 3776; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 3777; GFX90A: ; %bb.0: 3778; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3779; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 3780; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 3781; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 3782; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 3783; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 3784; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 3785; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3786; GFX90A-NEXT: s_cbranch_execnz .LBB20_3 3787; GFX90A-NEXT: ; %bb.1: ; %Flow 3788; 
GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3789; GFX90A-NEXT: s_cbranch_execnz .LBB20_4 3790; GFX90A-NEXT: .LBB20_2: ; %atomicrmw.phi 3791; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3792; GFX90A-NEXT: s_setpc_b64 s[30:31] 3793; GFX90A-NEXT: .LBB20_3: ; %atomicrmw.global 3794; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc 3795; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3796; GFX90A-NEXT: buffer_wbinvl1 3797; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 3798; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 3799; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3800; GFX90A-NEXT: s_cbranch_execz .LBB20_2 3801; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.private 3802; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 3803; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc 3804; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen 3805; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 3806; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 3807; GFX90A-NEXT: s_waitcnt vmcnt(0) 3808; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 3809; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 3810; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen 3811; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 3812; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 3813; GFX90A-NEXT: s_waitcnt vmcnt(0) 3814; GFX90A-NEXT: s_setpc_b64 s[30:31] 3815; 3816; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 3817; GFX908: ; %bb.0: 3818; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3819; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 3820; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 3821; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 3822; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 3823; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 3824; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 3825; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 3826; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3827; GFX908-NEXT: s_cbranch_execnz 
.LBB20_3 3828; GFX908-NEXT: ; %bb.1: ; %Flow2 3829; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3830; GFX908-NEXT: s_cbranch_execnz .LBB20_6 3831; GFX908-NEXT: .LBB20_2: ; %atomicrmw.phi 3832; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3833; GFX908-NEXT: s_setpc_b64 s[30:31] 3834; GFX908-NEXT: .LBB20_3: ; %atomicrmw.global 3835; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[4:5] 3836; GFX908-NEXT: s_mov_b64 s[6:7], 0 3837; GFX908-NEXT: .LBB20_4: ; %atomicrmw.start 3838; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 3839; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3840; GFX908-NEXT: v_mov_b32_e32 v9, v1 3841; GFX908-NEXT: v_mov_b32_e32 v8, v0 3842; GFX908-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] 3843; GFX908-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] 3844; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 3845; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3846; GFX908-NEXT: buffer_wbinvl1 3847; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 3848; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3849; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 3850; GFX908-NEXT: s_cbranch_execnz .LBB20_4 3851; GFX908-NEXT: ; %bb.5: ; %Flow 3852; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 3853; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 3854; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 3855; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3856; GFX908-NEXT: s_cbranch_execz .LBB20_2 3857; GFX908-NEXT: .LBB20_6: ; %atomicrmw.private 3858; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 3859; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc 3860; GFX908-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen 3861; GFX908-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 3862; GFX908-NEXT: s_waitcnt vmcnt(0) 3863; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 3864; GFX908-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 3865; GFX908-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen 3866; GFX908-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 3867; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 3868; 
GFX908-NEXT: s_waitcnt vmcnt(0) 3869; GFX908-NEXT: s_setpc_b64 s[30:31] 3870; 3871; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 3872; GFX8: ; %bb.0: 3873; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3874; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 3875; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 3876; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 3877; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 3878; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc 3879; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3880; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 3881; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 3882; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 3883; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3884; GFX8-NEXT: s_cbranch_execnz .LBB20_3 3885; GFX8-NEXT: ; %bb.1: ; %Flow2 3886; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3887; GFX8-NEXT: s_cbranch_execnz .LBB20_6 3888; GFX8-NEXT: .LBB20_2: ; %atomicrmw.phi 3889; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3890; GFX8-NEXT: s_setpc_b64 s[30:31] 3891; GFX8-NEXT: .LBB20_3: ; %atomicrmw.global 3892; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 3893; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc 3894; GFX8-NEXT: flat_load_dword v1, v[0:1] 3895; GFX8-NEXT: flat_load_dword v0, v[4:5] 3896; GFX8-NEXT: s_mov_b64 s[6:7], 0 3897; GFX8-NEXT: .LBB20_4: ; %atomicrmw.start 3898; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 3899; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3900; GFX8-NEXT: v_mov_b32_e32 v9, v1 3901; GFX8-NEXT: v_mov_b32_e32 v8, v0 3902; GFX8-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9] 3903; GFX8-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] 3904; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc 3905; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3906; GFX8-NEXT: buffer_wbinvl1 3907; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] 3908; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 3909; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 3910; GFX8-NEXT: s_cbranch_execnz .LBB20_4 3911; GFX8-NEXT: ; %bb.5: ; %Flow 3912; 
GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 3913; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 3914; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 3915; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3916; GFX8-NEXT: s_cbranch_execz .LBB20_2 3917; GFX8-NEXT: .LBB20_6: ; %atomicrmw.private 3918; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 3919; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc 3920; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 3921; GFX8-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen 3922; GFX8-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen 3923; GFX8-NEXT: s_waitcnt vmcnt(0) 3924; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 3925; GFX8-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 3926; GFX8-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen 3927; GFX8-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen 3928; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 3929; GFX8-NEXT: s_waitcnt vmcnt(0) 3930; GFX8-NEXT: s_setpc_b64 s[30:31] 3931; 3932; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 3933; GFX7: ; %bb.0: 3934; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3935; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 3936; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 3937; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 3938; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc 3939; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3940; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 3941; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 3942; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 3943; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 3944; GFX7-NEXT: s_cbranch_execnz .LBB20_3 3945; GFX7-NEXT: ; %bb.1: ; %Flow 3946; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3947; GFX7-NEXT: s_cbranch_execnz .LBB20_4 3948; GFX7-NEXT: .LBB20_2: ; %atomicrmw.phi 3949; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 3950; GFX7-NEXT: s_setpc_b64 s[30:31] 3951; GFX7-NEXT: .LBB20_3: ; %atomicrmw.global 3952; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc 3953; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 3954; GFX7-NEXT: 
buffer_wbinvl1 3955; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 3956; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 3957; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 3958; GFX7-NEXT: s_cbranch_execz .LBB20_2 3959; GFX7-NEXT: .LBB20_4: ; %atomicrmw.private 3960; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 3961; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 3962; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc 3963; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 3964; GFX7-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen 3965; GFX7-NEXT: buffer_load_dword v1, v7, s[0:3], 0 offen 3966; GFX7-NEXT: s_waitcnt vmcnt(0) 3967; GFX7-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 3968; GFX7-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 3969; GFX7-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen 3970; GFX7-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen 3971; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 3972; GFX7-NEXT: s_waitcnt vmcnt(0) 3973; GFX7-NEXT: s_setpc_b64 s[30:31] 3974 %gep = getelementptr double, ptr %ptr, i64 -256 3975 %result = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 3976 ret double %result 3977} 3978
; Test: non-returning f64 `atomicrmw fmin` through a default-address-space
; `ptr` with no offset, at syncscope("agent") with seq_cst ordering, tagged
; with !amdgpu.no.fine.grained.memory. The result (%unused) is discarded and
; the function returns void.
;
; What the autogenerated expectations below show for this function:
;   - Every target compares v1 (the upper half of the v[0:1] address pair)
;     against src_private_base (GFX8/GFX7 instead use a dword loaded from 0xc0)
;     and then runs either %atomicrmw.global or %atomicrmw.private.
;   - %atomicrmw.global: GFX940/GFX90A use flat_atomic_min_f64 and GFX10/GFX7
;     use flat_atomic_fmin_x2; GFX12, GFX11, GFX908 and GFX8 instead run a
;     flat_atomic_cmpswap loop (%atomicrmw.start) around v_max/v_min.
;   - %atomicrmw.private: a scratch/buffer load, v_max + v_min, then a store.
;
; The expectations are produced by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); regenerate them rather than editing by hand.
3979define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 { 3980; GFX12-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: 3981; GFX12: ; %bb.0: 3982; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 3983; GFX12-NEXT: s_wait_expcnt 0x0 3984; GFX12-NEXT: s_wait_samplecnt 0x0 3985; GFX12-NEXT: s_wait_bvhcnt 0x0 3986; GFX12-NEXT: s_wait_kmcnt 0x0 3987; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] 3988; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 3989; GFX12-NEXT: s_mov_b32 s0, exec_lo 3990; GFX12-NEXT: s_wait_alu 0xfffe 3991; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 3992; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 3993; GFX12-NEXT: s_cbranch_execnz .LBB21_3 3994; GFX12-NEXT: ; %bb.1: ; %Flow2 3995; GFX12-NEXT: s_wait_alu 0xfffe 3996; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 3997; 
GFX12-NEXT: s_cbranch_execnz .LBB21_6 3998; GFX12-NEXT: .LBB21_2: ; %atomicrmw.phi 3999; GFX12-NEXT: s_wait_alu 0xfffe 4000; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 4001; GFX12-NEXT: s_wait_alu 0xfffe 4002; GFX12-NEXT: s_setpc_b64 s[30:31] 4003; GFX12-NEXT: .LBB21_3: ; %atomicrmw.global 4004; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] 4005; GFX12-NEXT: s_mov_b32 s1, 0 4006; GFX12-NEXT: .LBB21_4: ; %atomicrmw.start 4007; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 4008; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4009; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] 4010; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4011; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[2:3], v[6:7] 4012; GFX12-NEXT: s_wait_storecnt 0x0 4013; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 4014; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4015; GFX12-NEXT: global_inv scope:SCOPE_DEV 4016; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] 4017; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 4018; GFX12-NEXT: s_wait_alu 0xfffe 4019; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 4020; GFX12-NEXT: s_wait_alu 0xfffe 4021; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 4022; GFX12-NEXT: s_cbranch_execnz .LBB21_4 4023; GFX12-NEXT: ; %bb.5: ; %Flow 4024; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 4025; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 4026; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 4027; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 4028; GFX12-NEXT: s_cbranch_execz .LBB21_2 4029; GFX12-NEXT: .LBB21_6: ; %atomicrmw.private 4030; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 4031; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo 4032; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off 4033; GFX12-NEXT: s_wait_loadcnt 0x0 4034; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] 4035; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4036; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[6:7] 4037; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off 4038; GFX12-NEXT: 
s_wait_alu 0xfffe 4039; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 4040; GFX12-NEXT: s_wait_alu 0xfffe 4041; GFX12-NEXT: s_setpc_b64 s[30:31] 4042; 4043; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: 4044; GFX940: ; %bb.0: 4045; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4046; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 4047; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 4048; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 4049; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 4050; GFX940-NEXT: s_cbranch_execnz .LBB21_3 4051; GFX940-NEXT: ; %bb.1: ; %Flow 4052; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 4053; GFX940-NEXT: s_cbranch_execnz .LBB21_4 4054; GFX940-NEXT: .LBB21_2: ; %atomicrmw.phi 4055; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 4056; GFX940-NEXT: s_setpc_b64 s[30:31] 4057; GFX940-NEXT: .LBB21_3: ; %atomicrmw.global 4058; GFX940-NEXT: buffer_wbl2 sc1 4059; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] 4060; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4061; GFX940-NEXT: buffer_inv sc1 4062; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 4063; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 4064; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 4065; GFX940-NEXT: s_cbranch_execz .LBB21_2 4066; GFX940-NEXT: .LBB21_4: ; %atomicrmw.private 4067; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 4068; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 4069; GFX940-NEXT: s_nop 0 4070; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 4071; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off 4072; GFX940-NEXT: s_waitcnt vmcnt(0) 4073; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 4074; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] 4075; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 4076; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 4077; GFX940-NEXT: s_waitcnt vmcnt(0) 4078; GFX940-NEXT: s_setpc_b64 s[30:31] 4079; 4080; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: 4081; GFX11: ; %bb.0: 4082; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4083; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] 4084; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 4085; GFX11-NEXT: s_mov_b32 s0, exec_lo 4086; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 4087; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 4088; GFX11-NEXT: s_cbranch_execnz .LBB21_3 4089; GFX11-NEXT: ; %bb.1: ; %Flow2 4090; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 4091; GFX11-NEXT: s_cbranch_execnz .LBB21_6 4092; GFX11-NEXT: .LBB21_2: ; %atomicrmw.phi 4093; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 4094; GFX11-NEXT: s_setpc_b64 s[30:31] 4095; GFX11-NEXT: .LBB21_3: ; %atomicrmw.global 4096; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] 4097; GFX11-NEXT: s_mov_b32 s1, 0 4098; GFX11-NEXT: .LBB21_4: ; %atomicrmw.start 4099; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 4100; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4101; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] 4102; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4103; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] 4104; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4105; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc 4106; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4107; GFX11-NEXT: buffer_gl1_inv 4108; GFX11-NEXT: buffer_gl0_inv 4109; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] 4110; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 4111; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 4112; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4113; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 4114; GFX11-NEXT: s_cbranch_execnz .LBB21_4 4115; GFX11-NEXT: ; %bb.5: ; %Flow 4116; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 4117; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 4118; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 4119; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 4120; GFX11-NEXT: s_cbranch_execz .LBB21_2 4121; GFX11-NEXT: .LBB21_6: ; %atomicrmw.private 4122; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 4123; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo 4124; GFX11-NEXT: 
scratch_load_b64 v[0:1], v2, off 4125; GFX11-NEXT: s_waitcnt vmcnt(0) 4126; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 4127; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4128; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] 4129; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off 4130; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 4131; GFX11-NEXT: s_setpc_b64 s[30:31] 4132; 4133; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: 4134; GFX10: ; %bb.0: 4135; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4136; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 4137; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 4138; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 4139; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 4140; GFX10-NEXT: s_cbranch_execnz .LBB21_3 4141; GFX10-NEXT: ; %bb.1: ; %Flow 4142; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 4143; GFX10-NEXT: s_cbranch_execnz .LBB21_4 4144; GFX10-NEXT: .LBB21_2: ; %atomicrmw.phi 4145; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4146; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 4147; GFX10-NEXT: s_setpc_b64 s[30:31] 4148; GFX10-NEXT: .LBB21_3: ; %atomicrmw.global 4149; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4150; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] 4151; GFX10-NEXT: s_waitcnt lgkmcnt(0) 4152; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4153; GFX10-NEXT: buffer_gl1_inv 4154; GFX10-NEXT: buffer_gl0_inv 4155; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 4156; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 4157; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 4158; GFX10-NEXT: s_cbranch_execz .LBB21_2 4159; GFX10-NEXT: .LBB21_4: ; %atomicrmw.private 4160; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 4161; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 4162; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 4163; GFX10-NEXT: s_clause 0x1 4164; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 4165; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 4166; GFX10-NEXT: s_waitcnt vmcnt(0) 4167; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], 
v[0:1] 4168; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] 4169; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 4170; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 4171; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4172; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 4173; GFX10-NEXT: s_setpc_b64 s[30:31] 4174; 4175; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: 4176; GFX90A: ; %bb.0: 4177; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4178; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 4179; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 4180; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 4181; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4182; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 4183; GFX90A-NEXT: ; %bb.1: ; %Flow 4184; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4185; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 4186; GFX90A-NEXT: .LBB21_2: ; %atomicrmw.phi 4187; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 4188; GFX90A-NEXT: s_setpc_b64 s[30:31] 4189; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.global 4190; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] 4191; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4192; GFX90A-NEXT: buffer_wbinvl1 4193; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 4194; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 4195; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4196; GFX90A-NEXT: s_cbranch_execz .LBB21_2 4197; GFX90A-NEXT: .LBB21_4: ; %atomicrmw.private 4198; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 4199; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 4200; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 4201; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 4202; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 4203; GFX90A-NEXT: s_waitcnt vmcnt(0) 4204; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 4205; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] 4206; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 4207; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 
4208; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 4209; GFX90A-NEXT: s_waitcnt vmcnt(0) 4210; GFX90A-NEXT: s_setpc_b64 s[30:31] 4211; 4212; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: 4213; GFX908: ; %bb.0: 4214; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4215; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] 4216; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 4217; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 4218; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 4219; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4220; GFX908-NEXT: s_cbranch_execnz .LBB21_3 4221; GFX908-NEXT: ; %bb.1: ; %Flow2 4222; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4223; GFX908-NEXT: s_cbranch_execnz .LBB21_6 4224; GFX908-NEXT: .LBB21_2: ; %atomicrmw.phi 4225; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 4226; GFX908-NEXT: s_setpc_b64 s[30:31] 4227; GFX908-NEXT: .LBB21_3: ; %atomicrmw.global 4228; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 4229; GFX908-NEXT: s_mov_b64 s[6:7], 0 4230; GFX908-NEXT: .LBB21_4: ; %atomicrmw.start 4231; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 4232; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4233; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] 4234; GFX908-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] 4235; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc 4236; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4237; GFX908-NEXT: buffer_wbinvl1 4238; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] 4239; GFX908-NEXT: v_mov_b32_e32 v5, v3 4240; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 4241; GFX908-NEXT: v_mov_b32_e32 v4, v2 4242; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 4243; GFX908-NEXT: s_cbranch_execnz .LBB21_4 4244; GFX908-NEXT: ; %bb.5: ; %Flow 4245; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 4246; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 4247; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 4248; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4249; GFX908-NEXT: s_cbranch_execz .LBB21_2 4250; GFX908-NEXT: 
.LBB21_6: ; %atomicrmw.private 4251; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 4252; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc 4253; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen 4254; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 4255; GFX908-NEXT: s_waitcnt vmcnt(0) 4256; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 4257; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] 4258; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen 4259; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 4260; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 4261; GFX908-NEXT: s_waitcnt vmcnt(0) 4262; GFX908-NEXT: s_setpc_b64 s[30:31] 4263; 4264; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: 4265; GFX8: ; %bb.0: 4266; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4267; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] 4268; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 4269; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 4270; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4271; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 4272; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 4273; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4274; GFX8-NEXT: s_cbranch_execnz .LBB21_3 4275; GFX8-NEXT: ; %bb.1: ; %Flow2 4276; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4277; GFX8-NEXT: s_cbranch_execnz .LBB21_6 4278; GFX8-NEXT: .LBB21_2: ; %atomicrmw.phi 4279; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4280; GFX8-NEXT: s_setpc_b64 s[30:31] 4281; GFX8-NEXT: .LBB21_3: ; %atomicrmw.global 4282; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 4283; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 4284; GFX8-NEXT: flat_load_dword v5, v[2:3] 4285; GFX8-NEXT: flat_load_dword v4, v[0:1] 4286; GFX8-NEXT: s_mov_b64 s[6:7], 0 4287; GFX8-NEXT: .LBB21_4: ; %atomicrmw.start 4288; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 4289; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4290; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] 4291; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] 4292; GFX8-NEXT: 
flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc 4293; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4294; GFX8-NEXT: buffer_wbinvl1 4295; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] 4296; GFX8-NEXT: v_mov_b32_e32 v5, v3 4297; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 4298; GFX8-NEXT: v_mov_b32_e32 v4, v2 4299; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 4300; GFX8-NEXT: s_cbranch_execnz .LBB21_4 4301; GFX8-NEXT: ; %bb.5: ; %Flow 4302; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 4303; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 4304; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 4305; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4306; GFX8-NEXT: s_cbranch_execz .LBB21_2 4307; GFX8-NEXT: .LBB21_6: ; %atomicrmw.private 4308; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 4309; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc 4310; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 4311; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen 4312; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen 4313; GFX8-NEXT: s_waitcnt vmcnt(0) 4314; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 4315; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] 4316; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen 4317; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen 4318; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4319; GFX8-NEXT: s_waitcnt vmcnt(0) 4320; GFX8-NEXT: s_setpc_b64 s[30:31] 4321; 4322; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory: 4323; GFX7: ; %bb.0: 4324; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4325; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 4326; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 4327; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4328; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 4329; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 4330; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4331; GFX7-NEXT: s_cbranch_execnz .LBB21_3 4332; GFX7-NEXT: ; %bb.1: ; %Flow 4333; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4334; GFX7-NEXT: s_cbranch_execnz .LBB21_4 4335; GFX7-NEXT: .LBB21_2: ; %atomicrmw.phi 
4336; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 4337; GFX7-NEXT: s_setpc_b64 s[30:31] 4338; GFX7-NEXT: .LBB21_3: ; %atomicrmw.global 4339; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] 4340; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4341; GFX7-NEXT: buffer_wbinvl1 4342; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 4343; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 4344; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4345; GFX7-NEXT: s_cbranch_execz .LBB21_2 4346; GFX7-NEXT: .LBB21_4: ; %atomicrmw.private 4347; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 4348; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 4349; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 4350; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 4351; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 4352; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 4353; GFX7-NEXT: s_waitcnt vmcnt(0) 4354; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 4355; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] 4356; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 4357; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen 4358; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 4359; GFX7-NEXT: s_waitcnt vmcnt(0) 4360; GFX7-NEXT: s_setpc_b64 s[30:31] 4361 %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 4362 ret void 4363} 4364 4365define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 { 4366; GFX12-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 4367; GFX12: ; %bb.0: 4368; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4369; GFX12-NEXT: s_wait_expcnt 0x0 4370; GFX12-NEXT: s_wait_samplecnt 0x0 4371; GFX12-NEXT: s_wait_bvhcnt 0x0 4372; GFX12-NEXT: s_wait_kmcnt 0x0 4373; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] 4374; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 4375; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo 4376; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 4377; 
GFX12-NEXT: s_mov_b32 s0, exec_lo 4378; GFX12-NEXT: s_wait_alu 0xfffe 4379; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4380; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 4381; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 4382; GFX12-NEXT: s_cbranch_execnz .LBB22_3 4383; GFX12-NEXT: ; %bb.1: ; %Flow2 4384; GFX12-NEXT: s_wait_alu 0xfffe 4385; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 4386; GFX12-NEXT: s_cbranch_execnz .LBB22_6 4387; GFX12-NEXT: .LBB22_2: ; %atomicrmw.phi 4388; GFX12-NEXT: s_wait_alu 0xfffe 4389; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 4390; GFX12-NEXT: s_wait_alu 0xfffe 4391; GFX12-NEXT: s_setpc_b64 s[30:31] 4392; GFX12-NEXT: .LBB22_3: ; %atomicrmw.global 4393; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] 4394; GFX12-NEXT: s_mov_b32 s1, 0 4395; GFX12-NEXT: .LBB22_4: ; %atomicrmw.start 4396; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 4397; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4398; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] 4399; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4400; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] 4401; GFX12-NEXT: s_wait_storecnt 0x0 4402; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 4403; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4404; GFX12-NEXT: global_inv scope:SCOPE_DEV 4405; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] 4406; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 4407; GFX12-NEXT: s_wait_alu 0xfffe 4408; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 4409; GFX12-NEXT: s_wait_alu 0xfffe 4410; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 4411; GFX12-NEXT: s_cbranch_execnz .LBB22_4 4412; GFX12-NEXT: ; %bb.5: ; %Flow 4413; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 4414; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 4415; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 4416; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 4417; GFX12-NEXT: s_cbranch_execz .LBB22_2 4418; GFX12-NEXT: .LBB22_6: ; %atomicrmw.private 4419; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] 4420; 
GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo 4421; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off 4422; GFX12-NEXT: s_wait_loadcnt 0x0 4423; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] 4424; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4425; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] 4426; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off 4427; GFX12-NEXT: s_wait_alu 0xfffe 4428; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 4429; GFX12-NEXT: s_wait_alu 0xfffe 4430; GFX12-NEXT: s_setpc_b64 s[30:31] 4431; 4432; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 4433; GFX940: ; %bb.0: 4434; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4435; GFX940-NEXT: s_mov_b64 s[0:1], 0x7f8 4436; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] 4437; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 4438; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 4439; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 4440; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 4441; GFX940-NEXT: s_cbranch_execnz .LBB22_3 4442; GFX940-NEXT: ; %bb.1: ; %Flow 4443; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 4444; GFX940-NEXT: s_cbranch_execnz .LBB22_4 4445; GFX940-NEXT: .LBB22_2: ; %atomicrmw.phi 4446; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 4447; GFX940-NEXT: s_setpc_b64 s[30:31] 4448; GFX940-NEXT: .LBB22_3: ; %atomicrmw.global 4449; GFX940-NEXT: buffer_wbl2 sc1 4450; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] 4451; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4452; GFX940-NEXT: buffer_inv sc1 4453; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 4454; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 4455; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 4456; GFX940-NEXT: s_cbranch_execz .LBB22_2 4457; GFX940-NEXT: .LBB22_4: ; %atomicrmw.private 4458; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 4459; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 4460; GFX940-NEXT: s_nop 0 4461; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 4462; GFX940-NEXT: 
scratch_load_dwordx2 v[0:1], v4, off 4463; GFX940-NEXT: s_waitcnt vmcnt(0) 4464; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 4465; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] 4466; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 4467; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 4468; GFX940-NEXT: s_waitcnt vmcnt(0) 4469; GFX940-NEXT: s_setpc_b64 s[30:31] 4470; 4471; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 4472; GFX11: ; %bb.0: 4473; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4474; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 4475; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x7f8, v0 4476; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo 4477; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 4478; GFX11-NEXT: s_mov_b32 s0, exec_lo 4479; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4480; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 4481; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 4482; GFX11-NEXT: s_cbranch_execnz .LBB22_3 4483; GFX11-NEXT: ; %bb.1: ; %Flow2 4484; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 4485; GFX11-NEXT: s_cbranch_execnz .LBB22_6 4486; GFX11-NEXT: .LBB22_2: ; %atomicrmw.phi 4487; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 4488; GFX11-NEXT: s_setpc_b64 s[30:31] 4489; GFX11-NEXT: .LBB22_3: ; %atomicrmw.global 4490; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] 4491; GFX11-NEXT: s_mov_b32 s1, 0 4492; GFX11-NEXT: .LBB22_4: ; %atomicrmw.start 4493; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 4494; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4495; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 4496; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4497; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 4498; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4499; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc 4500; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4501; GFX11-NEXT: buffer_gl1_inv 4502; GFX11-NEXT: buffer_gl0_inv 4503; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] 4504; GFX11-NEXT: 
v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 4505; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 4506; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4507; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 4508; GFX11-NEXT: s_cbranch_execnz .LBB22_4 4509; GFX11-NEXT: ; %bb.5: ; %Flow 4510; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 4511; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 4512; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 4513; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 4514; GFX11-NEXT: s_cbranch_execz .LBB22_2 4515; GFX11-NEXT: .LBB22_6: ; %atomicrmw.private 4516; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] 4517; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo 4518; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off 4519; GFX11-NEXT: s_waitcnt vmcnt(0) 4520; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 4521; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4522; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 4523; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off 4524; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 4525; GFX11-NEXT: s_setpc_b64 s[30:31] 4526; 4527; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 4528; GFX10: ; %bb.0: 4529; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4530; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0 4531; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 4532; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 4533; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 4534; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 4535; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 4536; GFX10-NEXT: s_cbranch_execnz .LBB22_3 4537; GFX10-NEXT: ; %bb.1: ; %Flow 4538; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 4539; GFX10-NEXT: s_cbranch_execnz .LBB22_4 4540; GFX10-NEXT: .LBB22_2: ; %atomicrmw.phi 4541; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4542; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 4543; GFX10-NEXT: s_setpc_b64 s[30:31] 4544; GFX10-NEXT: .LBB22_3: ; %atomicrmw.global 4545; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4546; GFX10-NEXT: 
flat_atomic_fmin_x2 v[0:1], v[2:3] 4547; GFX10-NEXT: s_waitcnt lgkmcnt(0) 4548; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4549; GFX10-NEXT: buffer_gl1_inv 4550; GFX10-NEXT: buffer_gl0_inv 4551; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 4552; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 4553; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 4554; GFX10-NEXT: s_cbranch_execz .LBB22_2 4555; GFX10-NEXT: .LBB22_4: ; %atomicrmw.private 4556; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 4557; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 4558; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 4559; GFX10-NEXT: s_clause 0x1 4560; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 4561; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 4562; GFX10-NEXT: s_waitcnt vmcnt(0) 4563; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 4564; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] 4565; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 4566; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 4567; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4568; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 4569; GFX10-NEXT: s_setpc_b64 s[30:31] 4570; 4571; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 4572; GFX90A: ; %bb.0: 4573; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4574; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7f8, v0 4575; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 4576; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 4577; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 4578; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 4579; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4580; GFX90A-NEXT: s_cbranch_execnz .LBB22_3 4581; GFX90A-NEXT: ; %bb.1: ; %Flow 4582; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4583; GFX90A-NEXT: s_cbranch_execnz .LBB22_4 4584; GFX90A-NEXT: .LBB22_2: ; %atomicrmw.phi 4585; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 4586; GFX90A-NEXT: s_setpc_b64 s[30:31] 4587; GFX90A-NEXT: .LBB22_3: ; %atomicrmw.global 
4588; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] 4589; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4590; GFX90A-NEXT: buffer_wbinvl1 4591; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 4592; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 4593; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4594; GFX90A-NEXT: s_cbranch_execz .LBB22_2 4595; GFX90A-NEXT: .LBB22_4: ; %atomicrmw.private 4596; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 4597; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 4598; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 4599; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 4600; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 4601; GFX90A-NEXT: s_waitcnt vmcnt(0) 4602; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 4603; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] 4604; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 4605; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 4606; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 4607; GFX90A-NEXT: s_waitcnt vmcnt(0) 4608; GFX90A-NEXT: s_setpc_b64 s[30:31] 4609; 4610; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 4611; GFX908: ; %bb.0: 4612; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4613; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 4614; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0x7f8, v0 4615; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc 4616; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 4617; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 4618; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 4619; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4620; GFX908-NEXT: s_cbranch_execnz .LBB22_3 4621; GFX908-NEXT: ; %bb.1: ; %Flow2 4622; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4623; GFX908-NEXT: s_cbranch_execnz .LBB22_6 4624; GFX908-NEXT: .LBB22_2: ; %atomicrmw.phi 4625; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 4626; GFX908-NEXT: s_setpc_b64 s[30:31] 4627; GFX908-NEXT: .LBB22_3: ; %atomicrmw.global 4628; 
GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] 4629; GFX908-NEXT: s_mov_b64 s[6:7], 0 4630; GFX908-NEXT: .LBB22_4: ; %atomicrmw.start 4631; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 4632; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4633; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 4634; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 4635; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc 4636; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4637; GFX908-NEXT: buffer_wbinvl1 4638; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 4639; GFX908-NEXT: v_mov_b32_e32 v3, v1 4640; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 4641; GFX908-NEXT: v_mov_b32_e32 v2, v0 4642; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 4643; GFX908-NEXT: s_cbranch_execnz .LBB22_4 4644; GFX908-NEXT: ; %bb.5: ; %Flow 4645; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 4646; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 4647; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 4648; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4649; GFX908-NEXT: s_cbranch_execz .LBB22_2 4650; GFX908-NEXT: .LBB22_6: ; %atomicrmw.private 4651; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] 4652; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc 4653; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen 4654; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 4655; GFX908-NEXT: s_waitcnt vmcnt(0) 4656; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 4657; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 4658; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen 4659; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 4660; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 4661; GFX908-NEXT: s_waitcnt vmcnt(0) 4662; GFX908-NEXT: s_setpc_b64 s[30:31] 4663; 4664; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 4665; GFX8: ; %bb.0: 4666; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4667; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 4668; GFX8-NEXT: s_mov_b64 
s[4:5], 0xc0 4669; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 4670; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7f8, v0 4671; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 4672; GFX8-NEXT: s_waitcnt lgkmcnt(0) 4673; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 4674; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 4675; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4676; GFX8-NEXT: s_cbranch_execnz .LBB22_3 4677; GFX8-NEXT: ; %bb.1: ; %Flow2 4678; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4679; GFX8-NEXT: s_cbranch_execnz .LBB22_6 4680; GFX8-NEXT: .LBB22_2: ; %atomicrmw.phi 4681; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4682; GFX8-NEXT: s_setpc_b64 s[30:31] 4683; GFX8-NEXT: .LBB22_3: ; %atomicrmw.global 4684; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 4685; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc 4686; GFX8-NEXT: flat_load_dword v3, v[0:1] 4687; GFX8-NEXT: flat_load_dword v2, v[6:7] 4688; GFX8-NEXT: s_mov_b64 s[6:7], 0 4689; GFX8-NEXT: .LBB22_4: ; %atomicrmw.start 4690; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 4691; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4692; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 4693; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 4694; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc 4695; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4696; GFX8-NEXT: buffer_wbinvl1 4697; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 4698; GFX8-NEXT: v_mov_b32_e32 v3, v1 4699; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 4700; GFX8-NEXT: v_mov_b32_e32 v2, v0 4701; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 4702; GFX8-NEXT: s_cbranch_execnz .LBB22_4 4703; GFX8-NEXT: ; %bb.5: ; %Flow 4704; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 4705; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 4706; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 4707; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4708; GFX8-NEXT: s_cbranch_execz .LBB22_2 4709; GFX8-NEXT: .LBB22_6: ; %atomicrmw.private 4710; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] 4711; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc 4712; 
GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 4713; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen 4714; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen 4715; GFX8-NEXT: s_waitcnt vmcnt(0) 4716; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 4717; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 4718; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen 4719; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen 4720; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 4721; GFX8-NEXT: s_waitcnt vmcnt(0) 4722; GFX8-NEXT: s_setpc_b64 s[30:31] 4723; 4724; GFX7-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: 4725; GFX7: ; %bb.0: 4726; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4727; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 4728; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 4729; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7f8, v0 4730; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4731; GFX7-NEXT: s_waitcnt lgkmcnt(0) 4732; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 4733; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 4734; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4735; GFX7-NEXT: s_cbranch_execnz .LBB22_3 4736; GFX7-NEXT: ; %bb.1: ; %Flow 4737; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4738; GFX7-NEXT: s_cbranch_execnz .LBB22_4 4739; GFX7-NEXT: .LBB22_2: ; %atomicrmw.phi 4740; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 4741; GFX7-NEXT: s_setpc_b64 s[30:31] 4742; GFX7-NEXT: .LBB22_3: ; %atomicrmw.global 4743; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] 4744; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4745; GFX7-NEXT: buffer_wbinvl1 4746; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 4747; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 4748; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4749; GFX7-NEXT: s_cbranch_execz .LBB22_2 4750; GFX7-NEXT: .LBB22_4: ; %atomicrmw.private 4751; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 4752; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 4753; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 4754; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 4755; 
GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 4756; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 4757; GFX7-NEXT: s_waitcnt vmcnt(0) 4758; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 4759; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] 4760; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 4761; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen 4762; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 4763; GFX7-NEXT: s_waitcnt vmcnt(0) 4764; GFX7-NEXT: s_setpc_b64 s[30:31] 4765 %gep = getelementptr double, ptr %ptr, i64 255 4766 %unused = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 4767 ret void 4768} 4769 4770define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 { 4771; GFX12-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 4772; GFX12: ; %bb.0: 4773; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4774; GFX12-NEXT: s_wait_expcnt 0x0 4775; GFX12-NEXT: s_wait_samplecnt 0x0 4776; GFX12-NEXT: s_wait_bvhcnt 0x0 4777; GFX12-NEXT: s_wait_kmcnt 0x0 4778; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] 4779; GFX12-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 4780; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo 4781; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 4782; GFX12-NEXT: s_mov_b32 s0, exec_lo 4783; GFX12-NEXT: s_wait_alu 0xfffe 4784; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4785; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v7 4786; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 4787; GFX12-NEXT: s_cbranch_execnz .LBB23_3 4788; GFX12-NEXT: ; %bb.1: ; %Flow2 4789; GFX12-NEXT: s_wait_alu 0xfffe 4790; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 4791; GFX12-NEXT: s_cbranch_execnz .LBB23_6 4792; GFX12-NEXT: .LBB23_2: ; %atomicrmw.phi 4793; GFX12-NEXT: s_wait_alu 0xfffe 4794; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 4795; GFX12-NEXT: s_wait_alu 0xfffe 4796; GFX12-NEXT: s_setpc_b64 s[30:31] 4797; GFX12-NEXT: 
.LBB23_3: ; %atomicrmw.global 4798; GFX12-NEXT: flat_load_b64 v[2:3], v[6:7] 4799; GFX12-NEXT: s_mov_b32 s1, 0 4800; GFX12-NEXT: .LBB23_4: ; %atomicrmw.start 4801; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 4802; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4803; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] 4804; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4805; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] 4806; GFX12-NEXT: s_wait_storecnt 0x0 4807; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 4808; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 4809; GFX12-NEXT: global_inv scope:SCOPE_DEV 4810; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] 4811; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 4812; GFX12-NEXT: s_wait_alu 0xfffe 4813; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 4814; GFX12-NEXT: s_wait_alu 0xfffe 4815; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 4816; GFX12-NEXT: s_cbranch_execnz .LBB23_4 4817; GFX12-NEXT: ; %bb.5: ; %Flow 4818; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 4819; GFX12-NEXT: ; implicit-def: $vgpr6_vgpr7 4820; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 4821; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 4822; GFX12-NEXT: s_cbranch_execz .LBB23_2 4823; GFX12-NEXT: .LBB23_6: ; %atomicrmw.private 4824; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] 4825; GFX12-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo 4826; GFX12-NEXT: scratch_load_b64 v[0:1], v2, off 4827; GFX12-NEXT: s_wait_loadcnt 0x0 4828; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] 4829; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 4830; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] 4831; GFX12-NEXT: scratch_store_b64 v2, v[0:1], off 4832; GFX12-NEXT: s_wait_alu 0xfffe 4833; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 4834; GFX12-NEXT: s_wait_alu 0xfffe 4835; GFX12-NEXT: s_setpc_b64 s[30:31] 4836; 4837; GFX940-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 4838; GFX940: ; 
%bb.0: 4839; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4840; GFX940-NEXT: s_movk_i32 s0, 0xf800 4841; GFX940-NEXT: s_mov_b32 s1, -1 4842; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] 4843; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 4844; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 4845; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 4846; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 4847; GFX940-NEXT: s_cbranch_execnz .LBB23_3 4848; GFX940-NEXT: ; %bb.1: ; %Flow 4849; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 4850; GFX940-NEXT: s_cbranch_execnz .LBB23_4 4851; GFX940-NEXT: .LBB23_2: ; %atomicrmw.phi 4852; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 4853; GFX940-NEXT: s_setpc_b64 s[30:31] 4854; GFX940-NEXT: .LBB23_3: ; %atomicrmw.global 4855; GFX940-NEXT: buffer_wbl2 sc1 4856; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] 4857; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4858; GFX940-NEXT: buffer_inv sc1 4859; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 4860; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 4861; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 4862; GFX940-NEXT: s_cbranch_execz .LBB23_2 4863; GFX940-NEXT: .LBB23_4: ; %atomicrmw.private 4864; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 4865; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 4866; GFX940-NEXT: s_nop 0 4867; GFX940-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 4868; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v4, off 4869; GFX940-NEXT: s_waitcnt vmcnt(0) 4870; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 4871; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] 4872; GFX940-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 4873; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 4874; GFX940-NEXT: s_waitcnt vmcnt(0) 4875; GFX940-NEXT: s_setpc_b64 s[30:31] 4876; 4877; GFX11-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 4878; GFX11: ; %bb.0: 4879; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4880; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 4881; 
GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0xfffff800, v0 4882; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, -1, v1, vcc_lo 4883; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 4884; GFX11-NEXT: s_mov_b32 s0, exec_lo 4885; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4886; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v7 4887; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 4888; GFX11-NEXT: s_cbranch_execnz .LBB23_3 4889; GFX11-NEXT: ; %bb.1: ; %Flow2 4890; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 4891; GFX11-NEXT: s_cbranch_execnz .LBB23_6 4892; GFX11-NEXT: .LBB23_2: ; %atomicrmw.phi 4893; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 4894; GFX11-NEXT: s_setpc_b64 s[30:31] 4895; GFX11-NEXT: .LBB23_3: ; %atomicrmw.global 4896; GFX11-NEXT: flat_load_b64 v[2:3], v[6:7] 4897; GFX11-NEXT: s_mov_b32 s1, 0 4898; GFX11-NEXT: .LBB23_4: ; %atomicrmw.start 4899; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 4900; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4901; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 4902; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4903; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 4904; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 4905; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[6:7], v[0:3] glc 4906; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4907; GFX11-NEXT: buffer_gl1_inv 4908; GFX11-NEXT: buffer_gl0_inv 4909; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] 4910; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 4911; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 4912; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 4913; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 4914; GFX11-NEXT: s_cbranch_execnz .LBB23_4 4915; GFX11-NEXT: ; %bb.5: ; %Flow 4916; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 4917; GFX11-NEXT: ; implicit-def: $vgpr6_vgpr7 4918; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 4919; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 4920; GFX11-NEXT: s_cbranch_execz .LBB23_2 4921; GFX11-NEXT: .LBB23_6: ; %atomicrmw.private 4922; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] 
4923; GFX11-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo 4924; GFX11-NEXT: scratch_load_b64 v[0:1], v2, off 4925; GFX11-NEXT: s_waitcnt vmcnt(0) 4926; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 4927; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 4928; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 4929; GFX11-NEXT: scratch_store_b64 v2, v[0:1], off 4930; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 4931; GFX11-NEXT: s_setpc_b64 s[30:31] 4932; 4933; GFX10-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 4934; GFX10: ; %bb.0: 4935; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4936; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 4937; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 4938; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 4939; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 4940; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 4941; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 4942; GFX10-NEXT: s_cbranch_execnz .LBB23_3 4943; GFX10-NEXT: ; %bb.1: ; %Flow 4944; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 4945; GFX10-NEXT: s_cbranch_execnz .LBB23_4 4946; GFX10-NEXT: .LBB23_2: ; %atomicrmw.phi 4947; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4948; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 4949; GFX10-NEXT: s_setpc_b64 s[30:31] 4950; GFX10-NEXT: .LBB23_3: ; %atomicrmw.global 4951; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4952; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] 4953; GFX10-NEXT: s_waitcnt lgkmcnt(0) 4954; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 4955; GFX10-NEXT: buffer_gl1_inv 4956; GFX10-NEXT: buffer_gl0_inv 4957; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 4958; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 4959; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 4960; GFX10-NEXT: s_cbranch_execz .LBB23_2 4961; GFX10-NEXT: .LBB23_4: ; %atomicrmw.private 4962; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 4963; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 4964; GFX10-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo 4965; GFX10-NEXT: s_clause 0x1 
4966; GFX10-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 4967; GFX10-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 4968; GFX10-NEXT: s_waitcnt vmcnt(0) 4969; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 4970; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] 4971; GFX10-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 4972; GFX10-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 4973; GFX10-NEXT: s_waitcnt_depctr 0xffe3 4974; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 4975; GFX10-NEXT: s_setpc_b64 s[30:31] 4976; 4977; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 4978; GFX90A: ; %bb.0: 4979; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4980; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 4981; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 4982; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 4983; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 4984; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 4985; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 4986; GFX90A-NEXT: s_cbranch_execnz .LBB23_3 4987; GFX90A-NEXT: ; %bb.1: ; %Flow 4988; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 4989; GFX90A-NEXT: s_cbranch_execnz .LBB23_4 4990; GFX90A-NEXT: .LBB23_2: ; %atomicrmw.phi 4991; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 4992; GFX90A-NEXT: s_setpc_b64 s[30:31] 4993; GFX90A-NEXT: .LBB23_3: ; %atomicrmw.global 4994; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] 4995; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 4996; GFX90A-NEXT: buffer_wbinvl1 4997; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 4998; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 4999; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5000; GFX90A-NEXT: s_cbranch_execz .LBB23_2 5001; GFX90A-NEXT: .LBB23_4: ; %atomicrmw.private 5002; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5003; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 5004; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 5005; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 
0 offen offset:4 5006; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 5007; GFX90A-NEXT: s_waitcnt vmcnt(0) 5008; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 5009; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] 5010; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 5011; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 5012; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5013; GFX90A-NEXT: s_waitcnt vmcnt(0) 5014; GFX90A-NEXT: s_setpc_b64 s[30:31] 5015; 5016; GFX908-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 5017; GFX908: ; %bb.0: 5018; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5019; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 5020; GFX908-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffff800, v0 5021; GFX908-NEXT: v_addc_co_u32_e32 v7, vcc, -1, v1, vcc 5022; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 5023; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 5024; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 5025; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5026; GFX908-NEXT: s_cbranch_execnz .LBB23_3 5027; GFX908-NEXT: ; %bb.1: ; %Flow2 5028; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5029; GFX908-NEXT: s_cbranch_execnz .LBB23_6 5030; GFX908-NEXT: .LBB23_2: ; %atomicrmw.phi 5031; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5032; GFX908-NEXT: s_setpc_b64 s[30:31] 5033; GFX908-NEXT: .LBB23_3: ; %atomicrmw.global 5034; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[6:7] 5035; GFX908-NEXT: s_mov_b64 s[6:7], 0 5036; GFX908-NEXT: .LBB23_4: ; %atomicrmw.start 5037; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 5038; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5039; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 5040; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 5041; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc 5042; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5043; GFX908-NEXT: buffer_wbinvl1 5044; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 5045; GFX908-NEXT: v_mov_b32_e32 v3, v1 5046; 
GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5047; GFX908-NEXT: v_mov_b32_e32 v2, v0 5048; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 5049; GFX908-NEXT: s_cbranch_execnz .LBB23_4 5050; GFX908-NEXT: ; %bb.5: ; %Flow 5051; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 5052; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 5053; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 5054; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5055; GFX908-NEXT: s_cbranch_execz .LBB23_2 5056; GFX908-NEXT: .LBB23_6: ; %atomicrmw.private 5057; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] 5058; GFX908-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc 5059; GFX908-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen 5060; GFX908-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 5061; GFX908-NEXT: s_waitcnt vmcnt(0) 5062; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 5063; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 5064; GFX908-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen 5065; GFX908-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 5066; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5067; GFX908-NEXT: s_waitcnt vmcnt(0) 5068; GFX908-NEXT: s_setpc_b64 s[30:31] 5069; 5070; GFX8-LABEL: flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 5071; GFX8: ; %bb.0: 5072; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5073; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 5074; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 5075; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 5076; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xfffff800, v0 5077; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc 5078; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5079; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v7 5080; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 5081; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5082; GFX8-NEXT: s_cbranch_execnz .LBB23_3 5083; GFX8-NEXT: ; %bb.1: ; %Flow2 5084; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5085; GFX8-NEXT: s_cbranch_execnz .LBB23_6 5086; GFX8-NEXT: .LBB23_2: ; %atomicrmw.phi 5087; GFX8-NEXT: s_or_b64 exec, 
exec, s[4:5] 5088; GFX8-NEXT: s_setpc_b64 s[30:31] 5089; GFX8-NEXT: .LBB23_3: ; %atomicrmw.global 5090; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6 5091; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc 5092; GFX8-NEXT: flat_load_dword v3, v[0:1] 5093; GFX8-NEXT: flat_load_dword v2, v[6:7] 5094; GFX8-NEXT: s_mov_b64 s[6:7], 0 5095; GFX8-NEXT: .LBB23_4: ; %atomicrmw.start 5096; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 5097; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5098; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 5099; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 5100; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc 5101; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5102; GFX8-NEXT: buffer_wbinvl1 5103; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] 5104; GFX8-NEXT: v_mov_b32_e32 v3, v1 5105; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5106; GFX8-NEXT: v_mov_b32_e32 v2, v0 5107; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 5108; GFX8-NEXT: s_cbranch_execnz .LBB23_4 5109; GFX8-NEXT: ; %bb.5: ; %Flow 5110; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 5111; GFX8-NEXT: ; implicit-def: $vgpr6_vgpr7 5112; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 5113; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5114; GFX8-NEXT: s_cbranch_execz .LBB23_2 5115; GFX8-NEXT: .LBB23_6: ; %atomicrmw.private 5116; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] 5117; GFX8-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc 5118; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 5119; GFX8-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen 5120; GFX8-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen 5121; GFX8-NEXT: s_waitcnt vmcnt(0) 5122; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 5123; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 5124; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen 5125; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen 5126; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5127; GFX8-NEXT: s_waitcnt vmcnt(0) 5128; GFX8-NEXT: s_setpc_b64 s[30:31] 5129; 5130; GFX7-LABEL: 
flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: 5131; GFX7: ; %bb.0: 5132; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5133; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 5134; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 5135; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 5136; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 5137; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5138; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 5139; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 5140; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5141; GFX7-NEXT: s_cbranch_execnz .LBB23_3 5142; GFX7-NEXT: ; %bb.1: ; %Flow 5143; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5144; GFX7-NEXT: s_cbranch_execnz .LBB23_4 5145; GFX7-NEXT: .LBB23_2: ; %atomicrmw.phi 5146; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 5147; GFX7-NEXT: s_setpc_b64 s[30:31] 5148; GFX7-NEXT: .LBB23_3: ; %atomicrmw.global 5149; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[2:3] 5150; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5151; GFX7-NEXT: buffer_wbinvl1 5152; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 5153; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 5154; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5155; GFX7-NEXT: s_cbranch_execz .LBB23_2 5156; GFX7-NEXT: .LBB23_4: ; %atomicrmw.private 5157; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5158; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 5159; GFX7-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc 5160; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v4 5161; GFX7-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen 5162; GFX7-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen 5163; GFX7-NEXT: s_waitcnt vmcnt(0) 5164; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] 5165; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] 5166; GFX7-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen 5167; GFX7-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen 5168; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 5169; GFX7-NEXT: s_waitcnt vmcnt(0) 5170; GFX7-NEXT: s_setpc_b64 s[30:31] 5171 %gep = getelementptr double, ptr %ptr, i64 -256 5172 %unused 
= atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 5173 ret void 5174} 5175 5176define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr %ptr, double %val) #0 { 5177; GFX12-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: 5178; GFX12: ; %bb.0: 5179; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5180; GFX12-NEXT: s_wait_expcnt 0x0 5181; GFX12-NEXT: s_wait_samplecnt 0x0 5182; GFX12-NEXT: s_wait_bvhcnt 0x0 5183; GFX12-NEXT: s_wait_kmcnt 0x0 5184; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] 5185; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 5186; GFX12-NEXT: s_mov_b32 s0, exec_lo 5187; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 5188; GFX12-NEXT: s_wait_alu 0xfffe 5189; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 5190; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 5191; GFX12-NEXT: s_cbranch_execz .LBB24_4 5192; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global 5193; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] 5194; GFX12-NEXT: s_mov_b32 s1, 0 5195; GFX12-NEXT: .LBB24_2: ; %atomicrmw.start 5196; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 5197; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5198; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 5199; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5200; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] 5201; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] 5202; GFX12-NEXT: s_wait_storecnt 0x0 5203; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 5204; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5205; GFX12-NEXT: global_inv scope:SCOPE_DEV 5206; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] 5207; GFX12-NEXT: s_wait_alu 0xfffe 5208; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 5209; GFX12-NEXT: s_wait_alu 0xfffe 5210; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 5211; GFX12-NEXT: s_cbranch_execnz .LBB24_2 5212; GFX12-NEXT: ; %bb.3: ; %Flow 5213; GFX12-NEXT: s_or_b32 exec_lo, 
exec_lo, s1 5214; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 5215; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 5216; GFX12-NEXT: .LBB24_4: ; %Flow2 5217; GFX12-NEXT: s_wait_alu 0xfffe 5218; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 5219; GFX12-NEXT: s_cbranch_execz .LBB24_6 5220; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private 5221; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 5222; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo 5223; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off 5224; GFX12-NEXT: s_wait_loadcnt 0x0 5225; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] 5226; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 5227; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] 5228; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off 5229; GFX12-NEXT: .LBB24_6: ; %atomicrmw.phi 5230; GFX12-NEXT: s_wait_alu 0xfffe 5231; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 5232; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 5233; GFX12-NEXT: s_wait_alu 0xfffe 5234; GFX12-NEXT: s_setpc_b64 s[30:31] 5235; 5236; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: 5237; GFX940: ; %bb.0: 5238; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5239; GFX940-NEXT: v_mov_b32_e32 v5, v1 5240; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 5241; GFX940-NEXT: v_mov_b32_e32 v4, v0 5242; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 5243; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 5244; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 5245; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 5246; GFX940-NEXT: s_cbranch_execnz .LBB24_3 5247; GFX940-NEXT: ; %bb.1: ; %Flow 5248; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 5249; GFX940-NEXT: s_cbranch_execnz .LBB24_4 5250; GFX940-NEXT: .LBB24_2: ; %atomicrmw.phi 5251; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 5252; GFX940-NEXT: s_setpc_b64 s[30:31] 5253; GFX940-NEXT: .LBB24_3: ; %atomicrmw.global 5254; GFX940-NEXT: buffer_wbl2 sc1 5255; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 5256; GFX940-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 5257; GFX940-NEXT: buffer_inv sc1 5258; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 5259; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 5260; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 5261; GFX940-NEXT: s_cbranch_execz .LBB24_2 5262; GFX940-NEXT: .LBB24_4: ; %atomicrmw.private 5263; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 5264; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 5265; GFX940-NEXT: s_nop 0 5266; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc 5267; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off 5268; GFX940-NEXT: s_waitcnt vmcnt(0) 5269; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 5270; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 5271; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 5272; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 5273; GFX940-NEXT: s_waitcnt vmcnt(0) 5274; GFX940-NEXT: s_setpc_b64 s[30:31] 5275; 5276; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: 5277; GFX11: ; %bb.0: 5278; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5279; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 5280; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 5281; GFX11-NEXT: s_mov_b32 s0, exec_lo 5282; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 5283; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 5284; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 5285; GFX11-NEXT: s_cbranch_execz .LBB24_4 5286; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global 5287; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] 5288; GFX11-NEXT: s_mov_b32 s1, 0 5289; GFX11-NEXT: .LBB24_2: ; %atomicrmw.start 5290; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 5291; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5292; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 5293; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5294; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] 5295; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] 5296; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5297; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc 
5298; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5299; GFX11-NEXT: buffer_gl1_inv 5300; GFX11-NEXT: buffer_gl0_inv 5301; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] 5302; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 5303; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5304; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 5305; GFX11-NEXT: s_cbranch_execnz .LBB24_2 5306; GFX11-NEXT: ; %bb.3: ; %Flow 5307; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 5308; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 5309; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 5310; GFX11-NEXT: .LBB24_4: ; %Flow2 5311; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 5312; GFX11-NEXT: s_cbranch_execz .LBB24_6 5313; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private 5314; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 5315; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo 5316; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off 5317; GFX11-NEXT: s_waitcnt vmcnt(0) 5318; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 5319; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 5320; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 5321; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off 5322; GFX11-NEXT: .LBB24_6: ; %atomicrmw.phi 5323; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 5324; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 5325; GFX11-NEXT: s_setpc_b64 s[30:31] 5326; 5327; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: 5328; GFX10: ; %bb.0: 5329; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5330; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 5331; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 5332; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 5333; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v1 5334; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 5335; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 5336; GFX10-NEXT: s_cbranch_execz .LBB24_4 5337; GFX10-NEXT: ; %bb.1: ; %atomicrmw.global 5338; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 5339; GFX10-NEXT: s_mov_b32 s5, 0 5340; GFX10-NEXT: .LBB24_2: ; %atomicrmw.start 5341; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 5342; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5343; GFX10-NEXT: v_mov_b32_e32 v9, v3 5344; GFX10-NEXT: v_mov_b32_e32 v8, v2 5345; GFX10-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] 5346; GFX10-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] 5347; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5348; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc 5349; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5350; GFX10-NEXT: buffer_gl1_inv 5351; GFX10-NEXT: buffer_gl0_inv 5352; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] 5353; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 5354; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 5355; GFX10-NEXT: s_cbranch_execnz .LBB24_2 5356; GFX10-NEXT: ; %bb.3: ; %Flow 5357; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 5358; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 5359; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 5360; GFX10-NEXT: .LBB24_4: ; %Flow2 5361; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 5362; GFX10-NEXT: s_cbranch_execz .LBB24_6 5363; GFX10-NEXT: ; %bb.5: ; %atomicrmw.private 5364; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 5365; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo 5366; GFX10-NEXT: s_clause 0x1 5367; GFX10-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen 5368; GFX10-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 5369; GFX10-NEXT: s_waitcnt vmcnt(0) 5370; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 5371; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 5372; GFX10-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 5373; GFX10-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 5374; GFX10-NEXT: .LBB24_6: ; %atomicrmw.phi 5375; GFX10-NEXT: s_waitcnt_depctr 0xffe3 5376; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 5377; GFX10-NEXT: v_mov_b32_e32 v0, v2 5378; GFX10-NEXT: v_mov_b32_e32 v1, v3 5379; GFX10-NEXT: s_setpc_b64 s[30:31] 5380; 5381; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: 5382; GFX90A: ; %bb.0: 5383; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 5384; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 5385; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 5386; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 5387; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 5388; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 5389; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5390; GFX90A-NEXT: s_cbranch_execz .LBB24_4 5391; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global 5392; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 5393; GFX90A-NEXT: s_mov_b64 s[6:7], 0 5394; GFX90A-NEXT: .LBB24_2: ; %atomicrmw.start 5395; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 5396; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5397; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] 5398; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] 5399; GFX90A-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] 5400; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc 5401; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5402; GFX90A-NEXT: buffer_wbinvl1 5403; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] 5404; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5405; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 5406; GFX90A-NEXT: s_cbranch_execnz .LBB24_2 5407; GFX90A-NEXT: ; %bb.3: ; %Flow 5408; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 5409; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 5410; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 5411; GFX90A-NEXT: .LBB24_4: ; %Flow2 5412; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5413; GFX90A-NEXT: s_cbranch_execz .LBB24_6 5414; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private 5415; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5416; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc 5417; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen 5418; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 5419; GFX90A-NEXT: s_waitcnt vmcnt(0) 5420; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 5421; GFX90A-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 5422; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 5423; GFX90A-NEXT: 
buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 5424; GFX90A-NEXT: .LBB24_6: ; %atomicrmw.phi 5425; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5426; GFX90A-NEXT: v_mov_b32_e32 v0, v2 5427; GFX90A-NEXT: v_mov_b32_e32 v1, v3 5428; GFX90A-NEXT: s_waitcnt vmcnt(0) 5429; GFX90A-NEXT: s_setpc_b64 s[30:31] 5430; 5431; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: 5432; GFX908: ; %bb.0: 5433; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5434; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 5435; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 5436; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 5437; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 5438; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 5439; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5440; GFX908-NEXT: s_cbranch_execz .LBB24_4 5441; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global 5442; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 5443; GFX908-NEXT: s_mov_b64 s[6:7], 0 5444; GFX908-NEXT: .LBB24_2: ; %atomicrmw.start 5445; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 5446; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5447; GFX908-NEXT: v_mov_b32_e32 v9, v3 5448; GFX908-NEXT: v_mov_b32_e32 v8, v2 5449; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] 5450; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] 5451; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc 5452; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5453; GFX908-NEXT: buffer_wbinvl1 5454; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] 5455; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5456; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 5457; GFX908-NEXT: s_cbranch_execnz .LBB24_2 5458; GFX908-NEXT: ; %bb.3: ; %Flow 5459; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 5460; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 5461; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 5462; GFX908-NEXT: .LBB24_4: ; %Flow2 5463; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5464; GFX908-NEXT: s_cbranch_execz .LBB24_6 5465; GFX908-NEXT: ; %bb.5: ; 
%atomicrmw.private 5466; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5467; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc 5468; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen 5469; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 5470; GFX908-NEXT: s_waitcnt vmcnt(0) 5471; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 5472; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 5473; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 5474; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 5475; GFX908-NEXT: .LBB24_6: ; %atomicrmw.phi 5476; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5477; GFX908-NEXT: v_mov_b32_e32 v0, v2 5478; GFX908-NEXT: v_mov_b32_e32 v1, v3 5479; GFX908-NEXT: s_waitcnt vmcnt(0) 5480; GFX908-NEXT: s_setpc_b64 s[30:31] 5481; 5482; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: 5483; GFX8: ; %bb.0: 5484; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5485; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 5486; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 5487; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 5488; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 5489; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5490; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 5491; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 5492; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5493; GFX8-NEXT: s_cbranch_execz .LBB24_4 5494; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global 5495; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 5496; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 5497; GFX8-NEXT: flat_load_dword v3, v[2:3] 5498; GFX8-NEXT: flat_load_dword v2, v[0:1] 5499; GFX8-NEXT: s_mov_b64 s[6:7], 0 5500; GFX8-NEXT: .LBB24_2: ; %atomicrmw.start 5501; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 5502; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5503; GFX8-NEXT: v_mov_b32_e32 v9, v3 5504; GFX8-NEXT: v_mov_b32_e32 v8, v2 5505; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] 5506; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] 5507; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] 
glc 5508; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5509; GFX8-NEXT: buffer_wbinvl1 5510; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] 5511; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5512; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 5513; GFX8-NEXT: s_cbranch_execnz .LBB24_2 5514; GFX8-NEXT: ; %bb.3: ; %Flow 5515; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 5516; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5517; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 5518; GFX8-NEXT: .LBB24_4: ; %Flow2 5519; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5520; GFX8-NEXT: s_cbranch_execz .LBB24_6 5521; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private 5522; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5523; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc 5524; GFX8-NEXT: v_add_u32_e32 v7, vcc, 4, v6 5525; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen 5526; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen 5527; GFX8-NEXT: s_waitcnt vmcnt(0) 5528; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 5529; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 5530; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 5531; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen 5532; GFX8-NEXT: .LBB24_6: ; %atomicrmw.phi 5533; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5534; GFX8-NEXT: v_mov_b32_e32 v0, v2 5535; GFX8-NEXT: v_mov_b32_e32 v1, v3 5536; GFX8-NEXT: s_waitcnt vmcnt(0) 5537; GFX8-NEXT: s_setpc_b64 s[30:31] 5538; 5539; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory: 5540; GFX7: ; %bb.0: 5541; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5542; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 5543; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 5544; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 5545; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 5546; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5547; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 5548; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 5549; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5550; GFX7-NEXT: s_cbranch_execz .LBB24_4 5551; GFX7-NEXT: ; %bb.1: ; %atomicrmw.global 5552; 
GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 5553; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 5554; GFX7-NEXT: flat_load_dword v3, v[2:3] 5555; GFX7-NEXT: flat_load_dword v2, v[0:1] 5556; GFX7-NEXT: s_mov_b64 s[6:7], 0 5557; GFX7-NEXT: .LBB24_2: ; %atomicrmw.start 5558; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 5559; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5560; GFX7-NEXT: v_mov_b32_e32 v9, v3 5561; GFX7-NEXT: v_mov_b32_e32 v8, v2 5562; GFX7-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] 5563; GFX7-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] 5564; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc 5565; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5566; GFX7-NEXT: buffer_wbinvl1 5567; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] 5568; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5569; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] 5570; GFX7-NEXT: s_cbranch_execnz .LBB24_2 5571; GFX7-NEXT: ; %bb.3: ; %Flow 5572; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 5573; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 5574; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 5575; GFX7-NEXT: .LBB24_4: ; %Flow2 5576; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5577; GFX7-NEXT: s_cbranch_execz .LBB24_6 5578; GFX7-NEXT: ; %bb.5: ; %atomicrmw.private 5579; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5580; GFX7-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc 5581; GFX7-NEXT: v_add_i32_e32 v7, vcc, 4, v6 5582; GFX7-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen 5583; GFX7-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen 5584; GFX7-NEXT: s_waitcnt vmcnt(0) 5585; GFX7-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 5586; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 5587; GFX7-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 5588; GFX7-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen 5589; GFX7-NEXT: .LBB24_6: ; %atomicrmw.phi 5590; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 5591; GFX7-NEXT: v_mov_b32_e32 v0, v2 5592; GFX7-NEXT: v_mov_b32_e32 v1, v3 5593; GFX7-NEXT: s_waitcnt vmcnt(0) 5594; GFX7-NEXT: s_setpc_b64 s[30:31] 5595 %result = 
atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 5596 ret double %result 5597} 5598 5599define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr %ptr, double %val) #0 { 5600; GFX12-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 5601; GFX12: ; %bb.0: 5602; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5603; GFX12-NEXT: s_wait_expcnt 0x0 5604; GFX12-NEXT: s_wait_samplecnt 0x0 5605; GFX12-NEXT: s_wait_bvhcnt 0x0 5606; GFX12-NEXT: s_wait_kmcnt 0x0 5607; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] 5608; GFX12-NEXT: s_mov_b64 s[0:1], src_private_base 5609; GFX12-NEXT: s_mov_b32 s0, exec_lo 5610; GFX12-NEXT: ; implicit-def: $vgpr2_vgpr3 5611; GFX12-NEXT: s_wait_alu 0xfffe 5612; GFX12-NEXT: v_cmpx_ne_u32_e64 s1, v1 5613; GFX12-NEXT: s_xor_b32 s0, exec_lo, s0 5614; GFX12-NEXT: s_cbranch_execz .LBB25_4 5615; GFX12-NEXT: ; %bb.1: ; %atomicrmw.global 5616; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] 5617; GFX12-NEXT: s_mov_b32 s1, 0 5618; GFX12-NEXT: .LBB25_2: ; %atomicrmw.start 5619; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 5620; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5621; GFX12-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 5622; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5623; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[8:9], v[8:9] 5624; GFX12-NEXT: v_min_num_f64_e32 v[6:7], v[2:3], v[4:5] 5625; GFX12-NEXT: s_wait_storecnt 0x0 5626; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 5627; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 5628; GFX12-NEXT: global_inv scope:SCOPE_DEV 5629; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] 5630; GFX12-NEXT: s_wait_alu 0xfffe 5631; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 5632; GFX12-NEXT: s_wait_alu 0xfffe 5633; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 5634; GFX12-NEXT: s_cbranch_execnz .LBB25_2 5635; 
GFX12-NEXT: ; %bb.3: ; %Flow 5636; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 5637; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 5638; GFX12-NEXT: ; implicit-def: $vgpr4_vgpr5 5639; GFX12-NEXT: .LBB25_4: ; %Flow2 5640; GFX12-NEXT: s_wait_alu 0xfffe 5641; GFX12-NEXT: s_and_not1_saveexec_b32 s0, s0 5642; GFX12-NEXT: s_cbranch_execz .LBB25_6 5643; GFX12-NEXT: ; %bb.5: ; %atomicrmw.private 5644; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 5645; GFX12-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo 5646; GFX12-NEXT: scratch_load_b64 v[2:3], v6, off 5647; GFX12-NEXT: s_wait_loadcnt 0x0 5648; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] 5649; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 5650; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5] 5651; GFX12-NEXT: scratch_store_b64 v6, v[0:1], off 5652; GFX12-NEXT: .LBB25_6: ; %atomicrmw.phi 5653; GFX12-NEXT: s_wait_alu 0xfffe 5654; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 5655; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 5656; GFX12-NEXT: s_wait_alu 0xfffe 5657; GFX12-NEXT: s_setpc_b64 s[30:31] 5658; 5659; GFX940-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 5660; GFX940: ; %bb.0: 5661; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5662; GFX940-NEXT: v_mov_b32_e32 v5, v1 5663; GFX940-NEXT: s_mov_b64 s[0:1], src_private_base 5664; GFX940-NEXT: v_mov_b32_e32 v4, v0 5665; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 5666; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1 5667; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc 5668; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] 5669; GFX940-NEXT: s_cbranch_execnz .LBB25_3 5670; GFX940-NEXT: ; %bb.1: ; %Flow 5671; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 5672; GFX940-NEXT: s_cbranch_execnz .LBB25_4 5673; GFX940-NEXT: .LBB25_2: ; %atomicrmw.phi 5674; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 5675; GFX940-NEXT: s_setpc_b64 s[30:31] 5676; GFX940-NEXT: .LBB25_3: ; %atomicrmw.global 5677; GFX940-NEXT: buffer_wbl2 sc1 5678; 
GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 5679; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5680; GFX940-NEXT: buffer_inv sc1 5681; GFX940-NEXT: ; implicit-def: $vgpr4_vgpr5 5682; GFX940-NEXT: ; implicit-def: $vgpr2_vgpr3 5683; GFX940-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 5684; GFX940-NEXT: s_cbranch_execz .LBB25_2 5685; GFX940-NEXT: .LBB25_4: ; %atomicrmw.private 5686; GFX940-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 5687; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 5688; GFX940-NEXT: s_nop 0 5689; GFX940-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc 5690; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v6, off 5691; GFX940-NEXT: s_waitcnt vmcnt(0) 5692; GFX940-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 5693; GFX940-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 5694; GFX940-NEXT: scratch_store_dwordx2 v6, v[2:3], off sc0 sc1 5695; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 5696; GFX940-NEXT: s_waitcnt vmcnt(0) 5697; GFX940-NEXT: s_setpc_b64 s[30:31] 5698; 5699; GFX11-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 5700; GFX11: ; %bb.0: 5701; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5702; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 5703; GFX11-NEXT: s_mov_b64 s[0:1], src_private_base 5704; GFX11-NEXT: s_mov_b32 s0, exec_lo 5705; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 5706; GFX11-NEXT: v_cmpx_ne_u32_e64 s1, v1 5707; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 5708; GFX11-NEXT: s_cbranch_execz .LBB25_4 5709; GFX11-NEXT: ; %bb.1: ; %atomicrmw.global 5710; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] 5711; GFX11-NEXT: s_mov_b32 s1, 0 5712; GFX11-NEXT: .LBB25_2: ; %atomicrmw.start 5713; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 5714; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5715; GFX11-NEXT: v_dual_mov_b32 v9, v3 :: v_dual_mov_b32 v8, v2 5716; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 5717; GFX11-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] 5718; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], 
v[4:5] 5719; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 5720; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[6:9] glc 5721; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5722; GFX11-NEXT: buffer_gl1_inv 5723; GFX11-NEXT: buffer_gl0_inv 5724; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[8:9] 5725; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 5726; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 5727; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 5728; GFX11-NEXT: s_cbranch_execnz .LBB25_2 5729; GFX11-NEXT: ; %bb.3: ; %Flow 5730; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 5731; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 5732; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 5733; GFX11-NEXT: .LBB25_4: ; %Flow2 5734; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 5735; GFX11-NEXT: s_cbranch_execz .LBB25_6 5736; GFX11-NEXT: ; %bb.5: ; %atomicrmw.private 5737; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] 5738; GFX11-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo 5739; GFX11-NEXT: scratch_load_b64 v[2:3], v6, off 5740; GFX11-NEXT: s_waitcnt vmcnt(0) 5741; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 5742; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 5743; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 5744; GFX11-NEXT: scratch_store_b64 v6, v[0:1], off 5745; GFX11-NEXT: .LBB25_6: ; %atomicrmw.phi 5746; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 5747; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 5748; GFX11-NEXT: s_setpc_b64 s[30:31] 5749; 5750; GFX10-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 5751; GFX10: ; %bb.0: 5752; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5753; GFX10-NEXT: v_mov_b32_e32 v5, v1 5754; GFX10-NEXT: v_mov_b32_e32 v4, v0 5755; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base 5756; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 5757; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 5758; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo 5759; GFX10-NEXT: s_xor_b32 s4, exec_lo, s4 5760; GFX10-NEXT: s_cbranch_execnz .LBB25_3 5761; 
GFX10-NEXT: ; %bb.1: ; %Flow 5762; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 5763; GFX10-NEXT: s_cbranch_execnz .LBB25_4 5764; GFX10-NEXT: .LBB25_2: ; %atomicrmw.phi 5765; GFX10-NEXT: s_waitcnt_depctr 0xffe3 5766; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 5767; GFX10-NEXT: s_setpc_b64 s[30:31] 5768; GFX10-NEXT: .LBB25_3: ; %atomicrmw.global 5769; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 5770; GFX10-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc 5771; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5772; GFX10-NEXT: buffer_gl1_inv 5773; GFX10-NEXT: buffer_gl0_inv 5774; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5 5775; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3 5776; GFX10-NEXT: s_andn2_saveexec_b32 s4, s4 5777; GFX10-NEXT: s_cbranch_execz .LBB25_2 5778; GFX10-NEXT: .LBB25_4: ; %atomicrmw.private 5779; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 5780; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 5781; GFX10-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo 5782; GFX10-NEXT: s_clause 0x1 5783; GFX10-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen 5784; GFX10-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 5785; GFX10-NEXT: s_waitcnt vmcnt(0) 5786; GFX10-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 5787; GFX10-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 5788; GFX10-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen 5789; GFX10-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 5790; GFX10-NEXT: s_waitcnt_depctr 0xffe3 5791; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 5792; GFX10-NEXT: s_setpc_b64 s[30:31] 5793; 5794; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 5795; GFX90A: ; %bb.0: 5796; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5797; GFX90A-NEXT: v_mov_b32_e32 v5, v1 5798; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base 5799; GFX90A-NEXT: v_mov_b32_e32 v4, v0 5800; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 5801; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 5802; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc 5803; 
GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5804; GFX90A-NEXT: s_cbranch_execnz .LBB25_3 5805; GFX90A-NEXT: ; %bb.1: ; %Flow 5806; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5807; GFX90A-NEXT: s_cbranch_execnz .LBB25_4 5808; GFX90A-NEXT: .LBB25_2: ; %atomicrmw.phi 5809; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5810; GFX90A-NEXT: s_setpc_b64 s[30:31] 5811; GFX90A-NEXT: .LBB25_3: ; %atomicrmw.global 5812; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] glc 5813; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5814; GFX90A-NEXT: buffer_wbinvl1 5815; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 5816; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 5817; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5818; GFX90A-NEXT: s_cbranch_execz .LBB25_2 5819; GFX90A-NEXT: .LBB25_4: ; %atomicrmw.private 5820; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 5821; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc 5822; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen 5823; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 5824; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 5825; GFX90A-NEXT: s_waitcnt vmcnt(0) 5826; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 5827; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] 5828; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen 5829; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 5830; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 5831; GFX90A-NEXT: s_waitcnt vmcnt(0) 5832; GFX90A-NEXT: s_setpc_b64 s[30:31] 5833; 5834; GFX908-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 5835; GFX908: ; %bb.0: 5836; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5837; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 5838; GFX908-NEXT: s_mov_b64 s[4:5], src_private_base 5839; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 5840; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 5841; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc 5842; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5843; 
GFX908-NEXT: s_cbranch_execz .LBB25_4 5844; GFX908-NEXT: ; %bb.1: ; %atomicrmw.global 5845; GFX908-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 5846; GFX908-NEXT: s_mov_b64 s[6:7], 0 5847; GFX908-NEXT: .LBB25_2: ; %atomicrmw.start 5848; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 5849; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5850; GFX908-NEXT: v_mov_b32_e32 v9, v3 5851; GFX908-NEXT: v_mov_b32_e32 v8, v2 5852; GFX908-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] 5853; GFX908-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] 5854; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc 5855; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5856; GFX908-NEXT: buffer_wbinvl1 5857; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] 5858; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5859; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 5860; GFX908-NEXT: s_cbranch_execnz .LBB25_2 5861; GFX908-NEXT: ; %bb.3: ; %Flow 5862; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 5863; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1 5864; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 5865; GFX908-NEXT: .LBB25_4: ; %Flow2 5866; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5867; GFX908-NEXT: s_cbranch_execz .LBB25_6 5868; GFX908-NEXT: ; %bb.5: ; %atomicrmw.private 5869; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5870; GFX908-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc 5871; GFX908-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen 5872; GFX908-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 5873; GFX908-NEXT: s_waitcnt vmcnt(0) 5874; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 5875; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 5876; GFX908-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 5877; GFX908-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 5878; GFX908-NEXT: .LBB25_6: ; %atomicrmw.phi 5879; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 5880; GFX908-NEXT: v_mov_b32_e32 v0, v2 5881; GFX908-NEXT: v_mov_b32_e32 v1, v3 5882; GFX908-NEXT: s_waitcnt vmcnt(0) 5883; GFX908-NEXT: s_setpc_b64 s[30:31] 5884; 
5885; GFX8-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 5886; GFX8: ; %bb.0: 5887; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5888; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 5889; GFX8-NEXT: s_mov_b64 s[4:5], 0xc0 5890; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 5891; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 5892; GFX8-NEXT: s_waitcnt lgkmcnt(0) 5893; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 5894; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc 5895; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5896; GFX8-NEXT: s_cbranch_execz .LBB25_4 5897; GFX8-NEXT: ; %bb.1: ; %atomicrmw.global 5898; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 5899; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 5900; GFX8-NEXT: flat_load_dword v3, v[2:3] 5901; GFX8-NEXT: flat_load_dword v2, v[0:1] 5902; GFX8-NEXT: s_mov_b64 s[6:7], 0 5903; GFX8-NEXT: .LBB25_2: ; %atomicrmw.start 5904; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 5905; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5906; GFX8-NEXT: v_mov_b32_e32 v9, v3 5907; GFX8-NEXT: v_mov_b32_e32 v8, v2 5908; GFX8-NEXT: v_max_f64 v[2:3], v[8:9], v[8:9] 5909; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[4:5] 5910; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc 5911; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5912; GFX8-NEXT: buffer_wbinvl1 5913; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] 5914; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 5915; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 5916; GFX8-NEXT: s_cbranch_execnz .LBB25_2 5917; GFX8-NEXT: ; %bb.3: ; %Flow 5918; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 5919; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 5920; GFX8-NEXT: ; implicit-def: $vgpr4_vgpr5 5921; GFX8-NEXT: .LBB25_4: ; %Flow2 5922; GFX8-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5923; GFX8-NEXT: s_cbranch_execz .LBB25_6 5924; GFX8-NEXT: ; %bb.5: ; %atomicrmw.private 5925; GFX8-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] 5926; GFX8-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc 5927; GFX8-NEXT: v_add_u32_e32 v7, 
vcc, 4, v6 5928; GFX8-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen 5929; GFX8-NEXT: buffer_load_dword v3, v7, s[0:3], 0 offen 5930; GFX8-NEXT: s_waitcnt vmcnt(0) 5931; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 5932; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] 5933; GFX8-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen 5934; GFX8-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen 5935; GFX8-NEXT: .LBB25_6: ; %atomicrmw.phi 5936; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 5937; GFX8-NEXT: v_mov_b32_e32 v0, v2 5938; GFX8-NEXT: v_mov_b32_e32 v1, v3 5939; GFX8-NEXT: s_waitcnt vmcnt(0) 5940; GFX8-NEXT: s_setpc_b64 s[30:31] 5941; 5942; GFX7-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: 5943; GFX7: ; %bb.0: 5944; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5945; GFX7-NEXT: s_mov_b64 s[4:5], 0xc0 5946; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 5947; GFX7-NEXT: v_mov_b32_e32 v5, v1 5948; GFX7-NEXT: v_mov_b32_e32 v4, v0 5949; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1 5950; GFX7-NEXT: s_waitcnt lgkmcnt(0) 5951; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 5952; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc 5953; GFX7-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 5954; GFX7-NEXT: s_cbranch_execnz .LBB25_3 5955; GFX7-NEXT: ; %bb.1: ; %Flow 5956; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5957; GFX7-NEXT: s_cbranch_execnz .LBB25_4 5958; GFX7-NEXT: .LBB25_2: ; %atomicrmw.phi 5959; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 5960; GFX7-NEXT: s_setpc_b64 s[30:31] 5961; GFX7-NEXT: .LBB25_3: ; %atomicrmw.global 5962; GFX7-NEXT: flat_atomic_fmin_x2 v[0:1], v[4:5], v[2:3] glc 5963; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 5964; GFX7-NEXT: buffer_wbinvl1 5965; GFX7-NEXT: ; implicit-def: $vgpr4_vgpr5 5966; GFX7-NEXT: ; implicit-def: $vgpr2_vgpr3 5967; GFX7-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 5968; GFX7-NEXT: s_cbranch_execz .LBB25_2 5969; GFX7-NEXT: .LBB25_4: ; %atomicrmw.private 5970; GFX7-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] 5971; GFX7-NEXT: 
v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX7-NEXT:    v_cndmask_b32_e32 v6, -1, v4, vcc
; GFX7-NEXT:    v_add_i32_e32 v7, vcc, 4, v6
; GFX7-NEXT:    buffer_load_dword v0, v6, s[0:3], 0 offen
; GFX7-NEXT:    buffer_load_dword v1, v7, s[0:3], 0 offen
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX7-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
; GFX7-NEXT:    buffer_store_dword v2, v6, s[0:3], 0 offen
; GFX7-NEXT:    buffer_store_dword v3, v7, s[0:3], 0 offen
; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
  ret double %result
}

; --------------------------------------------------------------------
; half
; --------------------------------------------------------------------

; Value-returning `atomicrmw fmin` on a half through a flat pointer at
; syncscope("agent"), seq_cst, tagged with !amdgpu.no.fine.grained.memory.
; For every target checked below, the expected code is a compare-and-swap
; loop on the 4-byte-aligned dword that contains the half: the address is
; masked with -4, the low two address bits are turned into a bit shift, and
; the new half is merged into the loaded dword under a shifted 0xffff mask.
; None of the expectations uses a 16-bit atomic min instruction. The hawaii
; expectations additionally go through f32 (v_cvt_f32_f16 / v_cvt_f16_f32)
; around a v_min_f32.
define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_mov_b32_e32 v3, v0
; GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT:    s_mov_b32 s0, 0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_not_b32_e32 v4, v4
; GFX12-NEXT:  .LBB26_1: ; %atomicrmw.start
; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    v_mov_b32_e32 v6, v5
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
; GFX12-NEXT:    v_max_num_f16_e32 v5, v5, v5
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_min_num_f16_e32 v5, v5, v2
; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT:    v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT:    s_wait_storecnt 0x0
; GFX12-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    global_inv scope:SCOPE_DEV
; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT:    s_cbranch_execnz .LBB26_1
; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT:    s_wait_alu 0xfffe
; GFX12-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v3, v0
; GFX940-NEXT:    v_and_b32_e32 v0, -4, v3
; GFX940-NEXT:    flat_load_dword v5, v[0:1]
; GFX940-NEXT:    v_and_b32_e32 v3, 3, v3
; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
; GFX940-NEXT:    s_mov_b32 s0, 0xffff
; GFX940-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT:    v_not_b32_e32 v4, v4
; GFX940-NEXT:    s_mov_b64 s[0:1], 0
; GFX940-NEXT:    v_max_f16_e32 v2, v2, v2
; GFX940-NEXT:  .LBB26_1: ; %atomicrmw.start
; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v7, v5
; GFX940-NEXT:    v_lshrrev_b32_e32 v5, v3, v7
; GFX940-NEXT:    v_max_f16_e32 v5, v5, v5
; GFX940-NEXT:    v_min_f16_e32 v5, v5, v2
; GFX940-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
; GFX940-NEXT:    v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT:    buffer_wbl2 sc1
; GFX940-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT:    buffer_inv sc1
; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT:    s_cbranch_execnz .LBB26_1
; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v3, v0
; GFX11-NEXT:    v_max_f16_e32 v2, v2, v2
; GFX11-NEXT:    s_mov_b32 s0, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_and_b32_e32 v0, -4, v3
; GFX11-NEXT:    v_and_b32_e32 v3, 3, v3
; GFX11-NEXT:    flat_load_b32 v5, v[0:1]
; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_not_b32_e32 v4, v4
; GFX11-NEXT:  .LBB26_1: ; %atomicrmw.start
; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v6, v5
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
; GFX11-NEXT:    v_max_f16_e32 v5, v5, v5
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_min_f16_e32 v5, v5, v2
; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v5
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
; GFX11-NEXT:    v_and_or_b32 v5, v6, v4, v5
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT:    buffer_gl1_inv
; GFX11-NEXT:    buffer_gl0_inv
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    s_cbranch_execnz .LBB26_1
; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v3, v0
; GFX10-NEXT:    v_max_f16_e32 v2, v2, v2
; GFX10-NEXT:    s_mov_b32 s4, 0
; GFX10-NEXT:    v_and_b32_e32 v0, -4, v3
; GFX10-NEXT:    v_and_b32_e32 v3, 3, v3
; GFX10-NEXT:    flat_load_dword v5, v[0:1]
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX10-NEXT:    v_not_b32_e32 v4, v4
; GFX10-NEXT:  .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v6, v5
; GFX10-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
; GFX10-NEXT:    v_max_f16_e32 v5, v5, v5
; GFX10-NEXT:    v_min_f16_e32 v5, v5, v2
; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-NEXT:    v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    s_cbranch_execnz .LBB26_1
; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
; GFX90A-NEXT:    v_and_b32_e32 v0, -4, v3
; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
; GFX90A-NEXT:    v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
; GFX90A-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT:    v_not_b32_e32 v4, v4
; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
; GFX90A-NEXT:    v_max_f16_e32 v2, v2, v2
; GFX90A-NEXT:  .LBB26_1: ; %atomicrmw.start
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v7, v5
; GFX90A-NEXT:    v_lshrrev_b32_e32 v5, v3, v7
; GFX90A-NEXT:    v_max_f16_e32 v5, v5, v5
; GFX90A-NEXT:    v_min_f16_e32 v5, v5, v2
; GFX90A-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
; GFX90A-NEXT:    v_and_or_b32 v6, v7, v4, v5
; GFX90A-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    buffer_wbinvl1
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB26_1
; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v3, v0
; GFX908-NEXT:    v_and_b32_e32 v0, -4, v3
; GFX908-NEXT:    flat_load_dword v5, v[0:1]
; GFX908-NEXT:    v_and_b32_e32 v3, 3, v3
; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
; GFX908-NEXT:    s_mov_b32 s4, 0xffff
; GFX908-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
; GFX908-NEXT:    v_not_b32_e32 v4, v4
; GFX908-NEXT:    s_mov_b64 s[4:5], 0
; GFX908-NEXT:    v_max_f16_e32 v2, v2, v2
; GFX908-NEXT:  .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v6, v5
; GFX908-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
; GFX908-NEXT:    v_max_f16_e32 v5, v5, v5
; GFX908-NEXT:    v_min_f16_e32 v5, v5, v2
; GFX908-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
; GFX908-NEXT:    v_and_or_b32 v5, v6, v4, v5
; GFX908-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT:    buffer_wbinvl1
; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT:    s_cbranch_execnz .LBB26_1
; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v3, v0
; GFX8-NEXT:    v_and_b32_e32 v0, -4, v3
; GFX8-NEXT:    flat_load_dword v5, v[0:1]
; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
; GFX8-NEXT:    s_mov_b32 s4, 0xffff
; GFX8-NEXT:    v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT:    v_not_b32_e32 v4, v4
; GFX8-NEXT:    s_mov_b64 s[4:5], 0
; GFX8-NEXT:    v_max_f16_e32 v2, v2, v2
; GFX8-NEXT:  .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v6, v5
; GFX8-NEXT:    v_lshrrev_b32_e32 v5, v3, v6
; GFX8-NEXT:    v_max_f16_e32 v5, v5, v5
; GFX8-NEXT:    v_min_f16_e32 v5, v5, v2
; GFX8-NEXT:    v_and_b32_e32 v7, v6, v4
; GFX8-NEXT:    v_lshlrev_b32_e32 v5, v3, v5
; GFX8-NEXT:    v_or_b32_e32 v5, v7, v5
; GFX8-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_execnz .LBB26_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v3, v0
; GFX7-NEXT:    v_and_b32_e32 v0, -4, v3
; GFX7-NEXT:    flat_load_dword v5, v[0:1]
; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT:    v_and_b32_e32 v2, 3, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT:    v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT:    v_not_b32_e32 v4, v4
; GFX7-NEXT:  .LBB26_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v6, v5
; GFX7-NEXT:    v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT:    v_and_b32_e32 v7, v6, v4
; GFX7-NEXT:    v_min_f32_e32 v5, v5, v3
; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT:    v_or_b32_e32 v5, v7, v5
; GFX7-NEXT:    flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1
; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v6
; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_execnz .LBB26_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT:    v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT:    s_setpc_b64 s[30:31]
  %result = atomicrmw fmin ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret half %result
}

define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
; GFX12-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_add_co_u32 v3, vcc_lo, 0x7fe, v0
; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX12-NEXT:    v_max_num_f16_e32 v2, v2, v2
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_and_b32_e32 v0, -4, v3
; GFX12-NEXT:    v_and_b32_e32 v3, 3, v3
; GFX12-NEXT:    s_mov_b32 s0, 0
; GFX12-NEXT:    flat_load_b32 v5, v[0:1]
; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT:    v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_not_b32_e32 v4, v4
; GFX12-NEXT:  .LBB27_1: ;
%atomicrmw.start 6316; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 6317; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6318; GFX12-NEXT: v_mov_b32_e32 v6, v5 6319; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6320; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6321; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 6322; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6323; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 6324; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 6325; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6326; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6327; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 6328; GFX12-NEXT: s_wait_storecnt 0x0 6329; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 6330; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6331; GFX12-NEXT: global_inv scope:SCOPE_DEV 6332; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 6333; GFX12-NEXT: s_wait_alu 0xfffe 6334; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 6335; GFX12-NEXT: s_wait_alu 0xfffe 6336; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 6337; GFX12-NEXT: s_cbranch_execnz .LBB27_1 6338; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 6339; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 6340; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6341; GFX12-NEXT: s_wait_alu 0xfffe 6342; GFX12-NEXT: s_setpc_b64 s[30:31] 6343; 6344; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 6345; GFX940: ; %bb.0: 6346; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6347; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 6348; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 6349; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 6350; GFX940-NEXT: v_mov_b32_e32 v1, v5 6351; GFX940-NEXT: flat_load_dword v5, v[0:1] 6352; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 6353; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6354; GFX940-NEXT: s_mov_b32 s0, 0xffff 6355; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 
6356; GFX940-NEXT: v_not_b32_e32 v4, v4 6357; GFX940-NEXT: s_mov_b64 s[0:1], 0 6358; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 6359; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start 6360; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 6361; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6362; GFX940-NEXT: v_mov_b32_e32 v7, v5 6363; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 6364; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 6365; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 6366; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6367; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 6368; GFX940-NEXT: buffer_wbl2 sc1 6369; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 6370; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6371; GFX940-NEXT: buffer_inv sc1 6372; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 6373; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6374; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 6375; GFX940-NEXT: s_cbranch_execnz .LBB27_1 6376; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 6377; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 6378; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6379; GFX940-NEXT: s_setpc_b64 s[30:31] 6380; 6381; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 6382; GFX11: ; %bb.0: 6383; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6384; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 6385; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 6386; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 6387; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 6388; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 6389; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 6390; GFX11-NEXT: s_mov_b32 s0, 0 6391; GFX11-NEXT: flat_load_b32 v5, v[0:1] 6392; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6393; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 6394; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 6395; GFX11-NEXT: v_not_b32_e32 v4, v4 6396; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start 6397; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 6398; GFX11-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 6399; GFX11-NEXT: v_mov_b32_e32 v6, v5 6400; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6401; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6402; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 6403; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6404; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 6405; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 6406; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6407; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6408; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 6409; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 6410; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 6411; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6412; GFX11-NEXT: buffer_gl1_inv 6413; GFX11-NEXT: buffer_gl0_inv 6414; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 6415; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 6416; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6417; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 6418; GFX11-NEXT: s_cbranch_execnz .LBB27_1 6419; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 6420; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 6421; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6422; GFX11-NEXT: s_setpc_b64 s[30:31] 6423; 6424; GFX10-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 6425; GFX10: ; %bb.0: 6426; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6427; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 6428; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 6429; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 6430; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 6431; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 6432; GFX10-NEXT: s_mov_b32 s4, 0 6433; GFX10-NEXT: flat_load_dword v5, v[0:1] 6434; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6435; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 6436; GFX10-NEXT: v_not_b32_e32 v4, v4 6437; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start 6438; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 
6439; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6440; GFX10-NEXT: v_mov_b32_e32 v6, v5 6441; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6442; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 6443; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 6444; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 6445; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 6446; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 6447; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 6448; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6449; GFX10-NEXT: buffer_gl1_inv 6450; GFX10-NEXT: buffer_gl0_inv 6451; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 6452; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 6453; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 6454; GFX10-NEXT: s_cbranch_execnz .LBB27_1 6455; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 6456; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 6457; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6458; GFX10-NEXT: s_setpc_b64 s[30:31] 6459; 6460; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 6461; GFX90A: ; %bb.0: 6462; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6463; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 6464; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 6465; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 6466; GFX90A-NEXT: flat_load_dword v5, v[0:1] 6467; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 6468; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6469; GFX90A-NEXT: s_mov_b32 s4, 0xffff 6470; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 6471; GFX90A-NEXT: v_not_b32_e32 v4, v4 6472; GFX90A-NEXT: s_mov_b64 s[4:5], 0 6473; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 6474; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start 6475; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 6476; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6477; GFX90A-NEXT: v_mov_b32_e32 v7, v5 6478; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 6479; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 6480; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 6481; 
GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6482; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 6483; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc 6484; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6485; GFX90A-NEXT: buffer_wbinvl1 6486; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 6487; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6488; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 6489; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 6490; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 6491; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 6492; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6493; GFX90A-NEXT: s_setpc_b64 s[30:31] 6494; 6495; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 6496; GFX908: ; %bb.0: 6497; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6498; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 6499; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 6500; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 6501; GFX908-NEXT: flat_load_dword v5, v[0:1] 6502; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 6503; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6504; GFX908-NEXT: s_mov_b32 s4, 0xffff 6505; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 6506; GFX908-NEXT: v_not_b32_e32 v4, v4 6507; GFX908-NEXT: s_mov_b64 s[4:5], 0 6508; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 6509; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start 6510; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 6511; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6512; GFX908-NEXT: v_mov_b32_e32 v6, v5 6513; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6514; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 6515; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 6516; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6517; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 6518; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 6519; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6520; GFX908-NEXT: buffer_wbinvl1 6521; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 6522; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6523; GFX908-NEXT: s_andn2_b64 exec, 
exec, s[4:5] 6524; GFX908-NEXT: s_cbranch_execnz .LBB27_1 6525; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 6526; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 6527; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6528; GFX908-NEXT: s_setpc_b64 s[30:31] 6529; 6530; GFX8-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 6531; GFX8: ; %bb.0: 6532; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6533; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 6534; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6535; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 6536; GFX8-NEXT: flat_load_dword v5, v[0:1] 6537; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 6538; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6539; GFX8-NEXT: s_mov_b32 s4, 0xffff 6540; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 6541; GFX8-NEXT: v_not_b32_e32 v4, v4 6542; GFX8-NEXT: s_mov_b64 s[4:5], 0 6543; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 6544; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start 6545; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 6546; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6547; GFX8-NEXT: v_mov_b32_e32 v6, v5 6548; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6549; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 6550; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 6551; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 6552; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6553; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 6554; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 6555; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6556; GFX8-NEXT: buffer_wbinvl1 6557; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 6558; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6559; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 6560; GFX8-NEXT: s_cbranch_execnz .LBB27_1 6561; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 6562; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 6563; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6564; GFX8-NEXT: s_setpc_b64 s[30:31] 6565; 6566; GFX7-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 6567; GFX7: ; %bb.0: 6568; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 6569; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 6570; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6571; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 6572; GFX7-NEXT: flat_load_dword v5, v[0:1] 6573; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 6574; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 6575; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 6576; GFX7-NEXT: s_mov_b64 s[4:5], 0 6577; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 6578; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 6579; GFX7-NEXT: v_not_b32_e32 v4, v4 6580; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start 6581; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 6582; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6583; GFX7-NEXT: v_mov_b32_e32 v6, v5 6584; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 6585; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 6586; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 6587; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 6588; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 6589; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 6590; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 6591; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 6592; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6593; GFX7-NEXT: buffer_wbinvl1 6594; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 6595; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6596; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 6597; GFX7-NEXT: s_cbranch_execnz .LBB27_1 6598; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 6599; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 6600; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 6601; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 6602; GFX7-NEXT: s_setpc_b64 s[30:31] 6603 %gep = getelementptr half, ptr %ptr, i64 1023 6604 %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 6605 ret half %result 6606} 6607 6608define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { 6609; GFX12-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 6610; GFX12: ; %bb.0: 6611; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6612; 
GFX12-NEXT: s_wait_expcnt 0x0 6613; GFX12-NEXT: s_wait_samplecnt 0x0 6614; GFX12-NEXT: s_wait_bvhcnt 0x0 6615; GFX12-NEXT: s_wait_kmcnt 0x0 6616; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 6617; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 6618; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 6619; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 6620; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 6621; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 6622; GFX12-NEXT: s_mov_b32 s0, 0 6623; GFX12-NEXT: flat_load_b32 v5, v[0:1] 6624; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6625; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 6626; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 6627; GFX12-NEXT: v_not_b32_e32 v4, v4 6628; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start 6629; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 6630; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6631; GFX12-NEXT: v_mov_b32_e32 v6, v5 6632; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6633; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6634; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 6635; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6636; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 6637; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 6638; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6639; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6640; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 6641; GFX12-NEXT: s_wait_storecnt 0x0 6642; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 6643; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6644; GFX12-NEXT: global_inv scope:SCOPE_DEV 6645; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 6646; GFX12-NEXT: s_wait_alu 0xfffe 6647; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 6648; GFX12-NEXT: s_wait_alu 0xfffe 6649; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 6650; GFX12-NEXT: s_cbranch_execnz .LBB28_1 6651; GFX12-NEXT: ; %bb.2: ; 
%atomicrmw.end 6652; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 6653; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6654; GFX12-NEXT: s_wait_alu 0xfffe 6655; GFX12-NEXT: s_setpc_b64 s[30:31] 6656; 6657; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 6658; GFX940: ; %bb.0: 6659; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6660; GFX940-NEXT: s_movk_i32 s0, 0xf800 6661; GFX940-NEXT: s_mov_b32 s1, -1 6662; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 6663; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 6664; GFX940-NEXT: v_mov_b32_e32 v1, v5 6665; GFX940-NEXT: flat_load_dword v5, v[0:1] 6666; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 6667; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6668; GFX940-NEXT: s_mov_b32 s0, 0xffff 6669; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 6670; GFX940-NEXT: v_not_b32_e32 v4, v4 6671; GFX940-NEXT: s_mov_b64 s[0:1], 0 6672; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 6673; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start 6674; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 6675; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6676; GFX940-NEXT: v_mov_b32_e32 v7, v5 6677; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 6678; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 6679; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 6680; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6681; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 6682; GFX940-NEXT: buffer_wbl2 sc1 6683; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 6684; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6685; GFX940-NEXT: buffer_inv sc1 6686; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 6687; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6688; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 6689; GFX940-NEXT: s_cbranch_execnz .LBB28_1 6690; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 6691; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 6692; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6693; GFX940-NEXT: s_setpc_b64 s[30:31] 6694; 6695; GFX11-LABEL: 
flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 6696; GFX11: ; %bb.0: 6697; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6698; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 6699; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 6700; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 6701; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 6702; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 6703; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 6704; GFX11-NEXT: s_mov_b32 s0, 0 6705; GFX11-NEXT: flat_load_b32 v5, v[0:1] 6706; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6707; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 6708; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 6709; GFX11-NEXT: v_not_b32_e32 v4, v4 6710; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start 6711; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 6712; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6713; GFX11-NEXT: v_mov_b32_e32 v6, v5 6714; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6715; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6716; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 6717; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6718; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 6719; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 6720; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6721; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6722; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 6723; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 6724; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 6725; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6726; GFX11-NEXT: buffer_gl1_inv 6727; GFX11-NEXT: buffer_gl0_inv 6728; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 6729; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 6730; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 6731; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 6732; GFX11-NEXT: s_cbranch_execnz .LBB28_1 6733; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 
6734; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 6735; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6736; GFX11-NEXT: s_setpc_b64 s[30:31] 6737; 6738; GFX10-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 6739; GFX10: ; %bb.0: 6740; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6741; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 6742; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 6743; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 6744; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 6745; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 6746; GFX10-NEXT: s_mov_b32 s4, 0 6747; GFX10-NEXT: flat_load_dword v5, v[0:1] 6748; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6749; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 6750; GFX10-NEXT: v_not_b32_e32 v4, v4 6751; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start 6752; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 6753; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6754; GFX10-NEXT: v_mov_b32_e32 v6, v5 6755; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6756; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 6757; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 6758; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 6759; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 6760; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 6761; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 6762; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6763; GFX10-NEXT: buffer_gl1_inv 6764; GFX10-NEXT: buffer_gl0_inv 6765; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 6766; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 6767; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 6768; GFX10-NEXT: s_cbranch_execnz .LBB28_1 6769; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 6770; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 6771; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6772; GFX10-NEXT: s_setpc_b64 s[30:31] 6773; 6774; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 6775; GFX90A: ; %bb.0: 6776; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6777; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 6778; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 6779; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 6780; GFX90A-NEXT: flat_load_dword v5, v[0:1] 6781; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 6782; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6783; GFX90A-NEXT: s_mov_b32 s4, 0xffff 6784; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 6785; GFX90A-NEXT: v_not_b32_e32 v4, v4 6786; GFX90A-NEXT: s_mov_b64 s[4:5], 0 6787; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 6788; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start 6789; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 6790; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6791; GFX90A-NEXT: v_mov_b32_e32 v7, v5 6792; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 6793; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 6794; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 6795; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6796; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 6797; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc 6798; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6799; GFX90A-NEXT: buffer_wbinvl1 6800; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 6801; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6802; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 6803; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 6804; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 6805; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 6806; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6807; GFX90A-NEXT: s_setpc_b64 s[30:31] 6808; 6809; GFX908-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 6810; GFX908: ; %bb.0: 6811; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6812; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 6813; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 6814; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 6815; GFX908-NEXT: flat_load_dword v5, v[0:1] 6816; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 6817; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6818; GFX908-NEXT: 
s_mov_b32 s4, 0xffff 6819; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 6820; GFX908-NEXT: v_not_b32_e32 v4, v4 6821; GFX908-NEXT: s_mov_b64 s[4:5], 0 6822; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 6823; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start 6824; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 6825; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6826; GFX908-NEXT: v_mov_b32_e32 v6, v5 6827; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 6828; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 6829; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 6830; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6831; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 6832; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 6833; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6834; GFX908-NEXT: buffer_wbinvl1 6835; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 6836; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6837; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 6838; GFX908-NEXT: s_cbranch_execnz .LBB28_1 6839; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 6840; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 6841; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6842; GFX908-NEXT: s_setpc_b64 s[30:31] 6843; 6844; GFX8-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 6845; GFX8: ; %bb.0: 6846; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6847; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 6848; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 6849; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 6850; GFX8-NEXT: flat_load_dword v5, v[0:1] 6851; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 6852; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6853; GFX8-NEXT: s_mov_b32 s4, 0xffff 6854; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 6855; GFX8-NEXT: v_not_b32_e32 v4, v4 6856; GFX8-NEXT: s_mov_b64 s[4:5], 0 6857; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 6858; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start 6859; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 6860; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6861; GFX8-NEXT: v_mov_b32_e32 v6, v5 6862; GFX8-NEXT: 
v_lshrrev_b32_e32 v5, v3, v6 6863; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 6864; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 6865; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 6866; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 6867; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 6868; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 6869; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6870; GFX8-NEXT: buffer_wbinvl1 6871; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 6872; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6873; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 6874; GFX8-NEXT: s_cbranch_execnz .LBB28_1 6875; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 6876; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 6877; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 6878; GFX8-NEXT: s_setpc_b64 s[30:31] 6879; 6880; GFX7-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 6881; GFX7: ; %bb.0: 6882; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6883; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0 6884; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 6885; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 6886; GFX7-NEXT: flat_load_dword v5, v[0:1] 6887; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 6888; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 6889; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 6890; GFX7-NEXT: s_mov_b64 s[4:5], 0 6891; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 6892; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 6893; GFX7-NEXT: v_not_b32_e32 v4, v4 6894; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start 6895; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 6896; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6897; GFX7-NEXT: v_mov_b32_e32 v6, v5 6898; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 6899; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 6900; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 6901; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 6902; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 6903; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 6904; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 6905; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 6906; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6907; 
GFX7-NEXT: buffer_wbinvl1 6908; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 6909; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 6910; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 6911; GFX7-NEXT: s_cbranch_execnz .LBB28_1 6912; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 6913; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 6914; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 6915; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 6916; GFX7-NEXT: s_setpc_b64 s[30:31] 6917 %gep = getelementptr half, ptr %ptr, i64 -1024 6918 %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 6919 ret half %result 6920 } 6921 6922define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { 6923; GFX12-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: 6924; GFX12: ; %bb.0: 6925; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6926; GFX12-NEXT: s_wait_expcnt 0x0 6927; GFX12-NEXT: s_wait_samplecnt 0x0 6928; GFX12-NEXT: s_wait_bvhcnt 0x0 6929; GFX12-NEXT: s_wait_kmcnt 0x0 6930; GFX12-NEXT: v_mov_b32_e32 v3, v0 6931; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 6932; GFX12-NEXT: s_mov_b32 s0, 0 6933; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 6934; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 6935; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 6936; GFX12-NEXT: flat_load_b32 v4, v[0:1] 6937; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 6938; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 6939; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 6940; GFX12-NEXT: v_not_b32_e32 v6, v3 6941; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start 6942; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 6943; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6944; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 6945; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 6946; GFX12-NEXT: v_max_num_f16_e32 v3, v3, v3 6947; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 6948; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) 6949; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 6950; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3 6951; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 6952; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 6953; GFX12-NEXT: s_wait_storecnt 0x0 6954; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 6955; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 6956; GFX12-NEXT: global_inv scope:SCOPE_DEV 6957; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 6958; GFX12-NEXT: v_mov_b32_e32 v4, v3 6959; GFX12-NEXT: s_wait_alu 0xfffe 6960; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 6961; GFX12-NEXT: s_wait_alu 0xfffe 6962; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 6963; GFX12-NEXT: s_cbranch_execnz .LBB29_1 6964; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 6965; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 6966; GFX12-NEXT: s_wait_alu 0xfffe 6967; GFX12-NEXT: s_setpc_b64 s[30:31] 6968; 6969; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: 6970; GFX940: ; %bb.0: 6971; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 6972; GFX940-NEXT: v_mov_b32_e32 v3, v0 6973; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 6974; GFX940-NEXT: flat_load_dword v5, v[0:1] 6975; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 6976; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 6977; GFX940-NEXT: s_mov_b32 s0, 0xffff 6978; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 6979; GFX940-NEXT: v_not_b32_e32 v6, v4 6980; GFX940-NEXT: s_mov_b64 s[0:1], 0 6981; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 6982; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start 6983; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 6984; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6985; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5 6986; GFX940-NEXT: v_max_f16_e32 v4, v4, v4 6987; GFX940-NEXT: v_min_f16_e32 v4, v4, v2 6988; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4 6989; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 6990; GFX940-NEXT: buffer_wbl2 sc1 6991; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 6992; 
GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 6993; GFX940-NEXT: buffer_inv sc1 6994; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 6995; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 6996; GFX940-NEXT: v_mov_b32_e32 v5, v4 6997; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 6998; GFX940-NEXT: s_cbranch_execnz .LBB29_1 6999; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 7000; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 7001; GFX940-NEXT: s_setpc_b64 s[30:31] 7002; 7003; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: 7004; GFX11: ; %bb.0: 7005; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7006; GFX11-NEXT: v_mov_b32_e32 v3, v0 7007; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 7008; GFX11-NEXT: s_mov_b32 s0, 0 7009; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 7010; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 7011; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 7012; GFX11-NEXT: flat_load_b32 v4, v[0:1] 7013; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 7014; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 7015; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 7016; GFX11-NEXT: v_not_b32_e32 v6, v3 7017; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start 7018; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 7019; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7020; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 7021; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7022; GFX11-NEXT: v_max_f16_e32 v3, v3, v3 7023; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 7024; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7025; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 7026; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 7027; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 7028; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 7029; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 7030; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 7031; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7032; GFX11-NEXT: buffer_gl1_inv 7033; GFX11-NEXT: 
buffer_gl0_inv 7034; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 7035; GFX11-NEXT: v_mov_b32_e32 v4, v3 7036; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 7037; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7038; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 7039; GFX11-NEXT: s_cbranch_execnz .LBB29_1 7040; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 7041; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 7042; GFX11-NEXT: s_setpc_b64 s[30:31] 7043; 7044; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: 7045; GFX10: ; %bb.0: 7046; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7047; GFX10-NEXT: v_mov_b32_e32 v3, v0 7048; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 7049; GFX10-NEXT: s_mov_b32 s4, 0 7050; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 7051; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 7052; GFX10-NEXT: flat_load_dword v4, v[0:1] 7053; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 7054; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 7055; GFX10-NEXT: v_not_b32_e32 v6, v3 7056; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start 7057; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 7058; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7059; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4 7060; GFX10-NEXT: v_max_f16_e32 v3, v3, v3 7061; GFX10-NEXT: v_min_f16_e32 v3, v3, v2 7062; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 7063; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 7064; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 7065; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7066; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7067; GFX10-NEXT: buffer_gl1_inv 7068; GFX10-NEXT: buffer_gl0_inv 7069; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 7070; GFX10-NEXT: v_mov_b32_e32 v4, v3 7071; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 7072; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 7073; GFX10-NEXT: s_cbranch_execnz .LBB29_1 7074; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 7075; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 7076; GFX10-NEXT: s_setpc_b64 s[30:31] 
7077; 7078; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: 7079; GFX90A: ; %bb.0: 7080; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7081; GFX90A-NEXT: v_mov_b32_e32 v3, v0 7082; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 7083; GFX90A-NEXT: flat_load_dword v5, v[0:1] 7084; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 7085; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 7086; GFX90A-NEXT: s_mov_b32 s4, 0xffff 7087; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 7088; GFX90A-NEXT: v_not_b32_e32 v6, v4 7089; GFX90A-NEXT: s_mov_b64 s[4:5], 0 7090; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 7091; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start 7092; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 7093; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7094; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5 7095; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4 7096; GFX90A-NEXT: v_min_f16_e32 v4, v4, v2 7097; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4 7098; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 7099; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc 7100; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7101; GFX90A-NEXT: buffer_wbinvl1 7102; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 7103; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7104; GFX90A-NEXT: v_mov_b32_e32 v5, v4 7105; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 7106; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 7107; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 7108; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7109; GFX90A-NEXT: s_setpc_b64 s[30:31] 7110; 7111; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: 7112; GFX908: ; %bb.0: 7113; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7114; GFX908-NEXT: v_mov_b32_e32 v3, v0 7115; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 7116; GFX908-NEXT: flat_load_dword v4, v[0:1] 7117; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 7118; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 7119; GFX908-NEXT: s_mov_b32 s4, 0xffff 7120; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 7121; GFX908-NEXT: 
v_not_b32_e32 v6, v3 7122; GFX908-NEXT: s_mov_b64 s[4:5], 0 7123; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 7124; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start 7125; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 7126; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7127; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4 7128; GFX908-NEXT: v_max_f16_e32 v3, v3, v3 7129; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 7130; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3 7131; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 7132; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7133; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7134; GFX908-NEXT: buffer_wbinvl1 7135; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7136; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7137; GFX908-NEXT: v_mov_b32_e32 v4, v3 7138; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 7139; GFX908-NEXT: s_cbranch_execnz .LBB29_1 7140; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 7141; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7142; GFX908-NEXT: s_setpc_b64 s[30:31] 7143; 7144; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: 7145; GFX8: ; %bb.0: 7146; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7147; GFX8-NEXT: v_mov_b32_e32 v3, v0 7148; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 7149; GFX8-NEXT: flat_load_dword v4, v[0:1] 7150; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 7151; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 7152; GFX8-NEXT: s_mov_b32 s4, 0xffff 7153; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 7154; GFX8-NEXT: v_not_b32_e32 v6, v3 7155; GFX8-NEXT: s_mov_b64 s[4:5], 0 7156; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 7157; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start 7158; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 7159; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7160; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4 7161; GFX8-NEXT: v_max_f16_e32 v3, v3, v3 7162; GFX8-NEXT: v_min_f16_e32 v3, v3, v2 7163; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 7164; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3 7165; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 7166; GFX8-NEXT: 
flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7167; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7168; GFX8-NEXT: buffer_wbinvl1 7169; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7170; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7171; GFX8-NEXT: v_mov_b32_e32 v4, v3 7172; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 7173; GFX8-NEXT: s_cbranch_execnz .LBB29_1 7174; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 7175; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 7176; GFX8-NEXT: s_setpc_b64 s[30:31] 7177; 7178; GFX7-LABEL: flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory: 7179; GFX7: ; %bb.0: 7180; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7181; GFX7-NEXT: v_mov_b32_e32 v3, v0 7182; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 7183; GFX7-NEXT: flat_load_dword v4, v[0:1] 7184; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2 7185; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 7186; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 7187; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2 7188; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 7189; GFX7-NEXT: v_not_b32_e32 v6, v3 7190; GFX7-NEXT: s_mov_b64 s[4:5], 0 7191; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start 7192; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 7193; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7194; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4 7195; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 7196; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 7197; GFX7-NEXT: v_min_f32_e32 v3, v3, v5 7198; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 7199; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3 7200; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 7201; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 7202; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7203; GFX7-NEXT: buffer_wbinvl1 7204; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7205; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7206; GFX7-NEXT: v_mov_b32_e32 v4, v3 7207; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 7208; GFX7-NEXT: s_cbranch_execnz .LBB29_1 7209; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 7210; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 7211; GFX7-NEXT: s_setpc_b64 s[30:31] 7212 %unused = 
atomicrmw fmin ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 7213 ret void 7214} 7215 7216define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { 7217; GFX12-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 7218; GFX12: ; %bb.0: 7219; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7220; GFX12-NEXT: s_wait_expcnt 0x0 7221; GFX12-NEXT: s_wait_samplecnt 0x0 7222; GFX12-NEXT: s_wait_bvhcnt 0x0 7223; GFX12-NEXT: s_wait_kmcnt 0x0 7224; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 7225; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 7226; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 7227; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 7228; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 7229; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 7230; GFX12-NEXT: s_mov_b32 s0, 0 7231; GFX12-NEXT: flat_load_b32 v3, v[0:1] 7232; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7233; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 7234; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 7235; GFX12-NEXT: v_not_b32_e32 v5, v5 7236; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start 7237; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 7238; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7239; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7240; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7241; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 7242; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 7243; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7244; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 7245; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 7246; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 7247; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 7248; GFX12-NEXT: s_wait_storecnt 0x0 7249; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 7250; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7251; 
GFX12-NEXT: global_inv scope:SCOPE_DEV 7252; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 7253; GFX12-NEXT: v_mov_b32_e32 v3, v2 7254; GFX12-NEXT: s_wait_alu 0xfffe 7255; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 7256; GFX12-NEXT: s_wait_alu 0xfffe 7257; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 7258; GFX12-NEXT: s_cbranch_execnz .LBB30_1 7259; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 7260; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 7261; GFX12-NEXT: s_wait_alu 0xfffe 7262; GFX12-NEXT: s_setpc_b64 s[30:31] 7263; 7264; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 7265; GFX940: ; %bb.0: 7266; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7267; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 7268; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 7269; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 7270; GFX940-NEXT: v_mov_b32_e32 v1, v5 7271; GFX940-NEXT: flat_load_dword v3, v[0:1] 7272; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 7273; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7274; GFX940-NEXT: s_mov_b32 s0, 0xffff 7275; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 7276; GFX940-NEXT: v_not_b32_e32 v5, v5 7277; GFX940-NEXT: s_mov_b64 s[0:1], 0 7278; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 7279; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start 7280; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 7281; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7282; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7283; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 7284; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 7285; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 7286; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 7287; GFX940-NEXT: buffer_wbl2 sc1 7288; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 7289; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7290; GFX940-NEXT: buffer_inv sc1 7291; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7292; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 7293; GFX940-NEXT: v_mov_b32_e32 v3, v2 7294; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 7295; GFX940-NEXT: 
s_cbranch_execnz .LBB30_1 7296; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 7297; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 7298; GFX940-NEXT: s_setpc_b64 s[30:31] 7299; 7300; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 7301; GFX11: ; %bb.0: 7302; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7303; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 7304; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 7305; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 7306; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 7307; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 7308; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 7309; GFX11-NEXT: s_mov_b32 s0, 0 7310; GFX11-NEXT: flat_load_b32 v3, v[0:1] 7311; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7312; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 7313; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 7314; GFX11-NEXT: v_not_b32_e32 v5, v5 7315; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start 7316; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 7317; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7318; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7319; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7320; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 7321; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 7322; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7323; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 7324; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 7325; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 7326; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 7327; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 7328; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 7329; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7330; GFX11-NEXT: buffer_gl1_inv 7331; GFX11-NEXT: buffer_gl0_inv 7332; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 7333; GFX11-NEXT: v_mov_b32_e32 v3, v2 7334; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 7335; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
7336; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 7337; GFX11-NEXT: s_cbranch_execnz .LBB30_1 7338; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 7339; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 7340; GFX11-NEXT: s_setpc_b64 s[30:31] 7341; 7342; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 7343; GFX10: ; %bb.0: 7344; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7345; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 7346; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 7347; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 7348; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 7349; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 7350; GFX10-NEXT: s_mov_b32 s4, 0 7351; GFX10-NEXT: flat_load_dword v3, v[0:1] 7352; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7353; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 7354; GFX10-NEXT: v_not_b32_e32 v5, v5 7355; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start 7356; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 7357; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7358; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7359; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 7360; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 7361; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 7362; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 7363; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 7364; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7365; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7366; GFX10-NEXT: buffer_gl1_inv 7367; GFX10-NEXT: buffer_gl0_inv 7368; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 7369; GFX10-NEXT: v_mov_b32_e32 v3, v2 7370; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 7371; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 7372; GFX10-NEXT: s_cbranch_execnz .LBB30_1 7373; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 7374; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 7375; GFX10-NEXT: s_setpc_b64 s[30:31] 7376; 7377; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 
7378; GFX90A: ; %bb.0: 7379; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7380; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 7381; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 7382; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 7383; GFX90A-NEXT: flat_load_dword v3, v[0:1] 7384; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 7385; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7386; GFX90A-NEXT: s_mov_b32 s4, 0xffff 7387; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 7388; GFX90A-NEXT: v_not_b32_e32 v5, v5 7389; GFX90A-NEXT: s_mov_b64 s[4:5], 0 7390; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 7391; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start 7392; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 7393; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7394; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7395; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 7396; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 7397; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 7398; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 7399; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7400; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7401; GFX90A-NEXT: buffer_wbinvl1 7402; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7403; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7404; GFX90A-NEXT: v_mov_b32_e32 v3, v2 7405; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 7406; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 7407; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 7408; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7409; GFX90A-NEXT: s_setpc_b64 s[30:31] 7410; 7411; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 7412; GFX908: ; %bb.0: 7413; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7414; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 7415; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 7416; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 7417; GFX908-NEXT: flat_load_dword v3, v[0:1] 7418; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 7419; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7420; GFX908-NEXT: s_mov_b32 s4, 0xffff 
7421; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 7422; GFX908-NEXT: v_not_b32_e32 v5, v5 7423; GFX908-NEXT: s_mov_b64 s[4:5], 0 7424; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 7425; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start 7426; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 7427; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7428; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7429; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 7430; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 7431; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 7432; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 7433; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7434; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7435; GFX908-NEXT: buffer_wbinvl1 7436; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7437; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7438; GFX908-NEXT: v_mov_b32_e32 v3, v2 7439; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 7440; GFX908-NEXT: s_cbranch_execnz .LBB30_1 7441; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 7442; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7443; GFX908-NEXT: s_setpc_b64 s[30:31] 7444; 7445; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 7446; GFX8: ; %bb.0: 7447; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7448; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 7449; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7450; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 7451; GFX8-NEXT: flat_load_dword v3, v[0:1] 7452; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 7453; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7454; GFX8-NEXT: s_mov_b32 s4, 0xffff 7455; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 7456; GFX8-NEXT: v_not_b32_e32 v5, v5 7457; GFX8-NEXT: s_mov_b64 s[4:5], 0 7458; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 7459; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start 7460; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 7461; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7462; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7463; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 7464; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 
7465; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 7466; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 7467; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 7468; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7469; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7470; GFX8-NEXT: buffer_wbinvl1 7471; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7472; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7473; GFX8-NEXT: v_mov_b32_e32 v3, v2 7474; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 7475; GFX8-NEXT: s_cbranch_execnz .LBB30_1 7476; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 7477; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 7478; GFX8-NEXT: s_setpc_b64 s[30:31] 7479; 7480; GFX7-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 7481; GFX7: ; %bb.0: 7482; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7483; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 7484; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7485; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 7486; GFX7-NEXT: flat_load_dword v3, v[0:1] 7487; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 7488; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 7489; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7490; GFX7-NEXT: s_mov_b64 s[4:5], 0 7491; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 7492; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 7493; GFX7-NEXT: v_not_b32_e32 v6, v2 7494; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start 7495; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 7496; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7497; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7498; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 7499; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 7500; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 7501; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 7502; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 7503; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 7504; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7505; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7506; GFX7-NEXT: buffer_wbinvl1 7507; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7508; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7509; GFX7-NEXT: v_mov_b32_e32 v3, v2 7510; 
GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 7511; GFX7-NEXT: s_cbranch_execnz .LBB30_1 7512; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 7513; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 7514; GFX7-NEXT: s_setpc_b64 s[30:31] 7515 %gep = getelementptr half, ptr %ptr, i64 1023 7516 %unused = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 7517 ret void 7518} 7519 7520define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { 7521; GFX12-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 7522; GFX12: ; %bb.0: 7523; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7524; GFX12-NEXT: s_wait_expcnt 0x0 7525; GFX12-NEXT: s_wait_samplecnt 0x0 7526; GFX12-NEXT: s_wait_bvhcnt 0x0 7527; GFX12-NEXT: s_wait_kmcnt 0x0 7528; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 7529; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 7530; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 7531; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 7532; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 7533; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 7534; GFX12-NEXT: s_mov_b32 s0, 0 7535; GFX12-NEXT: flat_load_b32 v3, v[0:1] 7536; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7537; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 7538; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 7539; GFX12-NEXT: v_not_b32_e32 v5, v5 7540; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start 7541; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 7542; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7543; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7544; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7545; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 7546; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 7547; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7548; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 7549; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 
7550; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 7551; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 7552; GFX12-NEXT: s_wait_storecnt 0x0 7553; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 7554; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7555; GFX12-NEXT: global_inv scope:SCOPE_DEV 7556; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 7557; GFX12-NEXT: v_mov_b32_e32 v3, v2 7558; GFX12-NEXT: s_wait_alu 0xfffe 7559; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 7560; GFX12-NEXT: s_wait_alu 0xfffe 7561; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 7562; GFX12-NEXT: s_cbranch_execnz .LBB31_1 7563; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 7564; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 7565; GFX12-NEXT: s_wait_alu 0xfffe 7566; GFX12-NEXT: s_setpc_b64 s[30:31] 7567; 7568; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 7569; GFX940: ; %bb.0: 7570; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7571; GFX940-NEXT: s_movk_i32 s0, 0xf800 7572; GFX940-NEXT: s_mov_b32 s1, -1 7573; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 7574; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 7575; GFX940-NEXT: v_mov_b32_e32 v1, v5 7576; GFX940-NEXT: flat_load_dword v3, v[0:1] 7577; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 7578; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7579; GFX940-NEXT: s_mov_b32 s0, 0xffff 7580; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 7581; GFX940-NEXT: v_not_b32_e32 v5, v5 7582; GFX940-NEXT: s_mov_b64 s[0:1], 0 7583; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 7584; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start 7585; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 7586; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7587; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7588; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 7589; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 7590; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 7591; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 7592; GFX940-NEXT: buffer_wbl2 sc1 7593; GFX940-NEXT: flat_atomic_cmpswap 
v2, v[0:1], v[2:3] sc0 7594; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7595; GFX940-NEXT: buffer_inv sc1 7596; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7597; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 7598; GFX940-NEXT: v_mov_b32_e32 v3, v2 7599; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 7600; GFX940-NEXT: s_cbranch_execnz .LBB31_1 7601; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 7602; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 7603; GFX940-NEXT: s_setpc_b64 s[30:31] 7604; 7605; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 7606; GFX11: ; %bb.0: 7607; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7608; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 7609; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 7610; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 7611; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 7612; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 7613; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 7614; GFX11-NEXT: s_mov_b32 s0, 0 7615; GFX11-NEXT: flat_load_b32 v3, v[0:1] 7616; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7617; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 7618; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 7619; GFX11-NEXT: v_not_b32_e32 v5, v5 7620; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start 7621; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 7622; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7623; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7624; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7625; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 7626; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 7627; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7628; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 7629; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 7630; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 7631; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 7632; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 7633; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, 
v[0:1], v[2:3] glc 7634; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7635; GFX11-NEXT: buffer_gl1_inv 7636; GFX11-NEXT: buffer_gl0_inv 7637; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 7638; GFX11-NEXT: v_mov_b32_e32 v3, v2 7639; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 7640; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7641; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 7642; GFX11-NEXT: s_cbranch_execnz .LBB31_1 7643; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 7644; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 7645; GFX11-NEXT: s_setpc_b64 s[30:31] 7646; 7647; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 7648; GFX10: ; %bb.0: 7649; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7650; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 7651; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 7652; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 7653; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 7654; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 7655; GFX10-NEXT: s_mov_b32 s4, 0 7656; GFX10-NEXT: flat_load_dword v3, v[0:1] 7657; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7658; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 7659; GFX10-NEXT: v_not_b32_e32 v5, v5 7660; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start 7661; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 7662; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7663; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7664; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 7665; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 7666; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 7667; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 7668; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 7669; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7670; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7671; GFX10-NEXT: buffer_gl1_inv 7672; GFX10-NEXT: buffer_gl0_inv 7673; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 7674; GFX10-NEXT: v_mov_b32_e32 v3, v2 7675; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 7676; 
GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 7677; GFX10-NEXT: s_cbranch_execnz .LBB31_1 7678; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 7679; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 7680; GFX10-NEXT: s_setpc_b64 s[30:31] 7681; 7682; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 7683; GFX90A: ; %bb.0: 7684; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7685; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 7686; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 7687; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 7688; GFX90A-NEXT: flat_load_dword v3, v[0:1] 7689; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 7690; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7691; GFX90A-NEXT: s_mov_b32 s4, 0xffff 7692; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 7693; GFX90A-NEXT: v_not_b32_e32 v5, v5 7694; GFX90A-NEXT: s_mov_b64 s[4:5], 0 7695; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 7696; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start 7697; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 7698; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7699; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7700; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 7701; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 7702; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 7703; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 7704; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7705; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7706; GFX90A-NEXT: buffer_wbinvl1 7707; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7708; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7709; GFX90A-NEXT: v_mov_b32_e32 v3, v2 7710; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 7711; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 7712; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 7713; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7714; GFX90A-NEXT: s_setpc_b64 s[30:31] 7715; 7716; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 7717; GFX908: ; %bb.0: 7718; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
7719; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 7720; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 7721; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 7722; GFX908-NEXT: flat_load_dword v3, v[0:1] 7723; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 7724; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7725; GFX908-NEXT: s_mov_b32 s4, 0xffff 7726; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 7727; GFX908-NEXT: v_not_b32_e32 v5, v5 7728; GFX908-NEXT: s_mov_b64 s[4:5], 0 7729; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 7730; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start 7731; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 7732; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7733; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7734; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 7735; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 7736; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 7737; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 7738; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7739; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7740; GFX908-NEXT: buffer_wbinvl1 7741; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7742; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7743; GFX908-NEXT: v_mov_b32_e32 v3, v2 7744; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 7745; GFX908-NEXT: s_cbranch_execnz .LBB31_1 7746; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 7747; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7748; GFX908-NEXT: s_setpc_b64 s[30:31] 7749; 7750; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 7751; GFX8: ; %bb.0: 7752; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7753; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 7754; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 7755; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 7756; GFX8-NEXT: flat_load_dword v3, v[0:1] 7757; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 7758; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7759; GFX8-NEXT: s_mov_b32 s4, 0xffff 7760; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 7761; GFX8-NEXT: v_not_b32_e32 v5, v5 7762; 
GFX8-NEXT: s_mov_b64 s[4:5], 0 7763; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 7764; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start 7765; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 7766; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7767; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7768; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 7769; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 7770; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 7771; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 7772; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 7773; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7774; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7775; GFX8-NEXT: buffer_wbinvl1 7776; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7777; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7778; GFX8-NEXT: v_mov_b32_e32 v3, v2 7779; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 7780; GFX8-NEXT: s_cbranch_execnz .LBB31_1 7781; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 7782; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 7783; GFX8-NEXT: s_setpc_b64 s[30:31] 7784; 7785; GFX7-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: 7786; GFX7: ; %bb.0: 7787; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7788; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 7789; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 7790; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 7791; GFX7-NEXT: flat_load_dword v3, v[0:1] 7792; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 7793; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 7794; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 7795; GFX7-NEXT: s_mov_b64 s[4:5], 0 7796; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 7797; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 7798; GFX7-NEXT: v_not_b32_e32 v6, v2 7799; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start 7800; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 7801; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7802; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 7803; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 7804; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 7805; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 7806; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 7807; 
GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 7808; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 7809; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 7810; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7811; GFX7-NEXT: buffer_wbinvl1 7812; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 7813; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7814; GFX7-NEXT: v_mov_b32_e32 v3, v2 7815; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 7816; GFX7-NEXT: s_cbranch_execnz .LBB31_1 7817; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 7818; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 7819; GFX7-NEXT: s_setpc_b64 s[30:31] 7820 %gep = getelementptr half, ptr %ptr, i64 -1024 7821 %unused = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 7822 ret void 7823} 7824 7825define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { 7826; GFX12-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 7827; GFX12: ; %bb.0: 7828; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7829; GFX12-NEXT: s_wait_expcnt 0x0 7830; GFX12-NEXT: s_wait_samplecnt 0x0 7831; GFX12-NEXT: s_wait_bvhcnt 0x0 7832; GFX12-NEXT: s_wait_kmcnt 0x0 7833; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 7834; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 7835; GFX12-NEXT: s_mov_b32 s0, 0 7836; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start 7837; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 7838; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7839; GFX12-NEXT: v_mov_b32_e32 v4, v3 7840; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7841; GFX12-NEXT: v_max_num_f16_e32 v3, v4, v4 7842; GFX12-NEXT: v_min_num_f16_e32 v3, v3, v2 7843; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7844; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3 7845; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 7846; GFX12-NEXT: s_wait_storecnt 0x0 7847; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] 
offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 7848; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 7849; GFX12-NEXT: global_inv scope:SCOPE_DEV 7850; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 7851; GFX12-NEXT: s_wait_alu 0xfffe 7852; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 7853; GFX12-NEXT: s_wait_alu 0xfffe 7854; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 7855; GFX12-NEXT: s_cbranch_execnz .LBB32_1 7856; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 7857; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 7858; GFX12-NEXT: v_mov_b32_e32 v0, v3 7859; GFX12-NEXT: s_wait_alu 0xfffe 7860; GFX12-NEXT: s_setpc_b64 s[30:31] 7861; 7862; GFX940-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 7863; GFX940: ; %bb.0: 7864; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7865; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 7866; GFX940-NEXT: s_mov_b64 s[0:1], 0 7867; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 7868; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 7869; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start 7870; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 7871; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7872; GFX940-NEXT: v_mov_b32_e32 v5, v3 7873; GFX940-NEXT: v_max_f16_e32 v3, v5, v5 7874; GFX940-NEXT: v_min_f16_e32 v3, v3, v2 7875; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3 7876; GFX940-NEXT: buffer_wbl2 sc1 7877; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 7878; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7879; GFX940-NEXT: buffer_inv sc1 7880; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 7881; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 7882; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 7883; GFX940-NEXT: s_cbranch_execnz .LBB32_1 7884; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 7885; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 7886; GFX940-NEXT: v_mov_b32_e32 v0, v3 7887; GFX940-NEXT: s_setpc_b64 s[30:31] 7888; 7889; GFX11-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 7890; GFX11: ; %bb.0: 
7891; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7892; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 7893; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 7894; GFX11-NEXT: s_mov_b32 s0, 0 7895; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start 7896; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 7897; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7898; GFX11-NEXT: v_mov_b32_e32 v4, v3 7899; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7900; GFX11-NEXT: v_max_f16_e32 v3, v4, v4 7901; GFX11-NEXT: v_min_f16_e32 v3, v3, v2 7902; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 7903; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 7904; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 7905; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 7906; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc 7907; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7908; GFX11-NEXT: buffer_gl1_inv 7909; GFX11-NEXT: buffer_gl0_inv 7910; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 7911; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 7912; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 7913; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 7914; GFX11-NEXT: s_cbranch_execnz .LBB32_1 7915; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 7916; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 7917; GFX11-NEXT: v_mov_b32_e32 v0, v3 7918; GFX11-NEXT: s_setpc_b64 s[30:31] 7919; 7920; GFX10-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 7921; GFX10: ; %bb.0: 7922; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7923; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 7924; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 7925; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 7926; GFX10-NEXT: s_mov_b32 s4, 0 7927; GFX10-NEXT: flat_load_dword v0, v[3:4] 7928; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start 7929; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 7930; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7931; GFX10-NEXT: 
v_mov_b32_e32 v6, v0 7932; GFX10-NEXT: v_max_f16_e32 v0, v6, v6 7933; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 7934; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 7935; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0 7936; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 7937; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 7938; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7939; GFX10-NEXT: buffer_gl1_inv 7940; GFX10-NEXT: buffer_gl0_inv 7941; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 7942; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 7943; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 7944; GFX10-NEXT: s_cbranch_execnz .LBB32_1 7945; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 7946; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 7947; GFX10-NEXT: s_setpc_b64 s[30:31] 7948; 7949; GFX90A-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 7950; GFX90A: ; %bb.0: 7951; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7952; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 7953; GFX90A-NEXT: s_mov_b64 s[4:5], 0 7954; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 7955; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 7956; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start 7957; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 7958; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7959; GFX90A-NEXT: v_mov_b32_e32 v5, v3 7960; GFX90A-NEXT: v_max_f16_e32 v3, v5, v5 7961; GFX90A-NEXT: v_min_f16_e32 v3, v3, v2 7962; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3 7963; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc 7964; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7965; GFX90A-NEXT: buffer_wbinvl1 7966; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 7967; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7968; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 7969; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 7970; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 7971; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 7972; GFX90A-NEXT: v_mov_b32_e32 v0, v3 7973; GFX90A-NEXT: s_setpc_b64 s[30:31] 7974; 7975; GFX908-LABEL: 
flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 7976; GFX908: ; %bb.0: 7977; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 7978; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 7979; GFX908-NEXT: s_mov_b64 s[4:5], 0 7980; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 7981; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 7982; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start 7983; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 7984; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7985; GFX908-NEXT: v_mov_b32_e32 v4, v3 7986; GFX908-NEXT: v_max_f16_e32 v3, v4, v4 7987; GFX908-NEXT: v_min_f16_e32 v3, v3, v2 7988; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3 7989; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc 7990; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 7991; GFX908-NEXT: buffer_wbinvl1 7992; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 7993; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 7994; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 7995; GFX908-NEXT: s_cbranch_execnz .LBB32_1 7996; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 7997; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 7998; GFX908-NEXT: v_mov_b32_e32 v0, v3 7999; GFX908-NEXT: s_setpc_b64 s[30:31] 8000; 8001; GFX8-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 8002; GFX8: ; %bb.0: 8003; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8004; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 8005; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 8006; GFX8-NEXT: flat_load_dword v0, v[3:4] 8007; GFX8-NEXT: s_mov_b64 s[4:5], 0 8008; GFX8-NEXT: v_max_f16_e32 v1, v2, v2 8009; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start 8010; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 8011; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8012; GFX8-NEXT: v_mov_b32_e32 v6, v0 8013; GFX8-NEXT: v_max_f16_e32 v0, v6, v6 8014; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 8015; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 8016; GFX8-NEXT: v_or_b32_e32 v5, v2, v0 8017; GFX8-NEXT: 
flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 8018; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8019; GFX8-NEXT: buffer_wbinvl1 8020; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 8021; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8022; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 8023; GFX8-NEXT: s_cbranch_execnz .LBB32_1 8024; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 8025; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 8026; GFX8-NEXT: s_setpc_b64 s[30:31] 8027; 8028; GFX7-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 8029; GFX7: ; %bb.0: 8030; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8031; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 8032; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8033; GFX7-NEXT: flat_load_dword v3, v[0:1] 8034; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 8035; GFX7-NEXT: s_mov_b64 s[4:5], 0 8036; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 8037; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start 8038; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 8039; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8040; GFX7-NEXT: v_mov_b32_e32 v4, v3 8041; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 8042; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 8043; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 8044; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 8045; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 8046; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 8047; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8048; GFX7-NEXT: buffer_wbinvl1 8049; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 8050; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8051; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 8052; GFX7-NEXT: s_cbranch_execnz .LBB32_1 8053; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 8054; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 8055; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 8056; GFX7-NEXT: s_setpc_b64 s[30:31] 8057 %gep = getelementptr half, ptr %ptr, i64 1023 8058 %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 8059 ret half %result 8060} 8061 8062define void 
@flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { 8063; GFX12-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 8064; GFX12: ; %bb.0: 8065; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8066; GFX12-NEXT: s_wait_expcnt 0x0 8067; GFX12-NEXT: s_wait_samplecnt 0x0 8068; GFX12-NEXT: s_wait_bvhcnt 0x0 8069; GFX12-NEXT: s_wait_kmcnt 0x0 8070; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 8071; GFX12-NEXT: v_max_num_f16_e32 v4, v2, v2 8072; GFX12-NEXT: s_mov_b32 s0, 0 8073; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start 8074; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 8075; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8076; GFX12-NEXT: v_max_num_f16_e32 v2, v3, v3 8077; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8078; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v4 8079; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 8080; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 8081; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 8082; GFX12-NEXT: s_wait_storecnt 0x0 8083; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 8084; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8085; GFX12-NEXT: global_inv scope:SCOPE_DEV 8086; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 8087; GFX12-NEXT: v_mov_b32_e32 v3, v2 8088; GFX12-NEXT: s_wait_alu 0xfffe 8089; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 8090; GFX12-NEXT: s_wait_alu 0xfffe 8091; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 8092; GFX12-NEXT: s_cbranch_execnz .LBB33_1 8093; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 8094; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 8095; GFX12-NEXT: s_wait_alu 0xfffe 8096; GFX12-NEXT: s_setpc_b64 s[30:31] 8097; 8098; GFX940-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 8099; GFX940: ; %bb.0: 8100; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8101; GFX940-NEXT: flat_load_dword v3, v[0:1] 
offset:2046 8102; GFX940-NEXT: s_mov_b64 s[0:1], 0 8103; GFX940-NEXT: v_max_f16_e32 v4, v2, v2 8104; GFX940-NEXT: s_mov_b32 s2, 0xffff0000 8105; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start 8106; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 8107; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8108; GFX940-NEXT: v_max_f16_e32 v2, v3, v3 8109; GFX940-NEXT: v_min_f16_e32 v2, v2, v4 8110; GFX940-NEXT: v_and_or_b32 v2, v3, s2, v2 8111; GFX940-NEXT: buffer_wbl2 sc1 8112; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 8113; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8114; GFX940-NEXT: buffer_inv sc1 8115; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 8116; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 8117; GFX940-NEXT: v_mov_b32_e32 v3, v2 8118; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 8119; GFX940-NEXT: s_cbranch_execnz .LBB33_1 8120; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 8121; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 8122; GFX940-NEXT: s_setpc_b64 s[30:31] 8123; 8124; GFX11-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 8125; GFX11: ; %bb.0: 8126; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8127; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 8128; GFX11-NEXT: v_max_f16_e32 v4, v2, v2 8129; GFX11-NEXT: s_mov_b32 s0, 0 8130; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start 8131; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 8132; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8133; GFX11-NEXT: v_max_f16_e32 v2, v3, v3 8134; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8135; GFX11-NEXT: v_min_f16_e32 v2, v2, v4 8136; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 8137; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 8138; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 8139; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 8140; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc 8141; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8142; GFX11-NEXT: buffer_gl1_inv 8143; 
GFX11-NEXT: buffer_gl0_inv 8144; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 8145; GFX11-NEXT: v_mov_b32_e32 v3, v2 8146; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 8147; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 8148; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 8149; GFX11-NEXT: s_cbranch_execnz .LBB33_1 8150; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 8151; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 8152; GFX11-NEXT: s_setpc_b64 s[30:31] 8153; 8154; GFX10-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 8155; GFX10: ; %bb.0: 8156; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8157; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 8158; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 8159; GFX10-NEXT: v_max_f16_e32 v4, v2, v2 8160; GFX10-NEXT: s_mov_b32 s4, 0 8161; GFX10-NEXT: flat_load_dword v3, v[0:1] 8162; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start 8163; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 8164; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8165; GFX10-NEXT: v_max_f16_e32 v2, v3, v3 8166; GFX10-NEXT: v_min_f16_e32 v2, v2, v4 8167; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 8168; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 8169; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 8170; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8171; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8172; GFX10-NEXT: buffer_gl1_inv 8173; GFX10-NEXT: buffer_gl0_inv 8174; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 8175; GFX10-NEXT: v_mov_b32_e32 v3, v2 8176; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 8177; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 8178; GFX10-NEXT: s_cbranch_execnz .LBB33_1 8179; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 8180; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 8181; GFX10-NEXT: s_setpc_b64 s[30:31] 8182; 8183; GFX90A-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 8184; GFX90A: ; %bb.0: 8185; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8186; GFX90A-NEXT: 
flat_load_dword v3, v[0:1] offset:2046 8187; GFX90A-NEXT: s_mov_b64 s[4:5], 0 8188; GFX90A-NEXT: v_max_f16_e32 v4, v2, v2 8189; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000 8190; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start 8191; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 8192; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8193; GFX90A-NEXT: v_max_f16_e32 v2, v3, v3 8194; GFX90A-NEXT: v_min_f16_e32 v2, v2, v4 8195; GFX90A-NEXT: v_and_or_b32 v2, v3, s6, v2 8196; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc 8197; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8198; GFX90A-NEXT: buffer_wbinvl1 8199; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 8200; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8201; GFX90A-NEXT: v_mov_b32_e32 v3, v2 8202; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 8203; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 8204; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 8205; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 8206; GFX90A-NEXT: s_setpc_b64 s[30:31] 8207; 8208; GFX908-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 8209; GFX908: ; %bb.0: 8210; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8211; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 8212; GFX908-NEXT: s_mov_b64 s[4:5], 0 8213; GFX908-NEXT: v_max_f16_e32 v4, v2, v2 8214; GFX908-NEXT: s_mov_b32 s6, 0xffff0000 8215; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start 8216; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 8217; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8218; GFX908-NEXT: v_max_f16_e32 v2, v3, v3 8219; GFX908-NEXT: v_min_f16_e32 v2, v2, v4 8220; GFX908-NEXT: v_and_or_b32 v2, v3, s6, v2 8221; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc 8222; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8223; GFX908-NEXT: buffer_wbinvl1 8224; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 8225; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8226; GFX908-NEXT: v_mov_b32_e32 v3, v2 8227; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 8228; 
GFX908-NEXT: s_cbranch_execnz .LBB33_1 8229; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 8230; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 8231; GFX908-NEXT: s_setpc_b64 s[30:31] 8232; 8233; GFX8-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 8234; GFX8: ; %bb.0: 8235; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8236; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 8237; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8238; GFX8-NEXT: flat_load_dword v3, v[0:1] 8239; GFX8-NEXT: s_mov_b64 s[4:5], 0 8240; GFX8-NEXT: v_max_f16_e32 v4, v2, v2 8241; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start 8242; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 8243; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8244; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 8245; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 8246; GFX8-NEXT: v_min_f16_e32 v2, v2, v4 8247; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 8248; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8249; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8250; GFX8-NEXT: buffer_wbinvl1 8251; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 8252; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8253; GFX8-NEXT: v_mov_b32_e32 v3, v2 8254; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 8255; GFX8-NEXT: s_cbranch_execnz .LBB33_1 8256; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 8257; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 8258; GFX8-NEXT: s_setpc_b64 s[30:31] 8259; 8260; GFX7-LABEL: flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 8261; GFX7: ; %bb.0: 8262; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8263; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 8264; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8265; GFX7-NEXT: flat_load_dword v3, v[0:1] 8266; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 8267; GFX7-NEXT: s_mov_b64 s[4:5], 0 8268; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2 8269; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start 8270; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 8271; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
8272; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 8273; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 8274; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 8275; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 8276; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 8277; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8278; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8279; GFX7-NEXT: buffer_wbinvl1 8280; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 8281; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8282; GFX7-NEXT: v_mov_b32_e32 v3, v2 8283; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 8284; GFX7-NEXT: s_cbranch_execnz .LBB33_1 8285; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 8286; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 8287; GFX7-NEXT: s_setpc_b64 s[30:31] 8288 %gep = getelementptr half, ptr %ptr, i64 1023 8289 %unused = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 8290 ret void 8291} 8292 8293define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { 8294; GFX12-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 8295; GFX12: ; %bb.0: 8296; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8297; GFX12-NEXT: s_wait_expcnt 0x0 8298; GFX12-NEXT: s_wait_samplecnt 0x0 8299; GFX12-NEXT: s_wait_bvhcnt 0x0 8300; GFX12-NEXT: s_wait_kmcnt 0x0 8301; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 8302; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 8303; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 8304; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 8305; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 8306; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 8307; GFX12-NEXT: s_mov_b32 s0, 0 8308; GFX12-NEXT: flat_load_b32 v5, v[0:1] 8309; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8310; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 8311; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 8312; GFX12-NEXT: v_not_b32_e32 v4, v4 8313; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start 8314; GFX12-NEXT: ; 
=>This Inner Loop Header: Depth=1 8315; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8316; GFX12-NEXT: v_mov_b32_e32 v6, v5 8317; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8318; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 8319; GFX12-NEXT: v_max_num_f16_e32 v5, v5, v5 8320; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8321; GFX12-NEXT: v_min_num_f16_e32 v5, v5, v2 8322; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5 8323; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8324; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 8325; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 8326; GFX12-NEXT: global_wb scope:SCOPE_SYS 8327; GFX12-NEXT: s_wait_storecnt 0x0 8328; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS 8329; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8330; GFX12-NEXT: global_inv scope:SCOPE_SYS 8331; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 8332; GFX12-NEXT: s_wait_alu 0xfffe 8333; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 8334; GFX12-NEXT: s_wait_alu 0xfffe 8335; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 8336; GFX12-NEXT: s_cbranch_execnz .LBB34_1 8337; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 8338; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 8339; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8340; GFX12-NEXT: s_wait_alu 0xfffe 8341; GFX12-NEXT: s_setpc_b64 s[30:31] 8342; 8343; GFX940-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 8344; GFX940: ; %bb.0: 8345; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8346; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 8347; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 8348; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 8349; GFX940-NEXT: v_mov_b32_e32 v1, v5 8350; GFX940-NEXT: flat_load_dword v5, v[0:1] 8351; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 8352; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8353; GFX940-NEXT: s_mov_b32 s0, 0xffff 8354; GFX940-NEXT: v_lshlrev_b32_e64 v4, 
v3, s0 8355; GFX940-NEXT: v_not_b32_e32 v4, v4 8356; GFX940-NEXT: s_mov_b64 s[0:1], 0 8357; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 8358; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start 8359; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 8360; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8361; GFX940-NEXT: v_mov_b32_e32 v7, v5 8362; GFX940-NEXT: v_lshrrev_b32_e32 v5, v3, v7 8363; GFX940-NEXT: v_max_f16_e32 v5, v5, v5 8364; GFX940-NEXT: v_min_f16_e32 v5, v5, v2 8365; GFX940-NEXT: v_lshlrev_b32_e32 v5, v3, v5 8366; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 8367; GFX940-NEXT: buffer_wbl2 sc0 sc1 8368; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 8369; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8370; GFX940-NEXT: buffer_inv sc0 sc1 8371; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 8372; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 8373; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 8374; GFX940-NEXT: s_cbranch_execnz .LBB34_1 8375; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 8376; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 8377; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8378; GFX940-NEXT: s_setpc_b64 s[30:31] 8379; 8380; GFX11-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 8381; GFX11: ; %bb.0: 8382; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8383; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 8384; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 8385; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 8386; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 8387; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 8388; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 8389; GFX11-NEXT: s_mov_b32 s0, 0 8390; GFX11-NEXT: flat_load_b32 v5, v[0:1] 8391; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8392; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 8393; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 8394; GFX11-NEXT: v_not_b32_e32 v4, v4 8395; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start 8396; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 
8397; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8398; GFX11-NEXT: v_mov_b32_e32 v6, v5 8399; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8400; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 8401; GFX11-NEXT: v_max_f16_e32 v5, v5, v5 8402; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8403; GFX11-NEXT: v_min_f16_e32 v5, v5, v2 8404; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 8405; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8406; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 8407; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 8408; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 8409; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 8410; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8411; GFX11-NEXT: buffer_gl1_inv 8412; GFX11-NEXT: buffer_gl0_inv 8413; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 8414; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 8415; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 8416; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 8417; GFX11-NEXT: s_cbranch_execnz .LBB34_1 8418; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 8419; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 8420; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8421; GFX11-NEXT: s_setpc_b64 s[30:31] 8422; 8423; GFX10-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 8424; GFX10: ; %bb.0: 8425; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8426; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 8427; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 8428; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 8429; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 8430; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 8431; GFX10-NEXT: s_mov_b32 s4, 0 8432; GFX10-NEXT: flat_load_dword v5, v[0:1] 8433; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8434; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 8435; GFX10-NEXT: v_not_b32_e32 v4, v4 8436; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start 8437; GFX10-NEXT: ; =>This Inner Loop 
Header: Depth=1 8438; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8439; GFX10-NEXT: v_mov_b32_e32 v6, v5 8440; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6 8441; GFX10-NEXT: v_max_f16_e32 v5, v5, v5 8442; GFX10-NEXT: v_min_f16_e32 v5, v5, v2 8443; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 8444; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 8445; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 8446; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 8447; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8448; GFX10-NEXT: buffer_gl1_inv 8449; GFX10-NEXT: buffer_gl0_inv 8450; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 8451; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 8452; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 8453; GFX10-NEXT: s_cbranch_execnz .LBB34_1 8454; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 8455; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 8456; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8457; GFX10-NEXT: s_setpc_b64 s[30:31] 8458; 8459; GFX90A-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 8460; GFX90A: ; %bb.0: 8461; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8462; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 8463; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 8464; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 8465; GFX90A-NEXT: flat_load_dword v5, v[0:1] 8466; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 8467; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8468; GFX90A-NEXT: s_mov_b32 s4, 0xffff 8469; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 8470; GFX90A-NEXT: v_not_b32_e32 v4, v4 8471; GFX90A-NEXT: s_mov_b64 s[4:5], 0 8472; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 8473; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start 8474; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 8475; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8476; GFX90A-NEXT: v_mov_b32_e32 v7, v5 8477; GFX90A-NEXT: v_lshrrev_b32_e32 v5, v3, v7 8478; GFX90A-NEXT: v_max_f16_e32 v5, v5, v5 8479; GFX90A-NEXT: v_min_f16_e32 v5, v5, v2 
8480; GFX90A-NEXT: v_lshlrev_b32_e32 v5, v3, v5 8481; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 8482; GFX90A-NEXT: buffer_wbl2 8483; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc 8484; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8485; GFX90A-NEXT: buffer_invl2 8486; GFX90A-NEXT: buffer_wbinvl1 8487; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 8488; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8489; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 8490; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 8491; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 8492; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 8493; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8494; GFX90A-NEXT: s_setpc_b64 s[30:31] 8495; 8496; GFX908-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 8497; GFX908: ; %bb.0: 8498; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8499; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 8500; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 8501; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 8502; GFX908-NEXT: flat_load_dword v5, v[0:1] 8503; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 8504; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8505; GFX908-NEXT: s_mov_b32 s4, 0xffff 8506; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 8507; GFX908-NEXT: v_not_b32_e32 v4, v4 8508; GFX908-NEXT: s_mov_b64 s[4:5], 0 8509; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 8510; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start 8511; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 8512; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8513; GFX908-NEXT: v_mov_b32_e32 v6, v5 8514; GFX908-NEXT: v_lshrrev_b32_e32 v5, v3, v6 8515; GFX908-NEXT: v_max_f16_e32 v5, v5, v5 8516; GFX908-NEXT: v_min_f16_e32 v5, v5, v2 8517; GFX908-NEXT: v_lshlrev_b32_e32 v5, v3, v5 8518; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 8519; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 8520; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8521; GFX908-NEXT: buffer_wbinvl1 8522; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 8523; GFX908-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] 8524; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 8525; GFX908-NEXT: s_cbranch_execnz .LBB34_1 8526; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 8527; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 8528; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8529; GFX908-NEXT: s_setpc_b64 s[30:31] 8530; 8531; GFX8-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 8532; GFX8: ; %bb.0: 8533; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8534; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 8535; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8536; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 8537; GFX8-NEXT: flat_load_dword v5, v[0:1] 8538; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 8539; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8540; GFX8-NEXT: s_mov_b32 s4, 0xffff 8541; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 8542; GFX8-NEXT: v_not_b32_e32 v4, v4 8543; GFX8-NEXT: s_mov_b64 s[4:5], 0 8544; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 8545; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start 8546; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 8547; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8548; GFX8-NEXT: v_mov_b32_e32 v6, v5 8549; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6 8550; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 8551; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 8552; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 8553; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5 8554; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 8555; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 8556; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8557; GFX8-NEXT: buffer_wbinvl1 8558; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 8559; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8560; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 8561; GFX8-NEXT: s_cbranch_execnz .LBB34_1 8562; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 8563; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 8564; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8565; GFX8-NEXT: s_setpc_b64 s[30:31] 8566; 8567; GFX7-LABEL: flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 
8568; GFX7: ; %bb.0: 8569; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8570; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 8571; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8572; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 8573; GFX7-NEXT: flat_load_dword v5, v[0:1] 8574; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2 8575; GFX7-NEXT: v_and_b32_e32 v2, 3, v3 8576; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 8577; GFX7-NEXT: s_mov_b64 s[4:5], 0 8578; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 8579; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2 8580; GFX7-NEXT: v_not_b32_e32 v4, v4 8581; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start 8582; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 8583; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8584; GFX7-NEXT: v_mov_b32_e32 v6, v5 8585; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6 8586; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 8587; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 8588; GFX7-NEXT: v_min_f32_e32 v5, v5, v3 8589; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 8590; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5 8591; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 8592; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 8593; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8594; GFX7-NEXT: buffer_wbinvl1 8595; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 8596; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8597; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 8598; GFX7-NEXT: s_cbranch_execnz .LBB34_1 8599; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 8600; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 8601; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 8602; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 8603; GFX7-NEXT: s_setpc_b64 s[30:31] 8604 %gep = getelementptr half, ptr %ptr, i64 1023 8605 %result = atomicrmw fmin ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 8606 ret half %result 8607} 8608 8609define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { 8610; GFX12-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 8611; GFX12: ; 
%bb.0: 8612; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8613; GFX12-NEXT: s_wait_expcnt 0x0 8614; GFX12-NEXT: s_wait_samplecnt 0x0 8615; GFX12-NEXT: s_wait_bvhcnt 0x0 8616; GFX12-NEXT: s_wait_kmcnt 0x0 8617; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 8618; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 8619; GFX12-NEXT: v_max_num_f16_e32 v6, v2, v2 8620; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 8621; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 8622; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 8623; GFX12-NEXT: s_mov_b32 s0, 0 8624; GFX12-NEXT: flat_load_b32 v3, v[0:1] 8625; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 8626; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 8627; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 8628; GFX12-NEXT: v_not_b32_e32 v5, v5 8629; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start 8630; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 8631; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8632; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 8633; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8634; GFX12-NEXT: v_max_num_f16_e32 v2, v2, v2 8635; GFX12-NEXT: v_min_num_f16_e32 v2, v2, v6 8636; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8637; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2 8638; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 8639; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 8640; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 8641; GFX12-NEXT: global_wb scope:SCOPE_SYS 8642; GFX12-NEXT: s_wait_storecnt 0x0 8643; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS 8644; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8645; GFX12-NEXT: global_inv scope:SCOPE_SYS 8646; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 8647; GFX12-NEXT: v_mov_b32_e32 v3, v2 8648; GFX12-NEXT: s_wait_alu 0xfffe 8649; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 8650; GFX12-NEXT: s_wait_alu 0xfffe 8651; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 8652; GFX12-NEXT: 
s_cbranch_execnz .LBB35_1 8653; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 8654; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 8655; GFX12-NEXT: s_wait_alu 0xfffe 8656; GFX12-NEXT: s_setpc_b64 s[30:31] 8657; 8658; GFX940-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 8659; GFX940: ; %bb.0: 8660; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8661; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 8662; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 8663; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 8664; GFX940-NEXT: v_mov_b32_e32 v1, v5 8665; GFX940-NEXT: flat_load_dword v3, v[0:1] 8666; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 8667; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 8668; GFX940-NEXT: s_mov_b32 s0, 0xffff 8669; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 8670; GFX940-NEXT: v_not_b32_e32 v5, v5 8671; GFX940-NEXT: s_mov_b64 s[0:1], 0 8672; GFX940-NEXT: v_max_f16_e32 v6, v2, v2 8673; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start 8674; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 8675; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8676; GFX940-NEXT: v_lshrrev_b32_e32 v2, v4, v3 8677; GFX940-NEXT: v_max_f16_e32 v2, v2, v2 8678; GFX940-NEXT: v_min_f16_e32 v2, v2, v6 8679; GFX940-NEXT: v_lshlrev_b32_e32 v2, v4, v2 8680; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 8681; GFX940-NEXT: buffer_wbl2 sc0 sc1 8682; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 8683; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8684; GFX940-NEXT: buffer_inv sc0 sc1 8685; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 8686; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 8687; GFX940-NEXT: v_mov_b32_e32 v3, v2 8688; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 8689; GFX940-NEXT: s_cbranch_execnz .LBB35_1 8690; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 8691; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 8692; GFX940-NEXT: s_setpc_b64 s[30:31] 8693; 8694; GFX11-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 8695; GFX11: ; %bb.0: 8696; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8697; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 8698; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 8699; GFX11-NEXT: v_max_f16_e32 v6, v2, v2 8700; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 8701; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 8702; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 8703; GFX11-NEXT: s_mov_b32 s0, 0 8704; GFX11-NEXT: flat_load_b32 v3, v[0:1] 8705; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 8706; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 8707; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 8708; GFX11-NEXT: v_not_b32_e32 v5, v5 8709; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start 8710; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 8711; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8712; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 8713; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8714; GFX11-NEXT: v_max_f16_e32 v2, v2, v2 8715; GFX11-NEXT: v_min_f16_e32 v2, v2, v6 8716; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8717; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 8718; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 8719; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 8720; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 8721; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 8722; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 8723; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8724; GFX11-NEXT: buffer_gl1_inv 8725; GFX11-NEXT: buffer_gl0_inv 8726; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 8727; GFX11-NEXT: v_mov_b32_e32 v3, v2 8728; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 8729; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 8730; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 8731; GFX11-NEXT: s_cbranch_execnz .LBB35_1 8732; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 8733; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 8734; GFX11-NEXT: s_setpc_b64 s[30:31] 8735; 8736; GFX10-LABEL: 
flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 8737; GFX10: ; %bb.0: 8738; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8739; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 8740; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 8741; GFX10-NEXT: v_max_f16_e32 v6, v2, v2 8742; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 8743; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 8744; GFX10-NEXT: s_mov_b32 s4, 0 8745; GFX10-NEXT: flat_load_dword v3, v[0:1] 8746; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 8747; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 8748; GFX10-NEXT: v_not_b32_e32 v5, v5 8749; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start 8750; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 8751; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8752; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v3 8753; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 8754; GFX10-NEXT: v_min_f16_e32 v2, v2, v6 8755; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 8756; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 8757; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 8758; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8759; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8760; GFX10-NEXT: buffer_gl1_inv 8761; GFX10-NEXT: buffer_gl0_inv 8762; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 8763; GFX10-NEXT: v_mov_b32_e32 v3, v2 8764; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 8765; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 8766; GFX10-NEXT: s_cbranch_execnz .LBB35_1 8767; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 8768; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 8769; GFX10-NEXT: s_setpc_b64 s[30:31] 8770; 8771; GFX90A-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 8772; GFX90A: ; %bb.0: 8773; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8774; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 8775; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 8776; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 8777; 
GFX90A-NEXT: flat_load_dword v3, v[0:1] 8778; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 8779; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 8780; GFX90A-NEXT: s_mov_b32 s4, 0xffff 8781; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 8782; GFX90A-NEXT: v_not_b32_e32 v5, v5 8783; GFX90A-NEXT: s_mov_b64 s[4:5], 0 8784; GFX90A-NEXT: v_max_f16_e32 v6, v2, v2 8785; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start 8786; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 8787; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8788; GFX90A-NEXT: v_lshrrev_b32_e32 v2, v4, v3 8789; GFX90A-NEXT: v_max_f16_e32 v2, v2, v2 8790; GFX90A-NEXT: v_min_f16_e32 v2, v2, v6 8791; GFX90A-NEXT: v_lshlrev_b32_e32 v2, v4, v2 8792; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 8793; GFX90A-NEXT: buffer_wbl2 8794; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8795; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8796; GFX90A-NEXT: buffer_invl2 8797; GFX90A-NEXT: buffer_wbinvl1 8798; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 8799; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8800; GFX90A-NEXT: v_mov_b32_e32 v3, v2 8801; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 8802; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 8803; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 8804; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 8805; GFX90A-NEXT: s_setpc_b64 s[30:31] 8806; 8807; GFX908-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 8808; GFX908: ; %bb.0: 8809; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8810; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 8811; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 8812; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 8813; GFX908-NEXT: flat_load_dword v3, v[0:1] 8814; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 8815; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 8816; GFX908-NEXT: s_mov_b32 s4, 0xffff 8817; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 8818; GFX908-NEXT: v_not_b32_e32 v5, v5 8819; GFX908-NEXT: s_mov_b64 s[4:5], 0 8820; GFX908-NEXT: v_max_f16_e32 v6, v2, v2 8821; 
GFX908-NEXT: .LBB35_1: ; %atomicrmw.start 8822; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 8823; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8824; GFX908-NEXT: v_lshrrev_b32_e32 v2, v4, v3 8825; GFX908-NEXT: v_max_f16_e32 v2, v2, v2 8826; GFX908-NEXT: v_min_f16_e32 v2, v2, v6 8827; GFX908-NEXT: v_lshlrev_b32_e32 v2, v4, v2 8828; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 8829; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8830; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8831; GFX908-NEXT: buffer_wbinvl1 8832; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 8833; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8834; GFX908-NEXT: v_mov_b32_e32 v3, v2 8835; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 8836; GFX908-NEXT: s_cbranch_execnz .LBB35_1 8837; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 8838; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 8839; GFX908-NEXT: s_setpc_b64 s[30:31] 8840; 8841; GFX8-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 8842; GFX8: ; %bb.0: 8843; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8844; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 8845; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8846; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 8847; GFX8-NEXT: flat_load_dword v3, v[0:1] 8848; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 8849; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 8850; GFX8-NEXT: s_mov_b32 s4, 0xffff 8851; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 8852; GFX8-NEXT: v_not_b32_e32 v5, v5 8853; GFX8-NEXT: s_mov_b64 s[4:5], 0 8854; GFX8-NEXT: v_max_f16_e32 v6, v2, v2 8855; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start 8856; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 8857; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8858; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v3 8859; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 8860; GFX8-NEXT: v_min_f16_e32 v2, v2, v6 8861; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 8862; GFX8-NEXT: v_lshlrev_b32_e32 v2, v4, v2 8863; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 8864; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], 
v[2:3] glc 8865; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8866; GFX8-NEXT: buffer_wbinvl1 8867; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 8868; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8869; GFX8-NEXT: v_mov_b32_e32 v3, v2 8870; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 8871; GFX8-NEXT: s_cbranch_execnz .LBB35_1 8872; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 8873; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 8874; GFX8-NEXT: s_setpc_b64 s[30:31] 8875; 8876; GFX7-LABEL: flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: 8877; GFX7: ; %bb.0: 8878; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8879; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 8880; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8881; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 8882; GFX7-NEXT: flat_load_dword v3, v[0:1] 8883; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 8884; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 8885; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 8886; GFX7-NEXT: s_mov_b64 s[4:5], 0 8887; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2 8888; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4 8889; GFX7-NEXT: v_not_b32_e32 v6, v2 8890; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start 8891; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 8892; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8893; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 8894; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 8895; GFX7-NEXT: v_and_b32_e32 v7, v3, v6 8896; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 8897; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 8898; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 8899; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 8900; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 8901; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8902; GFX7-NEXT: buffer_wbinvl1 8903; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 8904; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 8905; GFX7-NEXT: v_mov_b32_e32 v3, v2 8906; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 8907; GFX7-NEXT: s_cbranch_execnz .LBB35_1 8908; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 8909; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 
8910; GFX7-NEXT: s_setpc_b64 s[30:31] 8911 %gep = getelementptr half, ptr %ptr, i64 1023 8912 %unused = atomicrmw fmin ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 8913 ret void 8914} 8915 8916; -------------------------------------------------------------------- 8917; bfloat 8918; -------------------------------------------------------------------- 8919 ; Value-returning bfloat `atomicrmw fmin` through a flat pointer with no offset, at syncscope("agent") with seq_cst ordering and !amdgpu.no.fine.grained.memory. Every prefix checked below expects a flat_atomic_cmpswap retry loop (%atomicrmw.start) on the containing dword: the address is masked with -4 and the 16-bit lane is shifted/masked into place. These CHECK lines are autogenerated (see the NOTE at the top of the file); regenerate them with utils/update_llc_test_checks.py rather than editing by hand. 8920define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 8921; GFX12-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: 8922; GFX12: ; %bb.0: 8923; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8924; GFX12-NEXT: s_wait_expcnt 0x0 8925; GFX12-NEXT: s_wait_samplecnt 0x0 8926; GFX12-NEXT: s_wait_bvhcnt 0x0 8927; GFX12-NEXT: s_wait_kmcnt 0x0 8928; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 8929; GFX12-NEXT: s_mov_b32 s0, 0 8930; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 8931; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 8932; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 8933; GFX12-NEXT: flat_load_b32 v5, v[0:1] 8934; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8935; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 8936; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 8937; GFX12-NEXT: v_not_b32_e32 v4, v4 8938; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start 8939; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 8940; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8941; GFX12-NEXT: v_mov_b32_e32 v6, v5 8942; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8943; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 8944; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 8945; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8946; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 8947; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 8948; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 8949; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 8950; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) 
| instid1(VALU_DEP_1) 8951; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 8952; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 8953; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 8954; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 8955; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 8956; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 8957; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 8958; GFX12-NEXT: s_wait_storecnt 0x0 8959; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 8960; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 8961; GFX12-NEXT: global_inv scope:SCOPE_DEV 8962; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 8963; GFX12-NEXT: s_wait_alu 0xfffe 8964; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 8965; GFX12-NEXT: s_wait_alu 0xfffe 8966; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 8967; GFX12-NEXT: s_cbranch_execnz .LBB36_1 8968; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 8969; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 8970; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 8971; GFX12-NEXT: s_wait_alu 0xfffe 8972; GFX12-NEXT: s_setpc_b64 s[30:31] 8973; 8974; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: 8975; GFX940: ; %bb.0: 8976; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 8977; GFX940-NEXT: v_mov_b32_e32 v3, v0 8978; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 8979; GFX940-NEXT: flat_load_dword v5, v[0:1] 8980; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 8981; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 8982; GFX940-NEXT: s_mov_b32 s0, 0xffff 8983; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 8984; GFX940-NEXT: v_not_b32_e32 v4, v4 8985; GFX940-NEXT: s_mov_b64 s[0:1], 0 8986; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 8987; GFX940-NEXT: s_movk_i32 s2, 0x7fff 8988; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start 8989; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 8990; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 8991; GFX940-NEXT: v_mov_b32_e32 v7, v5 8992; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 8993; GFX940-NEXT: s_nop 0 8994; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 8995; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 8996; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 8997; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 8998; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 8999; GFX940-NEXT: s_nop 1 9000; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 9001; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9002; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 9003; GFX940-NEXT: buffer_wbl2 sc1 9004; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 9005; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9006; GFX940-NEXT: buffer_inv sc1 9007; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 9008; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 9009; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 9010; GFX940-NEXT: s_cbranch_execnz .LBB36_1 9011; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 9012; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 9013; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9014; GFX940-NEXT: s_setpc_b64 s[30:31] 9015; 9016; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: 9017; GFX11: ; %bb.0: 9018; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9019; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 9020; GFX11-NEXT: s_mov_b32 s0, 0 9021; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 9022; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 9023; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 9024; GFX11-NEXT: flat_load_b32 v5, v[0:1] 9025; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9026; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 9027; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9028; GFX11-NEXT: v_not_b32_e32 v4, v4 9029; GFX11-NEXT: .p2align 6 9030; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start 9031; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 9032; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9033; GFX11-NEXT: 
v_mov_b32_e32 v6, v5 9034; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9035; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 9036; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 9037; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9038; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 9039; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 9040; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 9041; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 9042; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 9043; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 9044; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 9045; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9046; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 9047; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 9048; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9049; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 9050; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 9051; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 9052; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9053; GFX11-NEXT: buffer_gl1_inv 9054; GFX11-NEXT: buffer_gl0_inv 9055; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 9056; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 9057; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 9058; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9059; GFX11-NEXT: s_cbranch_execnz .LBB36_1 9060; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 9061; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 9062; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9063; GFX11-NEXT: s_setpc_b64 s[30:31] 9064; 9065; GFX10-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: 9066; GFX10: ; %bb.0: 9067; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9068; GFX10-NEXT: v_mov_b32_e32 v3, v0 9069; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9070; GFX10-NEXT: s_mov_b32 s4, 0 9071; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 9072; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 9073; GFX10-NEXT: flat_load_dword 
v5, v[0:1] 9074; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9075; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 9076; GFX10-NEXT: v_not_b32_e32 v4, v4 9077; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start 9078; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 9079; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9080; GFX10-NEXT: v_mov_b32_e32 v6, v5 9081; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9082; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 9083; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 9084; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 9085; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 9086; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 9087; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 9088; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9089; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 9090; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 9091; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9092; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9093; GFX10-NEXT: buffer_gl1_inv 9094; GFX10-NEXT: buffer_gl0_inv 9095; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 9096; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 9097; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 9098; GFX10-NEXT: s_cbranch_execnz .LBB36_1 9099; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 9100; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 9101; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9102; GFX10-NEXT: s_setpc_b64 s[30:31] 9103; 9104; GFX90A-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: 9105; GFX90A: ; %bb.0: 9106; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9107; GFX90A-NEXT: v_mov_b32_e32 v3, v0 9108; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 9109; GFX90A-NEXT: flat_load_dword v5, v[0:1] 9110; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 9111; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9112; GFX90A-NEXT: s_mov_b32 s4, 0xffff 9113; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9114; GFX90A-NEXT: v_not_b32_e32 v4, v4 
9115; GFX90A-NEXT: s_mov_b64 s[4:5], 0 9116; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9117; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 9118; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start 9119; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 9120; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9121; GFX90A-NEXT: v_mov_b32_e32 v7, v5 9122; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9123; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 9124; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 9125; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 9126; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 9127; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9128; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 9129; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9130; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 9131; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc 9132; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9133; GFX90A-NEXT: buffer_wbinvl1 9134; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 9135; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9136; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 9137; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 9138; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 9139; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 9140; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9141; GFX90A-NEXT: s_setpc_b64 s[30:31] 9142; 9143; GFX908-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: 9144; GFX908: ; %bb.0: 9145; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9146; GFX908-NEXT: v_mov_b32_e32 v3, v0 9147; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 9148; GFX908-NEXT: flat_load_dword v5, v[0:1] 9149; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 9150; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9151; GFX908-NEXT: s_mov_b32 s4, 0xffff 9152; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9153; GFX908-NEXT: v_not_b32_e32 v4, v4 9154; GFX908-NEXT: s_mov_b64 s[4:5], 0 9155; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, 
v2 9156; GFX908-NEXT: s_movk_i32 s6, 0x7fff 9157; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start 9158; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 9159; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9160; GFX908-NEXT: v_mov_b32_e32 v6, v5 9161; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9162; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 9163; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 9164; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 9165; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 9166; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9167; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc 9168; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9169; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 9170; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9171; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9172; GFX908-NEXT: buffer_wbinvl1 9173; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 9174; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9175; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 9176; GFX908-NEXT: s_cbranch_execnz .LBB36_1 9177; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 9178; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 9179; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9180; GFX908-NEXT: s_setpc_b64 s[30:31] 9181; 9182; GFX8-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: 9183; GFX8: ; %bb.0: 9184; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9185; GFX8-NEXT: v_mov_b32_e32 v3, v0 9186; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 9187; GFX8-NEXT: flat_load_dword v5, v[0:1] 9188; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 9189; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9190; GFX8-NEXT: s_mov_b32 s4, 0xffff 9191; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9192; GFX8-NEXT: v_not_b32_e32 v4, v4 9193; GFX8-NEXT: s_mov_b64 s[4:5], 0 9194; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9195; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start 9196; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 9197; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9198; GFX8-NEXT: v_mov_b32_e32 v6, v5 9199; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9200; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 9201; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 9202; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 9203; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 9204; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 9205; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9206; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 9207; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 9208; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9209; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 9210; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9211; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9212; GFX8-NEXT: buffer_wbinvl1 9213; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 9214; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9215; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 9216; GFX8-NEXT: s_cbranch_execnz .LBB36_1 9217; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 9218; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 9219; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9220; GFX8-NEXT: s_setpc_b64 s[30:31] 9221; 9222; GFX7-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: 9223; GFX7: ; %bb.0: 9224; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9225; GFX7-NEXT: v_mov_b32_e32 v3, v0 9226; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 9227; GFX7-NEXT: flat_load_dword v5, v[0:1] 9228; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 9229; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9230; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 9231; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 9232; GFX7-NEXT: v_not_b32_e32 v4, v4 9233; GFX7-NEXT: s_mov_b64 s[4:5], 0 9234; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9235; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start 9236; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 9237; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9238; GFX7-NEXT: v_mov_b32_e32 v6, v5 9239; GFX7-NEXT: 
v_lshrrev_b32_e32 v5, v3, v6 9240; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 9241; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 9242; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 9243; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 9244; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 9245; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 9246; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 9247; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9248; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9249; GFX7-NEXT: buffer_wbinvl1 9250; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 9251; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9252; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 9253; GFX7-NEXT: s_cbranch_execnz .LBB36_1 9254; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 9255; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 9256; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9257; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 9258; GFX7-NEXT: s_setpc_b64 s[30:31] 9259 %result = atomicrmw fmin ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 9260 ret bfloat %result 9261} 9262 9263define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 9264; GFX12-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 9265; GFX12: ; %bb.0: 9266; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9267; GFX12-NEXT: s_wait_expcnt 0x0 9268; GFX12-NEXT: s_wait_samplecnt 0x0 9269; GFX12-NEXT: s_wait_bvhcnt 0x0 9270; GFX12-NEXT: s_wait_kmcnt 0x0 9271; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 9272; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 9273; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9274; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 9275; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 9276; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 9277; GFX12-NEXT: s_mov_b32 s0, 0 9278; GFX12-NEXT: flat_load_b32 v5, v[0:1] 9279; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9280; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 9281; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_1) 9282; GFX12-NEXT: v_not_b32_e32 v4, v4 9283; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start 9284; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 9285; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9286; GFX12-NEXT: v_mov_b32_e32 v6, v5 9287; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9288; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 9289; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 9290; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9291; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 9292; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 9293; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 9294; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 9295; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 9296; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 9297; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 9298; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9299; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 9300; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 9301; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 9302; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 9303; GFX12-NEXT: s_wait_storecnt 0x0 9304; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 9305; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9306; GFX12-NEXT: global_inv scope:SCOPE_DEV 9307; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 9308; GFX12-NEXT: s_wait_alu 0xfffe 9309; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 9310; GFX12-NEXT: s_wait_alu 0xfffe 9311; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9312; GFX12-NEXT: s_cbranch_execnz .LBB37_1 9313; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 9314; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 9315; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9316; GFX12-NEXT: s_wait_alu 0xfffe 9317; GFX12-NEXT: s_setpc_b64 s[30:31] 9318; 9319; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 9320; 
GFX940: ; %bb.0: 9321; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9322; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 9323; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 9324; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 9325; GFX940-NEXT: v_mov_b32_e32 v1, v5 9326; GFX940-NEXT: flat_load_dword v5, v[0:1] 9327; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 9328; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9329; GFX940-NEXT: s_mov_b32 s0, 0xffff 9330; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 9331; GFX940-NEXT: v_not_b32_e32 v4, v4 9332; GFX940-NEXT: s_mov_b64 s[0:1], 0 9333; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9334; GFX940-NEXT: s_movk_i32 s2, 0x7fff 9335; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start 9336; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 9337; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9338; GFX940-NEXT: v_mov_b32_e32 v7, v5 9339; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9340; GFX940-NEXT: s_nop 0 9341; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 9342; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 9343; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 9344; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 9345; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9346; GFX940-NEXT: s_nop 1 9347; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 9348; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9349; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 9350; GFX940-NEXT: buffer_wbl2 sc1 9351; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 9352; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9353; GFX940-NEXT: buffer_inv sc1 9354; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 9355; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 9356; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 9357; GFX940-NEXT: s_cbranch_execnz .LBB37_1 9358; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 9359; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 9360; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9361; GFX940-NEXT: 
s_setpc_b64 s[30:31] 9362; 9363; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 9364; GFX11: ; %bb.0: 9365; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9366; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 9367; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 9368; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9369; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 9370; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 9371; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 9372; GFX11-NEXT: s_mov_b32 s0, 0 9373; GFX11-NEXT: flat_load_b32 v5, v[0:1] 9374; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9375; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 9376; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9377; GFX11-NEXT: v_not_b32_e32 v4, v4 9378; GFX11-NEXT: .p2align 6 9379; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start 9380; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 9381; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9382; GFX11-NEXT: v_mov_b32_e32 v6, v5 9383; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9384; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 9385; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 9386; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9387; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 9388; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 9389; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 9390; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 9391; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 9392; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 9393; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 9394; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9395; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 9396; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 9397; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9398; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 9399; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 9400; 
GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 9401; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9402; GFX11-NEXT: buffer_gl1_inv 9403; GFX11-NEXT: buffer_gl0_inv 9404; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 9405; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 9406; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 9407; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9408; GFX11-NEXT: s_cbranch_execnz .LBB37_1 9409; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 9410; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 9411; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9412; GFX11-NEXT: s_setpc_b64 s[30:31] 9413; 9414; GFX10-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 9415; GFX10: ; %bb.0: 9416; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9417; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 9418; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 9419; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9420; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 9421; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 9422; GFX10-NEXT: s_mov_b32 s4, 0 9423; GFX10-NEXT: flat_load_dword v5, v[0:1] 9424; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9425; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 9426; GFX10-NEXT: v_not_b32_e32 v4, v4 9427; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start 9428; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 9429; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9430; GFX10-NEXT: v_mov_b32_e32 v6, v5 9431; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9432; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 9433; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 9434; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 9435; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 9436; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 9437; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 9438; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9439; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 9440; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 9441; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9442; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9443; GFX10-NEXT: buffer_gl1_inv 9444; GFX10-NEXT: buffer_gl0_inv 9445; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 9446; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 9447; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 9448; GFX10-NEXT: s_cbranch_execnz .LBB37_1 9449; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 9450; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 9451; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9452; GFX10-NEXT: s_setpc_b64 s[30:31] 9453; 9454; GFX90A-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 9455; GFX90A: ; %bb.0: 9456; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9457; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 9458; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 9459; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 9460; GFX90A-NEXT: flat_load_dword v5, v[0:1] 9461; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 9462; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9463; GFX90A-NEXT: s_mov_b32 s4, 0xffff 9464; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9465; GFX90A-NEXT: v_not_b32_e32 v4, v4 9466; GFX90A-NEXT: s_mov_b64 s[4:5], 0 9467; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9468; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 9469; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start 9470; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 9471; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9472; GFX90A-NEXT: v_mov_b32_e32 v7, v5 9473; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9474; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 9475; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 9476; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 9477; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 9478; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9479; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 9480; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 9481; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 9482; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc 9483; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9484; GFX90A-NEXT: buffer_wbinvl1 9485; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 9486; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9487; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 9488; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 9489; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 9490; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 9491; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9492; GFX90A-NEXT: s_setpc_b64 s[30:31] 9493; 9494; GFX908-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 9495; GFX908: ; %bb.0: 9496; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9497; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 9498; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 9499; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 9500; GFX908-NEXT: flat_load_dword v5, v[0:1] 9501; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 9502; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9503; GFX908-NEXT: s_mov_b32 s4, 0xffff 9504; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9505; GFX908-NEXT: v_not_b32_e32 v4, v4 9506; GFX908-NEXT: s_mov_b64 s[4:5], 0 9507; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9508; GFX908-NEXT: s_movk_i32 s6, 0x7fff 9509; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start 9510; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 9511; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9512; GFX908-NEXT: v_mov_b32_e32 v6, v5 9513; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9514; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 9515; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 9516; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 9517; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 9518; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9519; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc 9520; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9521; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 9522; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9523; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9524; GFX908-NEXT: buffer_wbinvl1 9525; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 9526; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9527; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 9528; GFX908-NEXT: s_cbranch_execnz .LBB37_1 9529; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 9530; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 9531; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9532; GFX908-NEXT: s_setpc_b64 s[30:31] 9533; 9534; GFX8-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 9535; GFX8: ; %bb.0: 9536; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9537; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 9538; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9539; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 9540; GFX8-NEXT: flat_load_dword v5, v[0:1] 9541; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 9542; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9543; GFX8-NEXT: s_mov_b32 s4, 0xffff 9544; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9545; GFX8-NEXT: v_not_b32_e32 v4, v4 9546; GFX8-NEXT: s_mov_b64 s[4:5], 0 9547; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9548; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start 9549; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 9550; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9551; GFX8-NEXT: v_mov_b32_e32 v6, v5 9552; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9553; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 9554; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 9555; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 9556; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 9557; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 9558; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9559; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 9560; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 9561; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9562; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 9563; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9564; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9565; GFX8-NEXT: buffer_wbinvl1 9566; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 9567; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9568; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 9569; GFX8-NEXT: s_cbranch_execnz .LBB37_1 9570; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 9571; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 9572; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9573; GFX8-NEXT: s_setpc_b64 s[30:31] 9574; 9575; GFX7-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 9576; GFX7: ; %bb.0: 9577; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9578; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 9579; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9580; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 9581; GFX7-NEXT: flat_load_dword v5, v[0:1] 9582; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 9583; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9584; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 9585; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 9586; GFX7-NEXT: v_not_b32_e32 v4, v4 9587; GFX7-NEXT: s_mov_b64 s[4:5], 0 9588; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9589; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start 9590; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 9591; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9592; GFX7-NEXT: v_mov_b32_e32 v6, v5 9593; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 9594; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 9595; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 9596; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 9597; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 9598; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 9599; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 9600; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 9601; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9602; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9603; GFX7-NEXT: buffer_wbinvl1 9604; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 9605; 
GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9606; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 9607; GFX7-NEXT: s_cbranch_execnz .LBB37_1 9608; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 9609; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 9610; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9611; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 9612; GFX7-NEXT: s_setpc_b64 s[30:31] 9613 %gep = getelementptr bfloat, ptr %ptr, i64 1023 9614 %result = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 9615 ret bfloat %result 9616} 9617 9618define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 9619; GFX12-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 9620; GFX12: ; %bb.0: 9621; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9622; GFX12-NEXT: s_wait_expcnt 0x0 9623; GFX12-NEXT: s_wait_samplecnt 0x0 9624; GFX12-NEXT: s_wait_bvhcnt 0x0 9625; GFX12-NEXT: s_wait_kmcnt 0x0 9626; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 9627; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 9628; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9629; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 9630; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 9631; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 9632; GFX12-NEXT: s_mov_b32 s0, 0 9633; GFX12-NEXT: flat_load_b32 v5, v[0:1] 9634; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9635; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 9636; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 9637; GFX12-NEXT: v_not_b32_e32 v4, v4 9638; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start 9639; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 9640; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9641; GFX12-NEXT: v_mov_b32_e32 v6, v5 9642; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9643; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 9644; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 9645; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9646; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 9647; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 9648; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 9649; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 9650; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 9651; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 9652; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 9653; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9654; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 9655; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 9656; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 9657; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 9658; GFX12-NEXT: s_wait_storecnt 0x0 9659; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 9660; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9661; GFX12-NEXT: global_inv scope:SCOPE_DEV 9662; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 9663; GFX12-NEXT: s_wait_alu 0xfffe 9664; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 9665; GFX12-NEXT: s_wait_alu 0xfffe 9666; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9667; GFX12-NEXT: s_cbranch_execnz .LBB38_1 9668; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 9669; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 9670; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9671; GFX12-NEXT: s_wait_alu 0xfffe 9672; GFX12-NEXT: s_setpc_b64 s[30:31] 9673; 9674; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 9675; GFX940: ; %bb.0: 9676; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9677; GFX940-NEXT: s_movk_i32 s0, 0xf800 9678; GFX940-NEXT: s_mov_b32 s1, -1 9679; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 9680; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 9681; GFX940-NEXT: v_mov_b32_e32 v1, v5 9682; GFX940-NEXT: flat_load_dword v5, v[0:1] 9683; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 9684; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9685; GFX940-NEXT: s_mov_b32 s0, 0xffff 
9686; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 9687; GFX940-NEXT: v_not_b32_e32 v4, v4 9688; GFX940-NEXT: s_mov_b64 s[0:1], 0 9689; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9690; GFX940-NEXT: s_movk_i32 s2, 0x7fff 9691; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start 9692; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 9693; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9694; GFX940-NEXT: v_mov_b32_e32 v7, v5 9695; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9696; GFX940-NEXT: s_nop 0 9697; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 9698; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 9699; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 9700; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 9701; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9702; GFX940-NEXT: s_nop 1 9703; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 9704; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9705; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 9706; GFX940-NEXT: buffer_wbl2 sc1 9707; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 9708; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9709; GFX940-NEXT: buffer_inv sc1 9710; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 9711; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 9712; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 9713; GFX940-NEXT: s_cbranch_execnz .LBB38_1 9714; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 9715; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 9716; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9717; GFX940-NEXT: s_setpc_b64 s[30:31] 9718; 9719; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 9720; GFX11: ; %bb.0: 9721; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9722; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 9723; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 9724; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9725; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | 
instid1(VALU_DEP_1) 9726; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 9727; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 9728; GFX11-NEXT: s_mov_b32 s0, 0 9729; GFX11-NEXT: flat_load_b32 v5, v[0:1] 9730; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9731; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 9732; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9733; GFX11-NEXT: v_not_b32_e32 v4, v4 9734; GFX11-NEXT: .p2align 6 9735; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start 9736; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 9737; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9738; GFX11-NEXT: v_mov_b32_e32 v6, v5 9739; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9740; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 9741; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 9742; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9743; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 9744; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 9745; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 9746; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 9747; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 9748; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 9749; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 9750; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9751; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 9752; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 9753; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 9754; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 9755; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 9756; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 9757; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9758; GFX11-NEXT: buffer_gl1_inv 9759; GFX11-NEXT: buffer_gl0_inv 9760; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 9761; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 9762; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 9763; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 9764; GFX11-NEXT: s_cbranch_execnz .LBB38_1 9765; GFX11-NEXT: ; 
%bb.2: ; %atomicrmw.end 9766; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 9767; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9768; GFX11-NEXT: s_setpc_b64 s[30:31] 9769; 9770; GFX10-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 9771; GFX10: ; %bb.0: 9772; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9773; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 9774; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 9775; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9776; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 9777; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 9778; GFX10-NEXT: s_mov_b32 s4, 0 9779; GFX10-NEXT: flat_load_dword v5, v[0:1] 9780; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9781; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 9782; GFX10-NEXT: v_not_b32_e32 v4, v4 9783; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start 9784; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 9785; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9786; GFX10-NEXT: v_mov_b32_e32 v6, v5 9787; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9788; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 9789; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 9790; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 9791; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 9792; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 9793; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 9794; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9795; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 9796; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 9797; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9798; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9799; GFX10-NEXT: buffer_gl1_inv 9800; GFX10-NEXT: buffer_gl0_inv 9801; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 9802; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 9803; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 9804; GFX10-NEXT: s_cbranch_execnz .LBB38_1 9805; GFX10-NEXT: ; %bb.2: ; 
%atomicrmw.end 9806; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 9807; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9808; GFX10-NEXT: s_setpc_b64 s[30:31] 9809; 9810; GFX90A-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 9811; GFX90A: ; %bb.0: 9812; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9813; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 9814; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 9815; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 9816; GFX90A-NEXT: flat_load_dword v5, v[0:1] 9817; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 9818; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9819; GFX90A-NEXT: s_mov_b32 s4, 0xffff 9820; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9821; GFX90A-NEXT: v_not_b32_e32 v4, v4 9822; GFX90A-NEXT: s_mov_b64 s[4:5], 0 9823; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9824; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 9825; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start 9826; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 9827; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9828; GFX90A-NEXT: v_mov_b32_e32 v7, v5 9829; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9830; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 9831; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 9832; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 9833; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 9834; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9835; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 9836; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9837; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 9838; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc 9839; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9840; GFX90A-NEXT: buffer_wbinvl1 9841; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 9842; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9843; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 9844; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 9845; GFX90A-NEXT: ; 
%bb.2: ; %atomicrmw.end 9846; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 9847; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9848; GFX90A-NEXT: s_setpc_b64 s[30:31] 9849; 9850; GFX908-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 9851; GFX908: ; %bb.0: 9852; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9853; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 9854; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 9855; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 9856; GFX908-NEXT: flat_load_dword v5, v[0:1] 9857; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 9858; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9859; GFX908-NEXT: s_mov_b32 s4, 0xffff 9860; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9861; GFX908-NEXT: v_not_b32_e32 v4, v4 9862; GFX908-NEXT: s_mov_b64 s[4:5], 0 9863; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9864; GFX908-NEXT: s_movk_i32 s6, 0x7fff 9865; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start 9866; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 9867; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9868; GFX908-NEXT: v_mov_b32_e32 v6, v5 9869; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9870; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 9871; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 9872; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 9873; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 9874; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9875; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc 9876; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9877; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 9878; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9879; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9880; GFX908-NEXT: buffer_wbinvl1 9881; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 9882; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9883; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 9884; GFX908-NEXT: s_cbranch_execnz .LBB38_1 9885; 
GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 9886; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 9887; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9888; GFX908-NEXT: s_setpc_b64 s[30:31] 9889; 9890; GFX8-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 9891; GFX8: ; %bb.0: 9892; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9893; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 9894; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 9895; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 9896; GFX8-NEXT: flat_load_dword v5, v[0:1] 9897; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 9898; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9899; GFX8-NEXT: s_mov_b32 s4, 0xffff 9900; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 9901; GFX8-NEXT: v_not_b32_e32 v4, v4 9902; GFX8-NEXT: s_mov_b64 s[4:5], 0 9903; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 9904; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start 9905; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 9906; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9907; GFX8-NEXT: v_mov_b32_e32 v6, v5 9908; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 9909; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 9910; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 9911; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 9912; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 9913; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 9914; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 9915; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 9916; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 9917; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 9918; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 9919; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9920; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9921; GFX8-NEXT: buffer_wbinvl1 9922; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 9923; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9924; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 9925; GFX8-NEXT: s_cbranch_execnz .LBB38_1 9926; GFX8-NEXT: ; 
%bb.2: ; %atomicrmw.end 9927; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 9928; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9929; GFX8-NEXT: s_setpc_b64 s[30:31] 9930; 9931; GFX7-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 9932; GFX7: ; %bb.0: 9933; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9934; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0 9935; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 9936; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 9937; GFX7-NEXT: flat_load_dword v5, v[0:1] 9938; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 9939; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 9940; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 9941; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 9942; GFX7-NEXT: v_not_b32_e32 v4, v4 9943; GFX7-NEXT: s_mov_b64 s[4:5], 0 9944; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 9945; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start 9946; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 9947; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9948; GFX7-NEXT: v_mov_b32_e32 v6, v5 9949; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6 9950; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 9951; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 9952; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 9953; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 9954; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 9955; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 9956; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 9957; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 9958; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 9959; GFX7-NEXT: buffer_wbinvl1 9960; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 9961; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 9962; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 9963; GFX7-NEXT: s_cbranch_execnz .LBB38_1 9964; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 9965; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 9966; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 9967; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 9968; GFX7-NEXT: s_setpc_b64 s[30:31] 9969 %gep = getelementptr bfloat, ptr %ptr, i64 -1024 9970 %result = atomicrmw fmin ptr %gep, 
bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 9971 ret bfloat %result 9972 } 9973 9974define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 9975; GFX12-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: 9976; GFX12: ; %bb.0: 9977; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9978; GFX12-NEXT: s_wait_expcnt 0x0 9979; GFX12-NEXT: s_wait_samplecnt 0x0 9980; GFX12-NEXT: s_wait_bvhcnt 0x0 9981; GFX12-NEXT: s_wait_kmcnt 0x0 9982; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 9983; GFX12-NEXT: s_mov_b32 s0, 0 9984; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 9985; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 9986; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 9987; GFX12-NEXT: flat_load_b32 v4, v[0:1] 9988; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3 9989; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 9990; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 9991; GFX12-NEXT: v_not_b32_e32 v6, v3 9992; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start 9993; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 9994; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 9995; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4 9996; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 9997; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3 9998; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 9999; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 10000; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 10001; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3 10002; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 10003; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 10004; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10005; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo 10006; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 10007; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10008; GFX12-NEXT: 
v_lshlrev_b32_e32 v3, v5, v3 10009; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3 10010; GFX12-NEXT: s_wait_storecnt 0x0 10011; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 10012; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10013; GFX12-NEXT: global_inv scope:SCOPE_DEV 10014; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 10015; GFX12-NEXT: v_mov_b32_e32 v4, v3 10016; GFX12-NEXT: s_wait_alu 0xfffe 10017; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 10018; GFX12-NEXT: s_wait_alu 0xfffe 10019; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 10020; GFX12-NEXT: s_cbranch_execnz .LBB39_1 10021; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 10022; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 10023; GFX12-NEXT: s_wait_alu 0xfffe 10024; GFX12-NEXT: s_setpc_b64 s[30:31] 10025; 10026; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: 10027; GFX940: ; %bb.0: 10028; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10029; GFX940-NEXT: v_mov_b32_e32 v3, v0 10030; GFX940-NEXT: v_and_b32_e32 v0, -4, v3 10031; GFX940-NEXT: flat_load_dword v5, v[0:1] 10032; GFX940-NEXT: v_and_b32_e32 v3, 3, v3 10033; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 10034; GFX940-NEXT: s_mov_b32 s0, 0xffff 10035; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 10036; GFX940-NEXT: v_not_b32_e32 v6, v4 10037; GFX940-NEXT: s_mov_b64 s[0:1], 0 10038; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 10039; GFX940-NEXT: s_movk_i32 s2, 0x7fff 10040; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start 10041; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 10042; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10043; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10044; GFX940-NEXT: s_nop 0 10045; GFX940-NEXT: v_min_f32_e32 v4, v4, v2 10046; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1 10047; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 10048; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2 10049; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 10050; GFX940-NEXT: 
s_nop 1 10051; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc 10052; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10053; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4 10054; GFX940-NEXT: buffer_wbl2 sc1 10055; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 10056; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10057; GFX940-NEXT: buffer_inv sc1 10058; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 10059; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 10060; GFX940-NEXT: v_mov_b32_e32 v5, v4 10061; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 10062; GFX940-NEXT: s_cbranch_execnz .LBB39_1 10063; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 10064; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 10065; GFX940-NEXT: s_setpc_b64 s[30:31] 10066; 10067; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: 10068; GFX11: ; %bb.0: 10069; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10070; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 10071; GFX11-NEXT: s_mov_b32 s0, 0 10072; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) 10073; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 10074; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 10075; GFX11-NEXT: flat_load_b32 v4, v[0:1] 10076; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3 10077; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 10078; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 10079; GFX11-NEXT: v_not_b32_e32 v6, v3 10080; GFX11-NEXT: .p2align 6 10081; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start 10082; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 10083; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10084; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4 10085; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10086; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 10087; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 10088; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 10089; 
GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 10090; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 10091; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 10092; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 10093; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10094; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo 10095; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 10096; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10097; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3 10098; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3 10099; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 10100; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 10101; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10102; GFX11-NEXT: buffer_gl1_inv 10103; GFX11-NEXT: buffer_gl0_inv 10104; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 10105; GFX11-NEXT: v_mov_b32_e32 v4, v3 10106; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 10107; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 10108; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 10109; GFX11-NEXT: s_cbranch_execnz .LBB39_1 10110; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 10111; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 10112; GFX11-NEXT: s_setpc_b64 s[30:31] 10113; 10114; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: 10115; GFX10: ; %bb.0: 10116; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10117; GFX10-NEXT: v_mov_b32_e32 v3, v0 10118; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 10119; GFX10-NEXT: s_mov_b32 s4, 0 10120; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 10121; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 10122; GFX10-NEXT: flat_load_dword v4, v[0:1] 10123; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3 10124; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff 10125; GFX10-NEXT: v_not_b32_e32 v6, v3 10126; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start 10127; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 10128; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10129; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10130; GFX10-NEXT: v_min_f32_e32 v3, v3, v2 10131; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 10132; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3 10133; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 10134; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 10135; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo 10136; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10137; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3 10138; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 10139; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 10140; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10141; GFX10-NEXT: buffer_gl1_inv 10142; GFX10-NEXT: buffer_gl0_inv 10143; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 10144; GFX10-NEXT: v_mov_b32_e32 v4, v3 10145; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 10146; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 10147; GFX10-NEXT: s_cbranch_execnz .LBB39_1 10148; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 10149; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 10150; GFX10-NEXT: s_setpc_b64 s[30:31] 10151; 10152; GFX90A-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: 10153; GFX90A: ; %bb.0: 10154; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10155; GFX90A-NEXT: v_mov_b32_e32 v3, v0 10156; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 10157; GFX90A-NEXT: flat_load_dword v5, v[0:1] 10158; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 10159; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 10160; GFX90A-NEXT: s_mov_b32 s4, 0xffff 10161; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 10162; GFX90A-NEXT: v_not_b32_e32 v6, v4 10163; GFX90A-NEXT: s_mov_b64 s[4:5], 0 10164; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 10165; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 10166; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start 10167; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 10168; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10169; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10170; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 10171; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 10172; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 10173; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6 10174; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 10175; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc 10176; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10177; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4 10178; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc 10179; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10180; GFX90A-NEXT: buffer_wbinvl1 10181; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 10182; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10183; GFX90A-NEXT: v_mov_b32_e32 v5, v4 10184; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 10185; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 10186; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 10187; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 10188; GFX90A-NEXT: s_setpc_b64 s[30:31] 10189; 10190; GFX908-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: 10191; GFX908: ; %bb.0: 10192; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10193; GFX908-NEXT: v_mov_b32_e32 v3, v0 10194; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 10195; GFX908-NEXT: flat_load_dword v4, v[0:1] 10196; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 10197; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3 10198; GFX908-NEXT: s_mov_b32 s4, 0xffff 10199; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4 10200; GFX908-NEXT: v_not_b32_e32 v6, v3 10201; GFX908-NEXT: s_mov_b64 s[4:5], 0 10202; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 10203; GFX908-NEXT: s_movk_i32 s6, 0x7fff 10204; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start 10205; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 10206; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10207; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10208; GFX908-NEXT: v_min_f32_e32 v3, 
v3, v2 10209; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 10210; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 10211; GFX908-NEXT: v_add3_u32 v7, v7, v3, s6 10212; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 10213; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc 10214; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10215; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3 10216; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 10217; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10218; GFX908-NEXT: buffer_wbinvl1 10219; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 10220; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10221; GFX908-NEXT: v_mov_b32_e32 v4, v3 10222; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 10223; GFX908-NEXT: s_cbranch_execnz .LBB39_1 10224; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 10225; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 10226; GFX908-NEXT: s_setpc_b64 s[30:31] 10227; 10228; GFX8-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: 10229; GFX8: ; %bb.0: 10230; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10231; GFX8-NEXT: v_mov_b32_e32 v3, v0 10232; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 10233; GFX8-NEXT: flat_load_dword v4, v[0:1] 10234; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 10235; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3 10236; GFX8-NEXT: s_mov_b32 s4, 0xffff 10237; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4 10238; GFX8-NEXT: v_not_b32_e32 v6, v3 10239; GFX8-NEXT: s_mov_b64 s[4:5], 0 10240; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 10241; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start 10242; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 10243; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10244; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10245; GFX8-NEXT: v_min_f32_e32 v3, v3, v2 10246; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1 10247; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3 10248; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 10249; GFX8-NEXT: 
v_or_b32_e32 v9, 0x400000, v3 10250; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 10251; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc 10252; GFX8-NEXT: v_and_b32_e32 v7, v4, v6 10253; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10254; GFX8-NEXT: v_or_b32_e32 v3, v7, v3 10255; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 10256; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10257; GFX8-NEXT: buffer_wbinvl1 10258; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 10259; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10260; GFX8-NEXT: v_mov_b32_e32 v4, v3 10261; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 10262; GFX8-NEXT: s_cbranch_execnz .LBB39_1 10263; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 10264; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 10265; GFX8-NEXT: s_setpc_b64 s[30:31] 10266; 10267; GFX7-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: 10268; GFX7: ; %bb.0: 10269; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10270; GFX7-NEXT: v_mov_b32_e32 v3, v0 10271; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 10272; GFX7-NEXT: flat_load_dword v4, v[0:1] 10273; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 10274; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3 10275; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5 10276; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 10277; GFX7-NEXT: v_not_b32_e32 v6, v3 10278; GFX7-NEXT: s_mov_b64 s[4:5], 0 10279; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 10280; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start 10281; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 10282; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10283; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4 10284; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 10285; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 10286; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 10287; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 10288; GFX7-NEXT: v_and_b32_e32 v7, v4, v6 10289; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3 10290; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 10291; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] 
glc 10292; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10293; GFX7-NEXT: buffer_wbinvl1 10294; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 10295; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10296; GFX7-NEXT: v_mov_b32_e32 v4, v3 10297; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 10298; GFX7-NEXT: s_cbranch_execnz .LBB39_1 10299; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 10300; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 10301; GFX7-NEXT: s_setpc_b64 s[30:31] 10302 %unused = atomicrmw fmin ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 10303 ret void 10304} 10305 10306define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 10307; GFX12-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 10308; GFX12: ; %bb.0: 10309; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10310; GFX12-NEXT: s_wait_expcnt 0x0 10311; GFX12-NEXT: s_wait_samplecnt 0x0 10312; GFX12-NEXT: s_wait_bvhcnt 0x0 10313; GFX12-NEXT: s_wait_kmcnt 0x0 10314; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 10315; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 10316; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10317; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 10318; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 10319; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 10320; GFX12-NEXT: s_mov_b32 s0, 0 10321; GFX12-NEXT: flat_load_b32 v3, v[0:1] 10322; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10323; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 10324; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 10325; GFX12-NEXT: v_not_b32_e32 v5, v5 10326; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start 10327; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 10328; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10329; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 10330; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10331; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 10332; GFX12-NEXT: 
v_min_num_f32_e32 v2, v2, v6 10333; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 10334; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 10335; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 10336; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 10337; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 10338; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10339; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 10340; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 10341; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10342; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 10343; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 10344; GFX12-NEXT: s_wait_storecnt 0x0 10345; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 10346; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10347; GFX12-NEXT: global_inv scope:SCOPE_DEV 10348; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 10349; GFX12-NEXT: v_mov_b32_e32 v3, v2 10350; GFX12-NEXT: s_wait_alu 0xfffe 10351; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 10352; GFX12-NEXT: s_wait_alu 0xfffe 10353; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 10354; GFX12-NEXT: s_cbranch_execnz .LBB40_1 10355; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 10356; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 10357; GFX12-NEXT: s_wait_alu 0xfffe 10358; GFX12-NEXT: s_setpc_b64 s[30:31] 10359; 10360; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 10361; GFX940: ; %bb.0: 10362; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10363; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 10364; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 10365; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 10366; GFX940-NEXT: v_mov_b32_e32 v1, v5 10367; GFX940-NEXT: flat_load_dword v3, v[0:1] 10368; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 10369; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10370; GFX940-NEXT: s_mov_b32 s0, 0xffff 10371; GFX940-NEXT: v_lshlrev_b32_e64 
v5, v4, s0 10372; GFX940-NEXT: v_not_b32_e32 v5, v5 10373; GFX940-NEXT: s_mov_b64 s[0:1], 0 10374; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10375; GFX940-NEXT: s_movk_i32 s2, 0x7fff 10376; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start 10377; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 10378; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10379; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10380; GFX940-NEXT: s_nop 0 10381; GFX940-NEXT: v_min_f32_e32 v2, v2, v6 10382; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 10383; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 10384; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 10385; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 10386; GFX940-NEXT: s_nop 1 10387; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 10388; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10389; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 10390; GFX940-NEXT: buffer_wbl2 sc1 10391; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 10392; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10393; GFX940-NEXT: buffer_inv sc1 10394; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 10395; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 10396; GFX940-NEXT: v_mov_b32_e32 v3, v2 10397; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 10398; GFX940-NEXT: s_cbranch_execnz .LBB40_1 10399; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 10400; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 10401; GFX940-NEXT: s_setpc_b64 s[30:31] 10402; 10403; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 10404; GFX11: ; %bb.0: 10405; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10406; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 10407; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 10408; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10409; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 10410; GFX11-NEXT: v_and_b32_e32 
v0, -4, v4 10411; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 10412; GFX11-NEXT: s_mov_b32 s0, 0 10413; GFX11-NEXT: flat_load_b32 v3, v[0:1] 10414; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10415; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 10416; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 10417; GFX11-NEXT: v_not_b32_e32 v5, v5 10418; GFX11-NEXT: .p2align 6 10419; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start 10420; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 10421; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10422; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 10423; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10424; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 10425; GFX11-NEXT: v_min_f32_e32 v2, v2, v6 10426; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 10427; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 10428; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 10429; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 10430; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 10431; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10432; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 10433; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 10434; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10435; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 10436; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 10437; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 10438; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 10439; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10440; GFX11-NEXT: buffer_gl1_inv 10441; GFX11-NEXT: buffer_gl0_inv 10442; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 10443; GFX11-NEXT: v_mov_b32_e32 v3, v2 10444; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 10445; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 10446; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 10447; GFX11-NEXT: s_cbranch_execnz .LBB40_1 10448; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 10449; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 10450; GFX11-NEXT: s_setpc_b64 s[30:31] 10451; 10452; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 10453; GFX10: ; %bb.0: 10454; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10455; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 10456; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 10457; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10458; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 10459; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 10460; GFX10-NEXT: s_mov_b32 s4, 0 10461; GFX10-NEXT: flat_load_dword v3, v[0:1] 10462; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10463; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 10464; GFX10-NEXT: v_not_b32_e32 v5, v5 10465; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start 10466; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 10467; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10468; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10469; GFX10-NEXT: v_min_f32_e32 v2, v2, v6 10470; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 10471; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 10472; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 10473; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 10474; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 10475; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10476; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 10477; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 10478; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10479; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10480; GFX10-NEXT: buffer_gl1_inv 10481; GFX10-NEXT: buffer_gl0_inv 10482; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 10483; GFX10-NEXT: v_mov_b32_e32 v3, v2 10484; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 10485; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 10486; GFX10-NEXT: s_cbranch_execnz .LBB40_1 10487; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 10488; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 10489; 
GFX10-NEXT: s_setpc_b64 s[30:31] 10490; 10491; GFX90A-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 10492; GFX90A: ; %bb.0: 10493; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10494; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 10495; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 10496; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 10497; GFX90A-NEXT: flat_load_dword v3, v[0:1] 10498; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 10499; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10500; GFX90A-NEXT: s_mov_b32 s4, 0xffff 10501; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 10502; GFX90A-NEXT: v_not_b32_e32 v5, v5 10503; GFX90A-NEXT: s_mov_b64 s[4:5], 0 10504; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10505; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 10506; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start 10507; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 10508; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10509; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10510; GFX90A-NEXT: v_min_f32_e32 v2, v2, v6 10511; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 10512; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 10513; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 10514; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 10515; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 10516; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10517; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 10518; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10519; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10520; GFX90A-NEXT: buffer_wbinvl1 10521; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 10522; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10523; GFX90A-NEXT: v_mov_b32_e32 v3, v2 10524; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 10525; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 10526; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 10527; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 10528; 
GFX90A-NEXT: s_setpc_b64 s[30:31] 10529; 10530; GFX908-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 10531; GFX908: ; %bb.0: 10532; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10533; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 10534; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 10535; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 10536; GFX908-NEXT: flat_load_dword v3, v[0:1] 10537; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 10538; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10539; GFX908-NEXT: s_mov_b32 s4, 0xffff 10540; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 10541; GFX908-NEXT: v_not_b32_e32 v5, v5 10542; GFX908-NEXT: s_mov_b64 s[4:5], 0 10543; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10544; GFX908-NEXT: s_movk_i32 s6, 0x7fff 10545; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start 10546; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 10547; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10548; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10549; GFX908-NEXT: v_min_f32_e32 v2, v2, v6 10550; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 10551; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 10552; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 10553; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 10554; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 10555; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10556; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 10557; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10558; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10559; GFX908-NEXT: buffer_wbinvl1 10560; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 10561; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10562; GFX908-NEXT: v_mov_b32_e32 v3, v2 10563; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 10564; GFX908-NEXT: s_cbranch_execnz .LBB40_1 10565; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 10566; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 10567; 
GFX908-NEXT: s_setpc_b64 s[30:31] 10568; 10569; GFX8-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 10570; GFX8: ; %bb.0: 10571; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10572; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 10573; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10574; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 10575; GFX8-NEXT: flat_load_dword v3, v[0:1] 10576; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 10577; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10578; GFX8-NEXT: s_mov_b32 s4, 0xffff 10579; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 10580; GFX8-NEXT: v_not_b32_e32 v5, v5 10581; GFX8-NEXT: s_mov_b64 s[4:5], 0 10582; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10583; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start 10584; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 10585; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10586; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10587; GFX8-NEXT: v_min_f32_e32 v2, v2, v6 10588; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 10589; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 10590; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 10591; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 10592; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 10593; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc 10594; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 10595; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10596; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 10597; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10598; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10599; GFX8-NEXT: buffer_wbinvl1 10600; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 10601; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10602; GFX8-NEXT: v_mov_b32_e32 v3, v2 10603; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 10604; GFX8-NEXT: s_cbranch_execnz .LBB40_1 10605; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 10606; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 10607; GFX8-NEXT: s_setpc_b64 
s[30:31] 10608; 10609; GFX7-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 10610; GFX7: ; %bb.0: 10611; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10612; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 10613; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10614; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 10615; GFX7-NEXT: flat_load_dword v3, v[0:1] 10616; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 10617; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10618; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 10619; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 10620; GFX7-NEXT: v_not_b32_e32 v5, v5 10621; GFX7-NEXT: s_mov_b64 s[4:5], 0 10622; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 10623; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start 10624; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 10625; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10626; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 10627; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 10628; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 10629; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 10630; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 10631; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 10632; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 10633; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 10634; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10635; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10636; GFX7-NEXT: buffer_wbinvl1 10637; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 10638; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10639; GFX7-NEXT: v_mov_b32_e32 v3, v2 10640; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 10641; GFX7-NEXT: s_cbranch_execnz .LBB40_1 10642; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 10643; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 10644; GFX7-NEXT: s_setpc_b64 s[30:31] 10645 %gep = getelementptr bfloat, ptr %ptr, i64 1023 10646 %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 10647 ret void 10648} 10649 10650define void 
@flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 10651; GFX12-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 10652; GFX12: ; %bb.0: 10653; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10654; GFX12-NEXT: s_wait_expcnt 0x0 10655; GFX12-NEXT: s_wait_samplecnt 0x0 10656; GFX12-NEXT: s_wait_bvhcnt 0x0 10657; GFX12-NEXT: s_wait_kmcnt 0x0 10658; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 10659; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 10660; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10661; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 10662; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 10663; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 10664; GFX12-NEXT: s_mov_b32 s0, 0 10665; GFX12-NEXT: flat_load_b32 v3, v[0:1] 10666; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10667; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 10668; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 10669; GFX12-NEXT: v_not_b32_e32 v5, v5 10670; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start 10671; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 10672; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10673; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 10674; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10675; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 10676; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v6 10677; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 10678; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 10679; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 10680; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 10681; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 10682; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10683; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 10684; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 10685; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
10686; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 10687; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 10688; GFX12-NEXT: s_wait_storecnt 0x0 10689; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 10690; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10691; GFX12-NEXT: global_inv scope:SCOPE_DEV 10692; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 10693; GFX12-NEXT: v_mov_b32_e32 v3, v2 10694; GFX12-NEXT: s_wait_alu 0xfffe 10695; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 10696; GFX12-NEXT: s_wait_alu 0xfffe 10697; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 10698; GFX12-NEXT: s_cbranch_execnz .LBB41_1 10699; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 10700; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 10701; GFX12-NEXT: s_wait_alu 0xfffe 10702; GFX12-NEXT: s_setpc_b64 s[30:31] 10703; 10704; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 10705; GFX940: ; %bb.0: 10706; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10707; GFX940-NEXT: s_movk_i32 s0, 0xf800 10708; GFX940-NEXT: s_mov_b32 s1, -1 10709; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 10710; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 10711; GFX940-NEXT: v_mov_b32_e32 v1, v5 10712; GFX940-NEXT: flat_load_dword v3, v[0:1] 10713; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 10714; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10715; GFX940-NEXT: s_mov_b32 s0, 0xffff 10716; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 10717; GFX940-NEXT: v_not_b32_e32 v5, v5 10718; GFX940-NEXT: s_mov_b64 s[0:1], 0 10719; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10720; GFX940-NEXT: s_movk_i32 s2, 0x7fff 10721; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start 10722; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 10723; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10724; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10725; GFX940-NEXT: s_nop 0 10726; GFX940-NEXT: v_min_f32_e32 v2, v2, v6 10727; GFX940-NEXT: v_bfe_u32 v7, 
v2, 16, 1 10728; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 10729; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 10730; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 10731; GFX940-NEXT: s_nop 1 10732; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 10733; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10734; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 10735; GFX940-NEXT: buffer_wbl2 sc1 10736; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 10737; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10738; GFX940-NEXT: buffer_inv sc1 10739; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 10740; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 10741; GFX940-NEXT: v_mov_b32_e32 v3, v2 10742; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 10743; GFX940-NEXT: s_cbranch_execnz .LBB41_1 10744; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 10745; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 10746; GFX940-NEXT: s_setpc_b64 s[30:31] 10747; 10748; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 10749; GFX11: ; %bb.0: 10750; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10751; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 10752; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 10753; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10754; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 10755; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 10756; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 10757; GFX11-NEXT: s_mov_b32 s0, 0 10758; GFX11-NEXT: flat_load_b32 v3, v[0:1] 10759; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10760; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 10761; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 10762; GFX11-NEXT: v_not_b32_e32 v5, v5 10763; GFX11-NEXT: .p2align 6 10764; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start 10765; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 10766; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10767; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 
10768; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10769; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 10770; GFX11-NEXT: v_min_f32_e32 v2, v2, v6 10771; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 10772; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 10773; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 10774; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 10775; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 10776; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10777; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 10778; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 10779; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 10780; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 10781; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 10782; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 10783; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 10784; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10785; GFX11-NEXT: buffer_gl1_inv 10786; GFX11-NEXT: buffer_gl0_inv 10787; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 10788; GFX11-NEXT: v_mov_b32_e32 v3, v2 10789; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 10790; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 10791; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 10792; GFX11-NEXT: s_cbranch_execnz .LBB41_1 10793; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 10794; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 10795; GFX11-NEXT: s_setpc_b64 s[30:31] 10796; 10797; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 10798; GFX10: ; %bb.0: 10799; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10800; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 10801; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 10802; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10803; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 10804; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 10805; GFX10-NEXT: s_mov_b32 s4, 0 10806; GFX10-NEXT: 
flat_load_dword v3, v[0:1] 10807; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10808; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 10809; GFX10-NEXT: v_not_b32_e32 v5, v5 10810; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start 10811; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 10812; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10813; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10814; GFX10-NEXT: v_min_f32_e32 v2, v2, v6 10815; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 10816; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 10817; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 10818; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 10819; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 10820; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10821; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 10822; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 10823; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10824; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10825; GFX10-NEXT: buffer_gl1_inv 10826; GFX10-NEXT: buffer_gl0_inv 10827; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 10828; GFX10-NEXT: v_mov_b32_e32 v3, v2 10829; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 10830; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 10831; GFX10-NEXT: s_cbranch_execnz .LBB41_1 10832; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 10833; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 10834; GFX10-NEXT: s_setpc_b64 s[30:31] 10835; 10836; GFX90A-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 10837; GFX90A: ; %bb.0: 10838; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10839; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 10840; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 10841; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 10842; GFX90A-NEXT: flat_load_dword v3, v[0:1] 10843; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 10844; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10845; GFX90A-NEXT: 
s_mov_b32 s4, 0xffff 10846; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 10847; GFX90A-NEXT: v_not_b32_e32 v5, v5 10848; GFX90A-NEXT: s_mov_b64 s[4:5], 0 10849; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10850; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 10851; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start 10852; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 10853; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10854; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10855; GFX90A-NEXT: v_min_f32_e32 v2, v2, v6 10856; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 10857; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 10858; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 10859; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 10860; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 10861; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10862; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 10863; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10864; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10865; GFX90A-NEXT: buffer_wbinvl1 10866; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 10867; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10868; GFX90A-NEXT: v_mov_b32_e32 v3, v2 10869; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 10870; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 10871; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 10872; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 10873; GFX90A-NEXT: s_setpc_b64 s[30:31] 10874; 10875; GFX908-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 10876; GFX908: ; %bb.0: 10877; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10878; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 10879; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 10880; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 10881; GFX908-NEXT: flat_load_dword v3, v[0:1] 10882; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 10883; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10884; 
GFX908-NEXT: s_mov_b32 s4, 0xffff 10885; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 10886; GFX908-NEXT: v_not_b32_e32 v5, v5 10887; GFX908-NEXT: s_mov_b64 s[4:5], 0 10888; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10889; GFX908-NEXT: s_movk_i32 s6, 0x7fff 10890; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start 10891; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 10892; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10893; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10894; GFX908-NEXT: v_min_f32_e32 v2, v2, v6 10895; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 10896; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 10897; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 10898; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 10899; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 10900; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10901; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 10902; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10903; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10904; GFX908-NEXT: buffer_wbinvl1 10905; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 10906; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10907; GFX908-NEXT: v_mov_b32_e32 v3, v2 10908; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 10909; GFX908-NEXT: s_cbranch_execnz .LBB41_1 10910; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 10911; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 10912; GFX908-NEXT: s_setpc_b64 s[30:31] 10913; 10914; GFX8-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 10915; GFX8: ; %bb.0: 10916; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10917; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0 10918; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 10919; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 10920; GFX8-NEXT: flat_load_dword v3, v[0:1] 10921; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 10922; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10923; GFX8-NEXT: s_mov_b32 
s4, 0xffff 10924; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 10925; GFX8-NEXT: v_not_b32_e32 v5, v5 10926; GFX8-NEXT: s_mov_b64 s[4:5], 0 10927; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 10928; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start 10929; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 10930; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10931; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 10932; GFX8-NEXT: v_min_f32_e32 v2, v2, v6 10933; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 10934; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 10935; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 10936; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 10937; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 10938; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc 10939; GFX8-NEXT: v_and_b32_e32 v7, v3, v5 10940; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 10941; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 10942; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10943; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10944; GFX8-NEXT: buffer_wbinvl1 10945; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 10946; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10947; GFX8-NEXT: v_mov_b32_e32 v3, v2 10948; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 10949; GFX8-NEXT: s_cbranch_execnz .LBB41_1 10950; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 10951; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 10952; GFX8-NEXT: s_setpc_b64 s[30:31] 10953; 10954; GFX7-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 10955; GFX7: ; %bb.0: 10956; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10957; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 10958; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 10959; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 10960; GFX7-NEXT: flat_load_dword v3, v[0:1] 10961; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 10962; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 10963; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 
10964; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 10965; GFX7-NEXT: v_not_b32_e32 v5, v5 10966; GFX7-NEXT: s_mov_b64 s[4:5], 0 10967; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 10968; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start 10969; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 10970; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10971; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 10972; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 10973; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 10974; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 10975; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 10976; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 10977; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 10978; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 10979; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 10980; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 10981; GFX7-NEXT: buffer_wbinvl1 10982; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 10983; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 10984; GFX7-NEXT: v_mov_b32_e32 v3, v2 10985; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 10986; GFX7-NEXT: s_cbranch_execnz .LBB41_1 10987; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 10988; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 10989; GFX7-NEXT: s_setpc_b64 s[30:31] 10990 %gep = getelementptr bfloat, ptr %ptr, i64 -1024 10991 %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 10992 ret void 10993} 10994 10995define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 10996; GFX12-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 10997; GFX12: ; %bb.0: 10998; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 10999; GFX12-NEXT: s_wait_expcnt 0x0 11000; GFX12-NEXT: s_wait_samplecnt 0x0 11001; GFX12-NEXT: s_wait_bvhcnt 0x0 11002; GFX12-NEXT: s_wait_kmcnt 0x0 11003; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 11004; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11005; GFX12-NEXT: s_mov_b32 s0, 0 11006; GFX12-NEXT: 
.LBB42_1: ; %atomicrmw.start 11007; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 11008; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11009; GFX12-NEXT: v_mov_b32_e32 v4, v3 11010; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11011; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4 11012; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v2 11013; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 11014; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1 11015; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3 11016; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 11017; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff 11018; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11019; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo 11020; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3 11021; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 11022; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 11023; GFX12-NEXT: s_wait_storecnt 0x0 11024; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 11025; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11026; GFX12-NEXT: global_inv scope:SCOPE_DEV 11027; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 11028; GFX12-NEXT: s_wait_alu 0xfffe 11029; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 11030; GFX12-NEXT: s_wait_alu 0xfffe 11031; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11032; GFX12-NEXT: s_cbranch_execnz .LBB42_1 11033; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 11034; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 11035; GFX12-NEXT: v_mov_b32_e32 v0, v3 11036; GFX12-NEXT: s_wait_alu 0xfffe 11037; GFX12-NEXT: s_setpc_b64 s[30:31] 11038; 11039; GFX940-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 11040; GFX940: ; %bb.0: 11041; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11042; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 11043; GFX940-NEXT: s_mov_b64 s[0:1], 0 11044; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, 
v2 11045; GFX940-NEXT: s_movk_i32 s2, 0x7fff 11046; GFX940-NEXT: s_mov_b32 s3, 0xffff0000 11047; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start 11048; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 11049; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11050; GFX940-NEXT: v_mov_b32_e32 v5, v3 11051; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5 11052; GFX940-NEXT: v_min_f32_e32 v3, v3, v2 11053; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1 11054; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3 11055; GFX940-NEXT: v_add3_u32 v4, v4, v3, s2 11056; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 11057; GFX940-NEXT: s_nop 1 11058; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc 11059; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3 11060; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3 11061; GFX940-NEXT: buffer_wbl2 sc1 11062; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0 11063; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11064; GFX940-NEXT: buffer_inv sc1 11065; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 11066; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 11067; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 11068; GFX940-NEXT: s_cbranch_execnz .LBB42_1 11069; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 11070; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 11071; GFX940-NEXT: v_mov_b32_e32 v0, v3 11072; GFX940-NEXT: s_setpc_b64 s[30:31] 11073; 11074; GFX11-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 11075; GFX11: ; %bb.0: 11076; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11077; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 11078; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11079; GFX11-NEXT: s_mov_b32 s0, 0 11080; GFX11-NEXT: .p2align 6 11081; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start 11082; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 11083; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11084; GFX11-NEXT: v_mov_b32_e32 v4, v3 11085; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11086; GFX11-NEXT: v_lshlrev_b32_e32 
v3, 16, v4 11087; GFX11-NEXT: v_min_f32_e32 v3, v3, v2 11088; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 11089; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1 11090; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3 11091; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 11092; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff 11093; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11094; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo 11095; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 11096; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 11097; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 11098; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 11099; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc 11100; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11101; GFX11-NEXT: buffer_gl1_inv 11102; GFX11-NEXT: buffer_gl0_inv 11103; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 11104; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 11105; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 11106; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11107; GFX11-NEXT: s_cbranch_execnz .LBB42_1 11108; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 11109; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 11110; GFX11-NEXT: v_mov_b32_e32 v0, v3 11111; GFX11-NEXT: s_setpc_b64 s[30:31] 11112; 11113; GFX10-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 11114; GFX10: ; %bb.0: 11115; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11116; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 11117; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 11118; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 11119; GFX10-NEXT: s_mov_b32 s4, 0 11120; GFX10-NEXT: flat_load_dword v0, v[3:4] 11121; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start 11122; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 11123; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11124; GFX10-NEXT: v_mov_b32_e32 v6, v0 11125; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 11126; 
GFX10-NEXT: v_min_f32_e32 v0, v0, v1 11127; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1 11128; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 11129; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 11130; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff 11131; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc_lo 11132; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 11133; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0 11134; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 11135; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 11136; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11137; GFX10-NEXT: buffer_gl1_inv 11138; GFX10-NEXT: buffer_gl0_inv 11139; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 11140; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 11141; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 11142; GFX10-NEXT: s_cbranch_execnz .LBB42_1 11143; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 11144; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 11145; GFX10-NEXT: s_setpc_b64 s[30:31] 11146; 11147; GFX90A-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 11148; GFX90A: ; %bb.0: 11149; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11150; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 11151; GFX90A-NEXT: s_mov_b64 s[4:5], 0 11152; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11153; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 11154; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 11155; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start 11156; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 11157; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11158; GFX90A-NEXT: v_mov_b32_e32 v5, v3 11159; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 11160; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 11161; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1 11162; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3 11163; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6 11164; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 11165; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc 11166; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3 11167; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3 
11168; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc 11169; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11170; GFX90A-NEXT: buffer_wbinvl1 11171; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 11172; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11173; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 11174; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 11175; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 11176; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 11177; GFX90A-NEXT: v_mov_b32_e32 v0, v3 11178; GFX90A-NEXT: s_setpc_b64 s[30:31] 11179; 11180; GFX908-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 11181; GFX908: ; %bb.0: 11182; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11183; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 11184; GFX908-NEXT: s_mov_b64 s[4:5], 0 11185; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11186; GFX908-NEXT: s_movk_i32 s6, 0x7fff 11187; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 11188; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start 11189; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 11190; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11191; GFX908-NEXT: v_mov_b32_e32 v4, v3 11192; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4 11193; GFX908-NEXT: v_min_f32_e32 v3, v3, v2 11194; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1 11195; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3 11196; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6 11197; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 11198; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 11199; GFX908-NEXT: v_lshrrev_b32_e32 v3, 16, v3 11200; GFX908-NEXT: v_and_or_b32 v3, v4, s7, v3 11201; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc 11202; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11203; GFX908-NEXT: buffer_wbinvl1 11204; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 11205; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11206; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 11207; GFX908-NEXT: s_cbranch_execnz .LBB42_1 11208; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 
11209; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 11210; GFX908-NEXT: v_mov_b32_e32 v0, v3 11211; GFX908-NEXT: s_setpc_b64 s[30:31] 11212; 11213; GFX8-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 11214; GFX8: ; %bb.0: 11215; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11216; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 11217; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 11218; GFX8-NEXT: flat_load_dword v0, v[3:4] 11219; GFX8-NEXT: s_mov_b64 s[4:5], 0 11220; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 11221; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start 11222; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 11223; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11224; GFX8-NEXT: v_mov_b32_e32 v6, v0 11225; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 11226; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 11227; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1 11228; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0 11229; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 11230; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0 11231; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 11232; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 11233; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc 11234; GFX8-NEXT: v_or_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11235; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 11236; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11237; GFX8-NEXT: buffer_wbinvl1 11238; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 11239; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11240; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 11241; GFX8-NEXT: s_cbranch_execnz .LBB42_1 11242; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 11243; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 11244; GFX8-NEXT: s_setpc_b64 s[30:31] 11245; 11246; GFX7-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: 11247; GFX7: ; %bb.0: 11248; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11249; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 11250; 
GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 11251; GFX7-NEXT: flat_load_dword v3, v[0:1] 11252; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 11253; GFX7-NEXT: s_mov_b64 s[4:5], 0 11254; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 11255; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start 11256; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 11257; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11258; GFX7-NEXT: v_mov_b32_e32 v4, v3 11259; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 11260; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 11261; GFX7-NEXT: v_min_f32_e32 v3, v3, v2 11262; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 11263; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 11264; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 11265; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 11266; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11267; GFX7-NEXT: buffer_wbinvl1 11268; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 11269; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11270; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 11271; GFX7-NEXT: s_cbranch_execnz .LBB42_1 11272; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 11273; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 11274; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 11275; GFX7-NEXT: s_setpc_b64 s[30:31] 11276 %gep = getelementptr bfloat, ptr %ptr, i64 1023 11277 %result = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 11278 ret bfloat %result 11279} 11280; No-return bf16 case: atomicrmw fmin through a flat pointer at %ptr plus 1023 bfloat elements, syncscope("agent"), seq_cst, align 4, tagged !amdgpu.no.fine.grained.memory, with the result left unused. Every checked target expands it to a 32-bit compare-and-swap loop that rewrites only the low 16 bits of the loaded dword. The GFX12, GFX940, GFX11, GFX90A and GFX908 checks carry the byte offset on the memory instructions (offset:2046), whereas the GFX10, GFX8 and GFX7 checks add 0x7fe to the address up front. 11281define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 11282; GFX12-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 11283; GFX12: ; %bb.0: 11284; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11285; GFX12-NEXT: s_wait_expcnt 0x0 11286; GFX12-NEXT: s_wait_samplecnt 0x0 11287; GFX12-NEXT: s_wait_bvhcnt 0x0 11288; GFX12-NEXT: s_wait_kmcnt 0x0 11289; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046 11290; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 11291; GFX12-NEXT: s_mov_b32 
s0, 0 11292; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start 11293; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 11294; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11295; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 11296; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11297; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 11298; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1 11299; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2 11300; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 11301; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 11302; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff 11303; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo 11304; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11305; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11306; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 11307; GFX12-NEXT: s_wait_storecnt 0x0 11308; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 11309; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11310; GFX12-NEXT: global_inv scope:SCOPE_DEV 11311; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 11312; GFX12-NEXT: v_mov_b32_e32 v3, v2 11313; GFX12-NEXT: s_wait_alu 0xfffe 11314; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 11315; GFX12-NEXT: s_wait_alu 0xfffe 11316; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11317; GFX12-NEXT: s_cbranch_execnz .LBB43_1 11318; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 11319; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 11320; GFX12-NEXT: s_wait_alu 0xfffe 11321; GFX12-NEXT: s_setpc_b64 s[30:31] 11322; 11323; GFX940-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 11324; GFX940: ; %bb.0: 11325; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11326; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046 11327; GFX940-NEXT: s_mov_b64 s[0:1], 0 11328; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 11329; GFX940-NEXT: s_movk_i32 s2, 0x7fff 11330; GFX940-NEXT: 
s_mov_b32 s3, 0xffff0000 11331; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start 11332; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 11333; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11334; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 11335; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 11336; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1 11337; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2 11338; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2 11339; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 11340; GFX940-NEXT: s_nop 1 11341; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc 11342; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11343; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2 11344; GFX940-NEXT: buffer_wbl2 sc1 11345; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0 11346; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11347; GFX940-NEXT: buffer_inv sc1 11348; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 11349; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 11350; GFX940-NEXT: v_mov_b32_e32 v3, v2 11351; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 11352; GFX940-NEXT: s_cbranch_execnz .LBB43_1 11353; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 11354; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 11355; GFX940-NEXT: s_setpc_b64 s[30:31] 11356; 11357; GFX11-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 11358; GFX11: ; %bb.0: 11359; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11360; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046 11361; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 11362; GFX11-NEXT: s_mov_b32 s0, 0 11363; GFX11-NEXT: .p2align 6 11364; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start 11365; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 11366; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11367; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 11368; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11369; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 11370; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1 11371; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, 
v2 11372; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 11373; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 11374; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff 11375; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo 11376; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11377; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11378; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 11379; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 11380; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc 11381; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11382; GFX11-NEXT: buffer_gl1_inv 11383; GFX11-NEXT: buffer_gl0_inv 11384; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 11385; GFX11-NEXT: v_mov_b32_e32 v3, v2 11386; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 11387; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 11388; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11389; GFX11-NEXT: s_cbranch_execnz .LBB43_1 11390; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 11391; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 11392; GFX11-NEXT: s_setpc_b64 s[30:31] 11393; 11394; GFX10-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 11395; GFX10: ; %bb.0: 11396; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11397; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0 11398; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 11399; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 11400; GFX10-NEXT: s_mov_b32 s4, 0 11401; GFX10-NEXT: flat_load_dword v3, v[0:1] 11402; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start 11403; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 11404; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11405; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 11406; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 11407; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 11408; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2 11409; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 11410; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff 11411; 
GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo 11412; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11413; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 11414; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 11415; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11416; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11417; GFX10-NEXT: buffer_gl1_inv 11418; GFX10-NEXT: buffer_gl0_inv 11419; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 11420; GFX10-NEXT: v_mov_b32_e32 v3, v2 11421; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 11422; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 11423; GFX10-NEXT: s_cbranch_execnz .LBB43_1 11424; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 11425; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 11426; GFX10-NEXT: s_setpc_b64 s[30:31] 11427; 11428; GFX90A-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 11429; GFX90A: ; %bb.0: 11430; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11431; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046 11432; GFX90A-NEXT: s_mov_b64 s[4:5], 0 11433; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 11434; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 11435; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000 11436; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start 11437; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 11438; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11439; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 11440; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 11441; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1 11442; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2 11443; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6 11444; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 11445; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc 11446; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11447; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2 11448; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc 11449; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11450; GFX90A-NEXT: buffer_wbinvl1 11451; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 11452; GFX90A-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] 11453; GFX90A-NEXT: v_mov_b32_e32 v3, v2 11454; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 11455; GFX90A-NEXT: s_cbranch_execnz .LBB43_1 11456; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 11457; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 11458; GFX90A-NEXT: s_setpc_b64 s[30:31] 11459; 11460; GFX908-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 11461; GFX908: ; %bb.0: 11462; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11463; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046 11464; GFX908-NEXT: s_mov_b64 s[4:5], 0 11465; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 11466; GFX908-NEXT: s_movk_i32 s6, 0x7fff 11467; GFX908-NEXT: s_mov_b32 s7, 0xffff0000 11468; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start 11469; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 11470; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11471; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 11472; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 11473; GFX908-NEXT: v_bfe_u32 v5, v2, 16, 1 11474; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v2 11475; GFX908-NEXT: v_add3_u32 v5, v5, v2, s6 11476; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 11477; GFX908-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc 11478; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11479; GFX908-NEXT: v_and_or_b32 v2, v3, s7, v2 11480; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc 11481; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11482; GFX908-NEXT: buffer_wbinvl1 11483; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 11484; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11485; GFX908-NEXT: v_mov_b32_e32 v3, v2 11486; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 11487; GFX908-NEXT: s_cbranch_execnz .LBB43_1 11488; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 11489; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 11490; GFX908-NEXT: s_setpc_b64 s[30:31] 11491; 11492; GFX8-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 11493; GFX8: ; %bb.0: 
11494; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11495; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0 11496; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 11497; GFX8-NEXT: flat_load_dword v3, v[0:1] 11498; GFX8-NEXT: s_mov_b64 s[4:5], 0 11499; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 11500; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start 11501; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 11502; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11503; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 11504; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 11505; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 11506; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 11507; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 11508; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2 11509; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 11510; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 11511; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc 11512; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11513; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11514; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11515; GFX8-NEXT: buffer_wbinvl1 11516; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 11517; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11518; GFX8-NEXT: v_mov_b32_e32 v3, v2 11519; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 11520; GFX8-NEXT: s_cbranch_execnz .LBB43_1 11521; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 11522; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 11523; GFX8-NEXT: s_setpc_b64 s[30:31] 11524; 11525; GFX7-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: 11526; GFX7: ; %bb.0: 11527; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11528; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0 11529; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 11530; GFX7-NEXT: flat_load_dword v3, v[0:1] 11531; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 11532; GFX7-NEXT: s_mov_b64 s[4:5], 0 11533; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 11534; GFX7-NEXT: .LBB43_1: ; 
%atomicrmw.start 11535; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 11536; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11537; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 11538; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 11539; GFX7-NEXT: v_min_f32_e32 v2, v2, v4 11540; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 11541; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11542; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 11543; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 11544; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11545; GFX7-NEXT: buffer_wbinvl1 11546; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 11547; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11548; GFX7-NEXT: v_mov_b32_e32 v3, v2 11549; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 11550; GFX7-NEXT: s_cbranch_execnz .LBB43_1 11551; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 11552; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 11553; GFX7-NEXT: s_setpc_b64 s[30:31] 11554 %gep = getelementptr bfloat, ptr %ptr, i64 1023 11555 %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 11556 ret void 11557} 11558; Returning bf16 case: atomicrmw fmin through a flat pointer at %ptr plus 1023 bfloat elements, seq_cst with no syncscope and no explicit alignment, tagged !amdgpu.no.fine.grained.memory, with the old value returned. In every checked target the address gets 0x7fe added and is masked with -4 to address the containing dword, the low two address bits times 8 give a bit shift, and a 32-bit compare-and-swap loop replaces only the 16 bits selected by the shifted 0xffff mask; after the loop the loaded dword is shifted right by that amount to produce the return value. 11559define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 11560; GFX12-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11561; GFX12: ; %bb.0: 11562; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11563; GFX12-NEXT: s_wait_expcnt 0x0 11564; GFX12-NEXT: s_wait_samplecnt 0x0 11565; GFX12-NEXT: s_wait_bvhcnt 0x0 11566; GFX12-NEXT: s_wait_kmcnt 0x0 11567; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 11568; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 11569; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11570; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 11571; GFX12-NEXT: v_and_b32_e32 v0, -4, v3 11572; GFX12-NEXT: v_and_b32_e32 v3, 3, v3 11573; GFX12-NEXT: s_mov_b32 s0, 0 11574; GFX12-NEXT: flat_load_b32 v5, v[0:1] 11575; GFX12-NEXT: 
v_lshlrev_b32_e32 v3, 3, v3 11576; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 11577; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 11578; GFX12-NEXT: v_not_b32_e32 v4, v4 11579; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start 11580; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 11581; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11582; GFX12-NEXT: v_mov_b32_e32 v6, v5 11583; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11584; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6 11585; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5 11586; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11587; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 11588; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1 11589; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5 11590; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 11591; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 11592; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 11593; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 11594; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11595; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5 11596; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5 11597; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 11598; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5 11599; GFX12-NEXT: global_wb scope:SCOPE_SYS 11600; GFX12-NEXT: s_wait_storecnt 0x0 11601; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_SYS 11602; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11603; GFX12-NEXT: global_inv scope:SCOPE_SYS 11604; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 11605; GFX12-NEXT: s_wait_alu 0xfffe 11606; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 11607; GFX12-NEXT: s_wait_alu 0xfffe 11608; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11609; GFX12-NEXT: s_cbranch_execnz .LBB44_1 11610; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 11611; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 11612; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11613; 
GFX12-NEXT: s_wait_alu 0xfffe 11614; GFX12-NEXT: s_setpc_b64 s[30:31] 11615; 11616; GFX940-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11617; GFX940: ; %bb.0: 11618; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11619; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 11620; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 11621; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 11622; GFX940-NEXT: v_mov_b32_e32 v1, v5 11623; GFX940-NEXT: flat_load_dword v5, v[0:1] 11624; GFX940-NEXT: v_and_b32_e32 v3, 3, v4 11625; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11626; GFX940-NEXT: s_mov_b32 s0, 0xffff 11627; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0 11628; GFX940-NEXT: v_not_b32_e32 v4, v4 11629; GFX940-NEXT: s_mov_b64 s[0:1], 0 11630; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11631; GFX940-NEXT: s_movk_i32 s2, 0x7fff 11632; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start 11633; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 11634; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11635; GFX940-NEXT: v_mov_b32_e32 v7, v5 11636; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11637; GFX940-NEXT: s_nop 0 11638; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 11639; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1 11640; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5 11641; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2 11642; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11643; GFX940-NEXT: s_nop 1 11644; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 11645; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11646; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5 11647; GFX940-NEXT: buffer_wbl2 sc0 sc1 11648; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1 11649; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11650; GFX940-NEXT: buffer_inv sc0 sc1 11651; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 11652; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 11653; GFX940-NEXT: 
s_andn2_b64 exec, exec, s[0:1] 11654; GFX940-NEXT: s_cbranch_execnz .LBB44_1 11655; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 11656; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 11657; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11658; GFX940-NEXT: s_setpc_b64 s[30:31] 11659; 11660; GFX11-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11661; GFX11: ; %bb.0: 11662; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11663; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 11664; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 11665; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11666; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 11667; GFX11-NEXT: v_and_b32_e32 v0, -4, v3 11668; GFX11-NEXT: v_and_b32_e32 v3, 3, v3 11669; GFX11-NEXT: s_mov_b32 s0, 0 11670; GFX11-NEXT: flat_load_b32 v5, v[0:1] 11671; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11672; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 11673; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 11674; GFX11-NEXT: v_not_b32_e32 v4, v4 11675; GFX11-NEXT: .p2align 6 11676; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start 11677; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 11678; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11679; GFX11-NEXT: v_mov_b32_e32 v6, v5 11680; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11681; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6 11682; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 11683; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11684; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 11685; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1 11686; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5 11687; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 11688; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 11689; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 11690; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 11691; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) 11692; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 11693; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5 11694; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 11695; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5 11696; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 11697; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc 11698; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11699; GFX11-NEXT: buffer_gl1_inv 11700; GFX11-NEXT: buffer_gl0_inv 11701; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 11702; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 11703; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 11704; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11705; GFX11-NEXT: s_cbranch_execnz .LBB44_1 11706; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 11707; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 11708; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11709; GFX11-NEXT: s_setpc_b64 s[30:31] 11710; 11711; GFX10-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11712; GFX10: ; %bb.0: 11713; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11714; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 11715; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 11716; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11717; GFX10-NEXT: v_and_b32_e32 v0, -4, v3 11718; GFX10-NEXT: v_and_b32_e32 v3, 3, v3 11719; GFX10-NEXT: s_mov_b32 s4, 0 11720; GFX10-NEXT: flat_load_dword v5, v[0:1] 11721; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11722; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff 11723; GFX10-NEXT: v_not_b32_e32 v4, v4 11724; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start 11725; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 11726; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11727; GFX10-NEXT: v_mov_b32_e32 v6, v5 11728; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11729; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 11730; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 11731; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5 
11732; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 11733; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff 11734; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo 11735; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11736; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5 11737; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 11738; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11739; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11740; GFX10-NEXT: buffer_gl1_inv 11741; GFX10-NEXT: buffer_gl0_inv 11742; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6 11743; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 11744; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 11745; GFX10-NEXT: s_cbranch_execnz .LBB44_1 11746; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 11747; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 11748; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11749; GFX10-NEXT: s_setpc_b64 s[30:31] 11750; 11751; GFX90A-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11752; GFX90A: ; %bb.0: 11753; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11754; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 11755; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 11756; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3 11757; GFX90A-NEXT: flat_load_dword v5, v[0:1] 11758; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3 11759; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11760; GFX90A-NEXT: s_mov_b32 s4, 0xffff 11761; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4 11762; GFX90A-NEXT: v_not_b32_e32 v4, v4 11763; GFX90A-NEXT: s_mov_b64 s[4:5], 0 11764; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11765; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 11766; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start 11767; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 11768; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11769; GFX90A-NEXT: v_mov_b32_e32 v7, v5 11770; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11771; GFX90A-NEXT: 
v_min_f32_e32 v5, v5, v2 11772; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 11773; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5 11774; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6 11775; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11776; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc 11777; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11778; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5 11779; GFX90A-NEXT: buffer_wbl2 11780; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc 11781; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11782; GFX90A-NEXT: buffer_invl2 11783; GFX90A-NEXT: buffer_wbinvl1 11784; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 11785; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11786; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 11787; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 11788; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 11789; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 11790; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11791; GFX90A-NEXT: s_setpc_b64 s[30:31] 11792; 11793; GFX908-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11794; GFX908: ; %bb.0: 11795; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11796; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0 11797; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 11798; GFX908-NEXT: v_and_b32_e32 v0, -4, v3 11799; GFX908-NEXT: flat_load_dword v5, v[0:1] 11800; GFX908-NEXT: v_and_b32_e32 v3, 3, v3 11801; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11802; GFX908-NEXT: s_mov_b32 s4, 0xffff 11803; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4 11804; GFX908-NEXT: v_not_b32_e32 v4, v4 11805; GFX908-NEXT: s_mov_b64 s[4:5], 0 11806; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11807; GFX908-NEXT: s_movk_i32 s6, 0x7fff 11808; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start 11809; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 11810; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11811; GFX908-NEXT: v_mov_b32_e32 v6, v5 11812; GFX908-NEXT: 
v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11813; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 11814; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1 11815; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5 11816; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6 11817; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11818; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc 11819; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11820; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5 11821; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11822; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11823; GFX908-NEXT: buffer_wbinvl1 11824; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 11825; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11826; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 11827; GFX908-NEXT: s_cbranch_execnz .LBB44_1 11828; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 11829; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 11830; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11831; GFX908-NEXT: s_setpc_b64 s[30:31] 11832; 11833; GFX8-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11834; GFX8: ; %bb.0: 11835; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11836; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0 11837; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 11838; GFX8-NEXT: v_and_b32_e32 v0, -4, v3 11839; GFX8-NEXT: flat_load_dword v5, v[0:1] 11840; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 11841; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11842; GFX8-NEXT: s_mov_b32 s4, 0xffff 11843; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4 11844; GFX8-NEXT: v_not_b32_e32 v4, v4 11845; GFX8-NEXT: s_mov_b64 s[4:5], 0 11846; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11847; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start 11848; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 11849; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11850; GFX8-NEXT: v_mov_b32_e32 v6, v5 11851; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11852; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 11853; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1 11854; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5 11855; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 11856; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5 11857; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 11858; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc 11859; GFX8-NEXT: v_and_b32_e32 v7, v6, v4 11860; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 11861; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 11862; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11863; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11864; GFX8-NEXT: buffer_wbinvl1 11865; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 11866; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11867; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 11868; GFX8-NEXT: s_cbranch_execnz .LBB44_1 11869; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 11870; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 11871; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11872; GFX8-NEXT: s_setpc_b64 s[30:31] 11873; 11874; GFX7-LABEL: flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11875; GFX7: ; %bb.0: 11876; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11877; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0 11878; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 11879; GFX7-NEXT: v_and_b32_e32 v0, -4, v3 11880; GFX7-NEXT: flat_load_dword v5, v[0:1] 11881; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 11882; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 11883; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3 11884; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 11885; GFX7-NEXT: v_not_b32_e32 v4, v4 11886; GFX7-NEXT: s_mov_b64 s[4:5], 0 11887; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 11888; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start 11889; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 11890; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11891; GFX7-NEXT: v_mov_b32_e32 v6, v5 11892; GFX7-NEXT: 
v_lshrrev_b32_e32 v5, v3, v6 11893; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 11894; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 11895; GFX7-NEXT: v_min_f32_e32 v5, v5, v2 11896; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 11897; GFX7-NEXT: v_and_b32_e32 v7, v6, v4 11898; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5 11899; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 11900; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc 11901; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11902; GFX7-NEXT: buffer_wbinvl1 11903; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6 11904; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 11905; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 11906; GFX7-NEXT: s_cbranch_execnz .LBB44_1 11907; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 11908; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 11909; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 11910; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 11911; GFX7-NEXT: s_setpc_b64 s[30:31] 11912 %gep = getelementptr bfloat, ptr %ptr, i64 1023 11913 %result = atomicrmw fmin ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 11914 ret bfloat %result 11915} 11916 11917define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 { 11918; GFX12-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11919; GFX12: ; %bb.0: 11920; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11921; GFX12-NEXT: s_wait_expcnt 0x0 11922; GFX12-NEXT: s_wait_samplecnt 0x0 11923; GFX12-NEXT: s_wait_bvhcnt 0x0 11924; GFX12-NEXT: s_wait_kmcnt 0x0 11925; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 11926; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 11927; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2 11928; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 11929; GFX12-NEXT: v_and_b32_e32 v0, -4, v4 11930; GFX12-NEXT: v_and_b32_e32 v4, 3, v4 11931; GFX12-NEXT: s_mov_b32 s0, 0 11932; GFX12-NEXT: flat_load_b32 v3, v[0:1] 11933; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4 
11934; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 11935; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 11936; GFX12-NEXT: v_not_b32_e32 v5, v5 11937; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start 11938; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 11939; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11940; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3 11941; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11942; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 11943; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v6 11944; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 11945; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 11946; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2 11947; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 11948; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 11949; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11950; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 11951; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2 11952; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 11953; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2 11954; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2 11955; GFX12-NEXT: global_wb scope:SCOPE_SYS 11956; GFX12-NEXT: s_wait_storecnt 0x0 11957; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS 11958; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 11959; GFX12-NEXT: global_inv scope:SCOPE_SYS 11960; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 11961; GFX12-NEXT: v_mov_b32_e32 v3, v2 11962; GFX12-NEXT: s_wait_alu 0xfffe 11963; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 11964; GFX12-NEXT: s_wait_alu 0xfffe 11965; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 11966; GFX12-NEXT: s_cbranch_execnz .LBB45_1 11967; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 11968; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 11969; GFX12-NEXT: s_wait_alu 0xfffe 11970; GFX12-NEXT: s_setpc_b64 s[30:31] 11971; 11972; GFX940-LABEL: 
flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 11973; GFX940: ; %bb.0: 11974; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11975; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe 11976; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] 11977; GFX940-NEXT: v_and_b32_e32 v0, -4, v4 11978; GFX940-NEXT: v_mov_b32_e32 v1, v5 11979; GFX940-NEXT: flat_load_dword v3, v[0:1] 11980; GFX940-NEXT: v_and_b32_e32 v4, 3, v4 11981; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4 11982; GFX940-NEXT: s_mov_b32 s0, 0xffff 11983; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0 11984; GFX940-NEXT: v_not_b32_e32 v5, v5 11985; GFX940-NEXT: s_mov_b64 s[0:1], 0 11986; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2 11987; GFX940-NEXT: s_movk_i32 s2, 0x7fff 11988; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start 11989; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 11990; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 11991; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 11992; GFX940-NEXT: s_nop 0 11993; GFX940-NEXT: v_min_f32_e32 v2, v2, v6 11994; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 11995; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 11996; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2 11997; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 11998; GFX940-NEXT: s_nop 1 11999; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 12000; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12001; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2 12002; GFX940-NEXT: buffer_wbl2 sc0 sc1 12003; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 12004; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12005; GFX940-NEXT: buffer_inv sc0 sc1 12006; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 12007; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 12008; GFX940-NEXT: v_mov_b32_e32 v3, v2 12009; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 12010; GFX940-NEXT: s_cbranch_execnz .LBB45_1 12011; GFX940-NEXT: ; %bb.2: 
; %atomicrmw.end 12012; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 12013; GFX940-NEXT: s_setpc_b64 s[30:31] 12014; 12015; GFX11-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 12016; GFX11: ; %bb.0: 12017; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12018; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 12019; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 12020; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12021; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) 12022; GFX11-NEXT: v_and_b32_e32 v0, -4, v4 12023; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 12024; GFX11-NEXT: s_mov_b32 s0, 0 12025; GFX11-NEXT: flat_load_b32 v3, v[0:1] 12026; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12027; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 12028; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 12029; GFX11-NEXT: v_not_b32_e32 v5, v5 12030; GFX11-NEXT: .p2align 6 12031; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start 12032; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 12033; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12034; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3 12035; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12036; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 12037; GFX11-NEXT: v_min_f32_e32 v2, v2, v6 12038; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) 12039; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 12040; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 12041; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 12042; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 12043; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12044; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 12045; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 12046; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12047; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 12048; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2 12049; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 12050; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 12051; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12052; GFX11-NEXT: buffer_gl1_inv 12053; GFX11-NEXT: buffer_gl0_inv 12054; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 12055; GFX11-NEXT: v_mov_b32_e32 v3, v2 12056; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 12057; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 12058; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12059; GFX11-NEXT: s_cbranch_execnz .LBB45_1 12060; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 12061; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 12062; GFX11-NEXT: s_setpc_b64 s[30:31] 12063; 12064; GFX10-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 12065; GFX10: ; %bb.0: 12066; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12067; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 12068; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 12069; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12070; GFX10-NEXT: v_and_b32_e32 v0, -4, v4 12071; GFX10-NEXT: v_and_b32_e32 v4, 3, v4 12072; GFX10-NEXT: s_mov_b32 s4, 0 12073; GFX10-NEXT: flat_load_dword v3, v[0:1] 12074; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12075; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff 12076; GFX10-NEXT: v_not_b32_e32 v5, v5 12077; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start 12078; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 12079; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12080; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12081; GFX10-NEXT: v_min_f32_e32 v2, v2, v6 12082; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 12083; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 12084; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 12085; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 12086; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo 12087; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12088; 
GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2 12089; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 12090; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12091; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12092; GFX10-NEXT: buffer_gl1_inv 12093; GFX10-NEXT: buffer_gl0_inv 12094; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 12095; GFX10-NEXT: v_mov_b32_e32 v3, v2 12096; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 12097; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 12098; GFX10-NEXT: s_cbranch_execnz .LBB45_1 12099; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 12100; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 12101; GFX10-NEXT: s_setpc_b64 s[30:31] 12102; 12103; GFX90A-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 12104; GFX90A: ; %bb.0: 12105; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12106; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 12107; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 12108; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4 12109; GFX90A-NEXT: flat_load_dword v3, v[0:1] 12110; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4 12111; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12112; GFX90A-NEXT: s_mov_b32 s4, 0xffff 12113; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4 12114; GFX90A-NEXT: v_not_b32_e32 v5, v5 12115; GFX90A-NEXT: s_mov_b64 s[4:5], 0 12116; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12117; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 12118; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start 12119; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 12120; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12121; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12122; GFX90A-NEXT: v_min_f32_e32 v2, v2, v6 12123; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 12124; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 12125; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6 12126; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 12127; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc 12128; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12129; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2 12130; GFX90A-NEXT: buffer_wbl2 12131; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12132; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12133; GFX90A-NEXT: buffer_invl2 12134; GFX90A-NEXT: buffer_wbinvl1 12135; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 12136; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12137; GFX90A-NEXT: v_mov_b32_e32 v3, v2 12138; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 12139; GFX90A-NEXT: s_cbranch_execnz .LBB45_1 12140; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 12141; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 12142; GFX90A-NEXT: s_setpc_b64 s[30:31] 12143; 12144; GFX908-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 12145; GFX908: ; %bb.0: 12146; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12147; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0 12148; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 12149; GFX908-NEXT: v_and_b32_e32 v0, -4, v4 12150; GFX908-NEXT: flat_load_dword v3, v[0:1] 12151; GFX908-NEXT: v_and_b32_e32 v4, 3, v4 12152; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12153; GFX908-NEXT: s_mov_b32 s4, 0xffff 12154; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4 12155; GFX908-NEXT: v_not_b32_e32 v5, v5 12156; GFX908-NEXT: s_mov_b64 s[4:5], 0 12157; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12158; GFX908-NEXT: s_movk_i32 s6, 0x7fff 12159; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start 12160; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 12161; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12162; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12163; GFX908-NEXT: v_min_f32_e32 v2, v2, v6 12164; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 12165; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 12166; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6 12167; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 12168; GFX908-NEXT: v_cndmask_b32_e32 v2, 
v7, v8, vcc 12169; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12170; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2 12171; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12172; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12173; GFX908-NEXT: buffer_wbinvl1 12174; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 12175; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12176; GFX908-NEXT: v_mov_b32_e32 v3, v2 12177; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 12178; GFX908-NEXT: s_cbranch_execnz .LBB45_1 12179; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 12180; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 12181; GFX908-NEXT: s_setpc_b64 s[30:31] 12182; 12183; GFX8-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 12184; GFX8: ; %bb.0: 12185; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12186; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0 12187; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 12188; GFX8-NEXT: v_and_b32_e32 v0, -4, v4 12189; GFX8-NEXT: flat_load_dword v3, v[0:1] 12190; GFX8-NEXT: v_and_b32_e32 v4, 3, v4 12191; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12192; GFX8-NEXT: s_mov_b32 s4, 0xffff 12193; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4 12194; GFX8-NEXT: v_not_b32_e32 v5, v5 12195; GFX8-NEXT: s_mov_b64 s[4:5], 0 12196; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2 12197; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start 12198; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 12199; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12200; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12201; GFX8-NEXT: v_min_f32_e32 v2, v2, v6 12202; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1 12203; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2 12204; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 12205; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2 12206; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 12207; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc 12208; GFX8-NEXT: 
v_and_b32_e32 v7, v3, v5 12209; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 12210; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 12211; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12212; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12213; GFX8-NEXT: buffer_wbinvl1 12214; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 12215; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12216; GFX8-NEXT: v_mov_b32_e32 v3, v2 12217; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 12218; GFX8-NEXT: s_cbranch_execnz .LBB45_1 12219; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 12220; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 12221; GFX8-NEXT: s_setpc_b64 s[30:31] 12222; 12223; GFX7-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 12224; GFX7: ; %bb.0: 12225; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12226; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0 12227; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 12228; GFX7-NEXT: v_and_b32_e32 v0, -4, v4 12229; GFX7-NEXT: flat_load_dword v3, v[0:1] 12230; GFX7-NEXT: v_and_b32_e32 v4, 3, v4 12231; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4 12232; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4 12233; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 12234; GFX7-NEXT: v_not_b32_e32 v5, v5 12235; GFX7-NEXT: s_mov_b64 s[4:5], 0 12236; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 12237; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start 12238; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 12239; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12240; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3 12241; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 12242; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 12243; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 12244; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 12245; GFX7-NEXT: v_and_b32_e32 v7, v3, v5 12246; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2 12247; GFX7-NEXT: v_or_b32_e32 v2, v7, v2 12248; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 12249; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12250; 
GFX7-NEXT: buffer_wbinvl1 12251; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 12252; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12253; GFX7-NEXT: v_mov_b32_e32 v3, v2 12254; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 12255; GFX7-NEXT: s_cbranch_execnz .LBB45_1 12256; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 12257; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 12258; GFX7-NEXT: s_setpc_b64 s[30:31] 12259 %gep = getelementptr bfloat, ptr %ptr, i64 1023 12260 %unused = atomicrmw fmin ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 12261 ret void 12262} 12263 12264; -------------------------------------------------------------------- 12265; <2 x half> 12266; -------------------------------------------------------------------- 12267 12268define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 { 12269; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: 12270; GFX12: ; %bb.0: 12271; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12272; GFX12-NEXT: s_wait_expcnt 0x0 12273; GFX12-NEXT: s_wait_samplecnt 0x0 12274; GFX12-NEXT: s_wait_bvhcnt 0x0 12275; GFX12-NEXT: s_wait_kmcnt 0x0 12276; GFX12-NEXT: flat_load_b32 v3, v[0:1] 12277; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 12278; GFX12-NEXT: s_mov_b32 s0, 0 12279; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start 12280; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 12281; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12282; GFX12-NEXT: v_mov_b32_e32 v4, v3 12283; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12284; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 12285; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 12286; GFX12-NEXT: s_wait_storecnt 0x0 12287; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 12288; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12289; GFX12-NEXT: global_inv scope:SCOPE_DEV 12290; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12291; GFX12-NEXT: s_wait_alu 0xfffe 12292; GFX12-NEXT: s_or_b32 
s0, vcc_lo, s0 12293; GFX12-NEXT: s_wait_alu 0xfffe 12294; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12295; GFX12-NEXT: s_cbranch_execnz .LBB46_1 12296; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 12297; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 12298; GFX12-NEXT: v_mov_b32_e32 v0, v3 12299; GFX12-NEXT: s_wait_alu 0xfffe 12300; GFX12-NEXT: s_setpc_b64 s[30:31] 12301; 12302; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: 12303; GFX940: ; %bb.0: 12304; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12305; GFX940-NEXT: flat_load_dword v3, v[0:1] 12306; GFX940-NEXT: s_mov_b64 s[0:1], 0 12307; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 12308; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start 12309; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 12310; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12311; GFX940-NEXT: v_mov_b32_e32 v5, v3 12312; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 12313; GFX940-NEXT: s_nop 0 12314; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 12315; GFX940-NEXT: buffer_wbl2 sc1 12316; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 12317; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12318; GFX940-NEXT: buffer_inv sc1 12319; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 12320; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 12321; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 12322; GFX940-NEXT: s_cbranch_execnz .LBB46_1 12323; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 12324; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 12325; GFX940-NEXT: v_mov_b32_e32 v0, v3 12326; GFX940-NEXT: s_setpc_b64 s[30:31] 12327; 12328; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: 12329; GFX11: ; %bb.0: 12330; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12331; GFX11-NEXT: flat_load_b32 v3, v[0:1] 12332; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 12333; GFX11-NEXT: s_mov_b32 s0, 0 12334; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start 12335; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 12336; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12337; 
GFX11-NEXT: v_mov_b32_e32 v4, v3 12338; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12339; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 12340; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 12341; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 12342; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc 12343; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12344; GFX11-NEXT: buffer_gl1_inv 12345; GFX11-NEXT: buffer_gl0_inv 12346; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12347; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 12348; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 12349; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12350; GFX11-NEXT: s_cbranch_execnz .LBB46_1 12351; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 12352; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 12353; GFX11-NEXT: v_mov_b32_e32 v0, v3 12354; GFX11-NEXT: s_setpc_b64 s[30:31] 12355; 12356; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: 12357; GFX10: ; %bb.0: 12358; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12359; GFX10-NEXT: flat_load_dword v3, v[0:1] 12360; GFX10-NEXT: v_pk_max_f16 v2, v2, v2 12361; GFX10-NEXT: s_mov_b32 s4, 0 12362; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start 12363; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 12364; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12365; GFX10-NEXT: v_mov_b32_e32 v4, v3 12366; GFX10-NEXT: v_pk_max_f16 v3, v4, v4 12367; GFX10-NEXT: v_pk_min_f16 v3, v3, v2 12368; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 12369; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 12370; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12371; GFX10-NEXT: buffer_gl1_inv 12372; GFX10-NEXT: buffer_gl0_inv 12373; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12374; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 12375; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 12376; GFX10-NEXT: s_cbranch_execnz .LBB46_1 12377; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 12378; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 12379; GFX10-NEXT: v_mov_b32_e32 v0, v3 12380; 
GFX10-NEXT: s_setpc_b64 s[30:31] 12381; 12382; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: 12383; GFX90A: ; %bb.0: 12384; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12385; GFX90A-NEXT: flat_load_dword v3, v[0:1] 12386; GFX90A-NEXT: s_mov_b64 s[4:5], 0 12387; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 12388; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start 12389; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 12390; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12391; GFX90A-NEXT: v_mov_b32_e32 v5, v3 12392; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 12393; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 12394; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 12395; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12396; GFX90A-NEXT: buffer_wbinvl1 12397; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 12398; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12399; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 12400; GFX90A-NEXT: s_cbranch_execnz .LBB46_1 12401; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 12402; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 12403; GFX90A-NEXT: v_mov_b32_e32 v0, v3 12404; GFX90A-NEXT: s_setpc_b64 s[30:31] 12405; 12406; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: 12407; GFX908: ; %bb.0: 12408; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12409; GFX908-NEXT: flat_load_dword v3, v[0:1] 12410; GFX908-NEXT: s_mov_b64 s[4:5], 0 12411; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 12412; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start 12413; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 12414; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12415; GFX908-NEXT: v_mov_b32_e32 v4, v3 12416; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 12417; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 12418; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 12419; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12420; GFX908-NEXT: buffer_wbinvl1 12421; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 12422; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12423; 
GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 12424; GFX908-NEXT: s_cbranch_execnz .LBB46_1 12425; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 12426; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 12427; GFX908-NEXT: v_mov_b32_e32 v0, v3 12428; GFX908-NEXT: s_setpc_b64 s[30:31] 12429; 12430; GFX8-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: 12431; GFX8: ; %bb.0: 12432; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12433; GFX8-NEXT: flat_load_dword v3, v[0:1] 12434; GFX8-NEXT: s_mov_b64 s[4:5], 0 12435; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 12436; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 12437; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start 12438; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 12439; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12440; GFX8-NEXT: v_mov_b32_e32 v6, v3 12441; GFX8-NEXT: v_max_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 12442; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 12443; GFX8-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12444; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 12445; GFX8-NEXT: v_or_b32_e32 v5, v5, v3 12446; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc 12447; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12448; GFX8-NEXT: buffer_wbinvl1 12449; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 12450; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12451; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 12452; GFX8-NEXT: s_cbranch_execnz .LBB46_1 12453; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 12454; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 12455; GFX8-NEXT: v_mov_b32_e32 v0, v3 12456; GFX8-NEXT: s_setpc_b64 s[30:31] 12457; 12458; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2f16__amdgpu_no_fine_grained_memory: 12459; GFX7: ; %bb.0: 12460; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12461; GFX7-NEXT: flat_load_dword v5, v[0:1] 12462; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 12463; GFX7-NEXT: 
v_cvt_f16_f32_e32 v6, v2 12464; GFX7-NEXT: s_mov_b64 s[4:5], 0 12465; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3 12466; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12467; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 12468; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5 12469; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 12470; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6 12471; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start 12472; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 12473; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 12474; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 12475; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3 12476; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2 12477; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 12478; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 12479; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 12480; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 12481; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 12482; GFX7-NEXT: v_or_b32_e32 v7, v2, v3 12483; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6 12484; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 12485; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc 12486; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12487; GFX7-NEXT: buffer_wbinvl1 12488; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6 12489; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6 12490; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 12491; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 12492; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12493; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 12494; GFX7-NEXT: s_cbranch_execnz .LBB46_1 12495; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 12496; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 12497; GFX7-NEXT: v_mov_b32_e32 v0, v2 12498; GFX7-NEXT: v_mov_b32_e32 v1, v3 12499; GFX7-NEXT: s_setpc_b64 s[30:31] 12500 %result = atomicrmw fmin ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 12501 ret <2 x half> %result 12502} 12503 12504define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 { 12505; GFX12-LABEL: 
flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 12506; GFX12: ; %bb.0: 12507; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12508; GFX12-NEXT: s_wait_expcnt 0x0 12509; GFX12-NEXT: s_wait_samplecnt 0x0 12510; GFX12-NEXT: s_wait_bvhcnt 0x0 12511; GFX12-NEXT: s_wait_kmcnt 0x0 12512; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 12513; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 12514; GFX12-NEXT: s_mov_b32 s0, 0 12515; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start 12516; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 12517; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12518; GFX12-NEXT: v_mov_b32_e32 v4, v3 12519; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12520; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 12521; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 12522; GFX12-NEXT: s_wait_storecnt 0x0 12523; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 12524; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12525; GFX12-NEXT: global_inv scope:SCOPE_DEV 12526; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12527; GFX12-NEXT: s_wait_alu 0xfffe 12528; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 12529; GFX12-NEXT: s_wait_alu 0xfffe 12530; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12531; GFX12-NEXT: s_cbranch_execnz .LBB47_1 12532; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 12533; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 12534; GFX12-NEXT: v_mov_b32_e32 v0, v3 12535; GFX12-NEXT: s_wait_alu 0xfffe 12536; GFX12-NEXT: s_setpc_b64 s[30:31] 12537; 12538; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 12539; GFX940: ; %bb.0: 12540; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12541; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 12542; GFX940-NEXT: s_mov_b64 s[0:1], 0 12543; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 12544; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start 12545; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 12546; GFX940-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 12547; GFX940-NEXT: v_mov_b32_e32 v5, v3 12548; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 12549; GFX940-NEXT: s_nop 0 12550; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 12551; GFX940-NEXT: buffer_wbl2 sc1 12552; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 12553; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12554; GFX940-NEXT: buffer_inv sc1 12555; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 12556; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 12557; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 12558; GFX940-NEXT: s_cbranch_execnz .LBB47_1 12559; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 12560; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 12561; GFX940-NEXT: v_mov_b32_e32 v0, v3 12562; GFX940-NEXT: s_setpc_b64 s[30:31] 12563; 12564; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 12565; GFX11: ; %bb.0: 12566; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12567; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 12568; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 12569; GFX11-NEXT: s_mov_b32 s0, 0 12570; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start 12571; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 12572; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12573; GFX11-NEXT: v_mov_b32_e32 v4, v3 12574; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12575; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 12576; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 12577; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 12578; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc 12579; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12580; GFX11-NEXT: buffer_gl1_inv 12581; GFX11-NEXT: buffer_gl0_inv 12582; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12583; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 12584; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 12585; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12586; GFX11-NEXT: s_cbranch_execnz .LBB47_1 12587; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 12588; GFX11-NEXT: s_or_b32 
exec_lo, exec_lo, s0 12589; GFX11-NEXT: v_mov_b32_e32 v0, v3 12590; GFX11-NEXT: s_setpc_b64 s[30:31] 12591; 12592; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 12593; GFX10: ; %bb.0: 12594; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12595; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 12596; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 12597; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 12598; GFX10-NEXT: s_mov_b32 s4, 0 12599; GFX10-NEXT: flat_load_dword v0, v[3:4] 12600; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start 12601; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 12602; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12603; GFX10-NEXT: v_mov_b32_e32 v6, v0 12604; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 12605; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 12606; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 12607; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 12608; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12609; GFX10-NEXT: buffer_gl1_inv 12610; GFX10-NEXT: buffer_gl0_inv 12611; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 12612; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 12613; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 12614; GFX10-NEXT: s_cbranch_execnz .LBB47_1 12615; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 12616; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 12617; GFX10-NEXT: s_setpc_b64 s[30:31] 12618; 12619; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 12620; GFX90A: ; %bb.0: 12621; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12622; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 12623; GFX90A-NEXT: s_mov_b64 s[4:5], 0 12624; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 12625; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start 12626; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 12627; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12628; GFX90A-NEXT: v_mov_b32_e32 v5, v3 12629; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 12630; GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 12631; GFX90A-NEXT: 
flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc 12632; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12633; GFX90A-NEXT: buffer_wbinvl1 12634; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 12635; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12636; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 12637; GFX90A-NEXT: s_cbranch_execnz .LBB47_1 12638; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 12639; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 12640; GFX90A-NEXT: v_mov_b32_e32 v0, v3 12641; GFX90A-NEXT: s_setpc_b64 s[30:31] 12642; 12643; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 12644; GFX908: ; %bb.0: 12645; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12646; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 12647; GFX908-NEXT: s_mov_b64 s[4:5], 0 12648; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 12649; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start 12650; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 12651; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12652; GFX908-NEXT: v_mov_b32_e32 v4, v3 12653; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 12654; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 12655; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 12656; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12657; GFX908-NEXT: buffer_wbinvl1 12658; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 12659; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12660; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 12661; GFX908-NEXT: s_cbranch_execnz .LBB47_1 12662; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 12663; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 12664; GFX908-NEXT: v_mov_b32_e32 v0, v3 12665; GFX908-NEXT: s_setpc_b64 s[30:31] 12666; 12667; GFX8-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 12668; GFX8: ; %bb.0: 12669; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12670; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 12671; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 12672; GFX8-NEXT: flat_load_dword v0, v[3:4] 
12673; GFX8-NEXT: s_mov_b64 s[4:5], 0 12674; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 12675; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 12676; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start 12677; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 12678; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12679; GFX8-NEXT: v_mov_b32_e32 v6, v0 12680; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 12681; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 12682; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12683; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 12684; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 12685; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 12686; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12687; GFX8-NEXT: buffer_wbinvl1 12688; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 12689; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12690; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 12691; GFX8-NEXT: s_cbranch_execnz .LBB47_1 12692; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 12693; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 12694; GFX8-NEXT: s_setpc_b64 s[30:31] 12695; 12696; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 12697; GFX7: ; %bb.0: 12698; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12699; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 12700; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 12701; GFX7-NEXT: flat_load_dword v1, v[4:5] 12702; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 12703; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 12704; GFX7-NEXT: s_mov_b64 s[4:5], 0 12705; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 12706; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 12707; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12708; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 12709; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 12710; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 12711; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start 12712; GFX7-NEXT: ; =>This Inner 
Loop Header: Depth=1 12713; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 12714; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 12715; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 12716; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 12717; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 12718; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 12719; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 12720; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 12721; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 12722; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 12723; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 12724; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 12725; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc 12726; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12727; GFX7-NEXT: buffer_wbinvl1 12728; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 12729; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 12730; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 12731; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 12732; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12733; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 12734; GFX7-NEXT: s_cbranch_execnz .LBB47_1 12735; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 12736; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 12737; GFX7-NEXT: s_setpc_b64 s[30:31] 12738 %gep = getelementptr <2 x half>, ptr %ptr, i64 511 12739 %result = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 12740 ret <2 x half> %result 12741} 12742 12743define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 { 12744; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 12745; GFX12: ; %bb.0: 12746; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12747; GFX12-NEXT: s_wait_expcnt 0x0 12748; GFX12-NEXT: s_wait_samplecnt 0x0 12749; GFX12-NEXT: s_wait_bvhcnt 0x0 12750; GFX12-NEXT: s_wait_kmcnt 0x0 12751; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 12752; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 12753; GFX12-NEXT: s_mov_b32 s0, 0 12754; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start 12755; 
GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 12756; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12757; GFX12-NEXT: v_mov_b32_e32 v4, v3 12758; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12759; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 12760; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 12761; GFX12-NEXT: s_wait_storecnt 0x0 12762; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 12763; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 12764; GFX12-NEXT: global_inv scope:SCOPE_DEV 12765; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 12766; GFX12-NEXT: s_wait_alu 0xfffe 12767; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 12768; GFX12-NEXT: s_wait_alu 0xfffe 12769; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 12770; GFX12-NEXT: s_cbranch_execnz .LBB48_1 12771; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 12772; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 12773; GFX12-NEXT: v_mov_b32_e32 v0, v3 12774; GFX12-NEXT: s_wait_alu 0xfffe 12775; GFX12-NEXT: s_setpc_b64 s[30:31] 12776; 12777; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 12778; GFX940: ; %bb.0: 12779; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12780; GFX940-NEXT: v_mov_b32_e32 v4, v0 12781; GFX940-NEXT: v_mov_b32_e32 v5, v1 12782; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 12783; GFX940-NEXT: s_movk_i32 s0, 0xf800 12784; GFX940-NEXT: s_nop 0 12785; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc 12786; GFX940-NEXT: flat_load_dword v0, v[0:1] 12787; GFX940-NEXT: s_mov_b32 s1, -1 12788; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] 12789; GFX940-NEXT: s_mov_b64 s[0:1], 0 12790; GFX940-NEXT: v_pk_max_f16 v1, v2, v2 12791; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start 12792; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 12793; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12794; GFX940-NEXT: v_mov_b32_e32 v3, v0 12795; GFX940-NEXT: v_pk_max_f16 v0, v3, v3 12796; GFX940-NEXT: s_nop 0 
12797; GFX940-NEXT: v_pk_min_f16 v2, v0, v1 12798; GFX940-NEXT: buffer_wbl2 sc1 12799; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] sc0 12800; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12801; GFX940-NEXT: buffer_inv sc1 12802; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 12803; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 12804; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 12805; GFX940-NEXT: s_cbranch_execnz .LBB48_1 12806; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 12807; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 12808; GFX940-NEXT: s_setpc_b64 s[30:31] 12809; 12810; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 12811; GFX11: ; %bb.0: 12812; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12813; GFX11-NEXT: v_mov_b32_e32 v3, v0 12814; GFX11-NEXT: s_mov_b32 s0, 0 12815; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 12816; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 12817; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo 12818; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 12819; GFX11-NEXT: flat_load_b32 v0, v[4:5] 12820; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 12821; GFX11-NEXT: v_pk_max_f16 v1, v2, v2 12822; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start 12823; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 12824; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12825; GFX11-NEXT: v_mov_b32_e32 v6, v0 12826; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 12827; GFX11-NEXT: v_pk_max_f16 v0, v6, v6 12828; GFX11-NEXT: v_pk_min_f16 v5, v0, v1 12829; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 12830; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc 12831; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12832; GFX11-NEXT: buffer_gl1_inv 12833; GFX11-NEXT: buffer_gl0_inv 12834; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 12835; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 12836; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 12837; GFX11-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s0 12838; GFX11-NEXT: s_cbranch_execnz .LBB48_1 12839; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 12840; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 12841; GFX11-NEXT: s_setpc_b64 s[30:31] 12842; 12843; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 12844; GFX10: ; %bb.0: 12845; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12846; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 12847; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 12848; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 12849; GFX10-NEXT: s_mov_b32 s4, 0 12850; GFX10-NEXT: flat_load_dword v0, v[3:4] 12851; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start 12852; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 12853; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12854; GFX10-NEXT: v_mov_b32_e32 v6, v0 12855; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 12856; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 12857; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 12858; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 12859; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12860; GFX10-NEXT: buffer_gl1_inv 12861; GFX10-NEXT: buffer_gl0_inv 12862; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 12863; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 12864; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 12865; GFX10-NEXT: s_cbranch_execnz .LBB48_1 12866; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 12867; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 12868; GFX10-NEXT: s_setpc_b64 s[30:31] 12869; 12870; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 12871; GFX90A: ; %bb.0: 12872; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12873; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 12874; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 12875; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 12876; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 12877; GFX90A-NEXT: flat_load_dword v0, v[0:1] 12878; GFX90A-NEXT: s_mov_b64 s[4:5], 0 12879; GFX90A-NEXT: 
v_pk_max_f16 v1, v2, v2 12880; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start 12881; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 12882; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12883; GFX90A-NEXT: v_mov_b32_e32 v3, v0 12884; GFX90A-NEXT: v_pk_max_f16 v0, v3, v3 12885; GFX90A-NEXT: v_pk_min_f16 v2, v0, v1 12886; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[2:3] glc 12887; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12888; GFX90A-NEXT: buffer_wbinvl1 12889; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 12890; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12891; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 12892; GFX90A-NEXT: s_cbranch_execnz .LBB48_1 12893; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 12894; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 12895; GFX90A-NEXT: s_setpc_b64 s[30:31] 12896; 12897; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 12898; GFX908: ; %bb.0: 12899; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12900; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 12901; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 12902; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 12903; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 12904; GFX908-NEXT: flat_load_dword v0, v[0:1] 12905; GFX908-NEXT: s_mov_b64 s[4:5], 0 12906; GFX908-NEXT: v_pk_max_f16 v1, v2, v2 12907; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start 12908; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 12909; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12910; GFX908-NEXT: v_mov_b32_e32 v6, v0 12911; GFX908-NEXT: v_pk_max_f16 v0, v6, v6 12912; GFX908-NEXT: v_pk_min_f16 v5, v0, v1 12913; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 12914; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12915; GFX908-NEXT: buffer_wbinvl1 12916; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 12917; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12918; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 12919; GFX908-NEXT: s_cbranch_execnz .LBB48_1 12920; GFX908-NEXT: ; 
%bb.2: ; %atomicrmw.end 12921; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 12922; GFX908-NEXT: s_setpc_b64 s[30:31] 12923; 12924; GFX8-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 12925; GFX8: ; %bb.0: 12926; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12927; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 12928; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc 12929; GFX8-NEXT: flat_load_dword v0, v[3:4] 12930; GFX8-NEXT: s_mov_b64 s[4:5], 0 12931; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 12932; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 12933; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start 12934; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 12935; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12936; GFX8-NEXT: v_mov_b32_e32 v6, v0 12937; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 12938; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 12939; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 12940; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 12941; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 12942; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 12943; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12944; GFX8-NEXT: buffer_wbinvl1 12945; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 12946; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12947; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 12948; GFX8-NEXT: s_cbranch_execnz .LBB48_1 12949; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 12950; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 12951; GFX8-NEXT: s_setpc_b64 s[30:31] 12952; 12953; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 12954; GFX7: ; %bb.0: 12955; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12956; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 12957; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc 12958; GFX7-NEXT: flat_load_dword v1, v[4:5] 12959; 
GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 12960; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 12961; GFX7-NEXT: s_mov_b64 s[4:5], 0 12962; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 12963; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 12964; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12965; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 12966; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 12967; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 12968; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start 12969; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 12970; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 12971; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 12972; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 12973; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 12974; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 12975; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 12976; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 12977; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 12978; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 12979; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 12980; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 12981; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 12982; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc 12983; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 12984; GFX7-NEXT: buffer_wbinvl1 12985; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 12986; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 12987; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 12988; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 12989; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 12990; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 12991; GFX7-NEXT: s_cbranch_execnz .LBB48_1 12992; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 12993; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 12994; GFX7-NEXT: s_setpc_b64 s[30:31] 12995 %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 12996 %result = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 12997 ret <2 x half> %result 12998} 12999 13000define void @flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 { 13001; GFX12-LABEL: 
flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: 13002; GFX12: ; %bb.0: 13003; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13004; GFX12-NEXT: s_wait_expcnt 0x0 13005; GFX12-NEXT: s_wait_samplecnt 0x0 13006; GFX12-NEXT: s_wait_bvhcnt 0x0 13007; GFX12-NEXT: s_wait_kmcnt 0x0 13008; GFX12-NEXT: flat_load_b32 v3, v[0:1] 13009; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 13010; GFX12-NEXT: s_mov_b32 s0, 0 13011; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start 13012; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 13013; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13014; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 13015; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 13016; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 13017; GFX12-NEXT: s_wait_storecnt 0x0 13018; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 13019; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13020; GFX12-NEXT: global_inv scope:SCOPE_DEV 13021; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 13022; GFX12-NEXT: v_mov_b32_e32 v3, v2 13023; GFX12-NEXT: s_wait_alu 0xfffe 13024; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 13025; GFX12-NEXT: s_wait_alu 0xfffe 13026; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 13027; GFX12-NEXT: s_cbranch_execnz .LBB49_1 13028; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 13029; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 13030; GFX12-NEXT: s_wait_alu 0xfffe 13031; GFX12-NEXT: s_setpc_b64 s[30:31] 13032; 13033; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: 13034; GFX940: ; %bb.0: 13035; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13036; GFX940-NEXT: flat_load_dword v3, v[0:1] 13037; GFX940-NEXT: s_mov_b64 s[0:1], 0 13038; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 13039; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start 13040; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 13041; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13042; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 13043; GFX940-NEXT: s_nop 0 13044; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 13045; 
GFX940-NEXT: buffer_wbl2 sc1 13046; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 13047; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13048; GFX940-NEXT: buffer_inv sc1 13049; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 13050; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 13051; GFX940-NEXT: v_mov_b32_e32 v3, v2 13052; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 13053; GFX940-NEXT: s_cbranch_execnz .LBB49_1 13054; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 13055; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 13056; GFX940-NEXT: s_setpc_b64 s[30:31] 13057; 13058; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: 13059; GFX11: ; %bb.0: 13060; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13061; GFX11-NEXT: flat_load_b32 v3, v[0:1] 13062; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 13063; GFX11-NEXT: s_mov_b32 s0, 0 13064; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start 13065; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 13066; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13067; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 13068; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 13069; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 13070; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 13071; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 13072; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13073; GFX11-NEXT: buffer_gl1_inv 13074; GFX11-NEXT: buffer_gl0_inv 13075; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 13076; GFX11-NEXT: v_mov_b32_e32 v3, v2 13077; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 13078; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 13079; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 13080; GFX11-NEXT: s_cbranch_execnz .LBB49_1 13081; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 13082; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 13083; GFX11-NEXT: s_setpc_b64 s[30:31] 13084; 13085; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: 13086; GFX10: ; %bb.0: 13087; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13088; GFX10-NEXT: flat_load_dword v3, 
v[0:1] 13089; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 13090; GFX10-NEXT: s_mov_b32 s4, 0 13091; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start 13092; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 13093; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13094; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 13095; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 13096; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 13097; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 13098; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13099; GFX10-NEXT: buffer_gl1_inv 13100; GFX10-NEXT: buffer_gl0_inv 13101; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 13102; GFX10-NEXT: v_mov_b32_e32 v3, v2 13103; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 13104; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 13105; GFX10-NEXT: s_cbranch_execnz .LBB49_1 13106; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 13107; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 13108; GFX10-NEXT: s_setpc_b64 s[30:31] 13109; 13110; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: 13111; GFX90A: ; %bb.0: 13112; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13113; GFX90A-NEXT: flat_load_dword v3, v[0:1] 13114; GFX90A-NEXT: s_mov_b64 s[4:5], 0 13115; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 13116; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start 13117; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 13118; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13119; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 13120; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 13121; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 13122; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13123; GFX90A-NEXT: buffer_wbinvl1 13124; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 13125; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13126; GFX90A-NEXT: v_mov_b32_e32 v3, v2 13127; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 13128; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 13129; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 13130; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 13131; GFX90A-NEXT: s_setpc_b64 s[30:31] 13132; 13133; 
GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: 13134; GFX908: ; %bb.0: 13135; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13136; GFX908-NEXT: flat_load_dword v3, v[0:1] 13137; GFX908-NEXT: s_mov_b64 s[4:5], 0 13138; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 13139; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start 13140; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 13141; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13142; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 13143; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 13144; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 13145; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13146; GFX908-NEXT: buffer_wbinvl1 13147; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 13148; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13149; GFX908-NEXT: v_mov_b32_e32 v3, v2 13150; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 13151; GFX908-NEXT: s_cbranch_execnz .LBB49_1 13152; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 13153; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 13154; GFX908-NEXT: s_setpc_b64 s[30:31] 13155; 13156; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: 13157; GFX8: ; %bb.0: 13158; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13159; GFX8-NEXT: flat_load_dword v3, v[0:1] 13160; GFX8-NEXT: s_mov_b64 s[4:5], 0 13161; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 13162; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 13163; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start 13164; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 13165; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13166; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 13167; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 13168; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 13169; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 13170; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 13171; GFX8-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 13172; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13173; GFX8-NEXT: buffer_wbinvl1 13174; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 13175; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13176; GFX8-NEXT: v_mov_b32_e32 v3, v2 13177; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 13178; GFX8-NEXT: s_cbranch_execnz .LBB49_1 13179; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 13180; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 13181; GFX8-NEXT: s_setpc_b64 s[30:31] 13182; 13183; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2f16__amdgpu_no_fine_grained_memory: 13184; GFX7: ; %bb.0: 13185; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13186; GFX7-NEXT: flat_load_dword v5, v[0:1] 13187; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 13188; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 13189; GFX7-NEXT: s_mov_b64 s[4:5], 0 13190; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 13191; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13192; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 13193; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 13194; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 13195; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 13196; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start 13197; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 13198; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 13199; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 13200; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 13201; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 13202; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 13203; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 13204; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 13205; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 13206; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 13207; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 13208; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 13209; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 13210; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc 13211; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13212; GFX7-NEXT: buffer_wbinvl1 13213; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 13214; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 13215; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 13216; 
GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 13217; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13218; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 13219; GFX7-NEXT: s_cbranch_execnz .LBB49_1 13220; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 13221; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 13222; GFX7-NEXT: s_setpc_b64 s[30:31] 13223 %unused = atomicrmw fmin ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 13224 ret void 13225} 13226 13227define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 { 13228; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13229; GFX12: ; %bb.0: 13230; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13231; GFX12-NEXT: s_wait_expcnt 0x0 13232; GFX12-NEXT: s_wait_samplecnt 0x0 13233; GFX12-NEXT: s_wait_bvhcnt 0x0 13234; GFX12-NEXT: s_wait_kmcnt 0x0 13235; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 13236; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 13237; GFX12-NEXT: s_mov_b32 s0, 0 13238; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start 13239; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 13240; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13241; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 13242; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 13243; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 13244; GFX12-NEXT: s_wait_storecnt 0x0 13245; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 13246; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13247; GFX12-NEXT: global_inv scope:SCOPE_DEV 13248; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 13249; GFX12-NEXT: v_mov_b32_e32 v3, v2 13250; GFX12-NEXT: s_wait_alu 0xfffe 13251; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 13252; GFX12-NEXT: s_wait_alu 0xfffe 13253; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 13254; GFX12-NEXT: s_cbranch_execnz .LBB50_1 13255; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 13256; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 13257; GFX12-NEXT: 
s_wait_alu 0xfffe 13258; GFX12-NEXT: s_setpc_b64 s[30:31] 13259; 13260; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13261; GFX940: ; %bb.0: 13262; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13263; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 13264; GFX940-NEXT: s_mov_b64 s[0:1], 0 13265; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 13266; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start 13267; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 13268; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13269; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 13270; GFX940-NEXT: s_nop 0 13271; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 13272; GFX940-NEXT: buffer_wbl2 sc1 13273; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 13274; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13275; GFX940-NEXT: buffer_inv sc1 13276; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 13277; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 13278; GFX940-NEXT: v_mov_b32_e32 v3, v2 13279; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 13280; GFX940-NEXT: s_cbranch_execnz .LBB50_1 13281; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 13282; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 13283; GFX940-NEXT: s_setpc_b64 s[30:31] 13284; 13285; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13286; GFX11: ; %bb.0: 13287; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13288; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 13289; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 13290; GFX11-NEXT: s_mov_b32 s0, 0 13291; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start 13292; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 13293; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13294; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 13295; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 13296; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 13297; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 13298; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc 13299; GFX11-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) 13300; GFX11-NEXT: buffer_gl1_inv 13301; GFX11-NEXT: buffer_gl0_inv 13302; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 13303; GFX11-NEXT: v_mov_b32_e32 v3, v2 13304; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 13305; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 13306; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 13307; GFX11-NEXT: s_cbranch_execnz .LBB50_1 13308; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 13309; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 13310; GFX11-NEXT: s_setpc_b64 s[30:31] 13311; 13312; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13313; GFX10: ; %bb.0: 13314; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13315; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 13316; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 13317; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 13318; GFX10-NEXT: s_mov_b32 s4, 0 13319; GFX10-NEXT: flat_load_dword v3, v[0:1] 13320; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start 13321; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 13322; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13323; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 13324; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 13325; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 13326; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 13327; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13328; GFX10-NEXT: buffer_gl1_inv 13329; GFX10-NEXT: buffer_gl0_inv 13330; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 13331; GFX10-NEXT: v_mov_b32_e32 v3, v2 13332; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 13333; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 13334; GFX10-NEXT: s_cbranch_execnz .LBB50_1 13335; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 13336; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 13337; GFX10-NEXT: s_setpc_b64 s[30:31] 13338; 13339; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13340; GFX90A: ; %bb.0: 13341; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13342; GFX90A-NEXT: flat_load_dword 
v3, v[0:1] offset:2044 13343; GFX90A-NEXT: s_mov_b64 s[4:5], 0 13344; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 13345; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start 13346; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 13347; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13348; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 13349; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 13350; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 13351; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13352; GFX90A-NEXT: buffer_wbinvl1 13353; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 13354; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13355; GFX90A-NEXT: v_mov_b32_e32 v3, v2 13356; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 13357; GFX90A-NEXT: s_cbranch_execnz .LBB50_1 13358; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 13359; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 13360; GFX90A-NEXT: s_setpc_b64 s[30:31] 13361; 13362; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13363; GFX908: ; %bb.0: 13364; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13365; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 13366; GFX908-NEXT: s_mov_b64 s[4:5], 0 13367; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 13368; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start 13369; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 13370; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13371; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 13372; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 13373; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 13374; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13375; GFX908-NEXT: buffer_wbinvl1 13376; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 13377; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13378; GFX908-NEXT: v_mov_b32_e32 v3, v2 13379; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 13380; GFX908-NEXT: s_cbranch_execnz .LBB50_1 13381; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 13382; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 13383; GFX908-NEXT: s_setpc_b64 s[30:31] 13384; 
13385; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13386; GFX8: ; %bb.0: 13387; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13388; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 13389; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 13390; GFX8-NEXT: flat_load_dword v3, v[0:1] 13391; GFX8-NEXT: s_mov_b64 s[4:5], 0 13392; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 13393; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 13394; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start 13395; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 13396; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13397; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 13398; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 13399; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 13400; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 13401; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 13402; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 13403; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13404; GFX8-NEXT: buffer_wbinvl1 13405; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 13406; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13407; GFX8-NEXT: v_mov_b32_e32 v3, v2 13408; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 13409; GFX8-NEXT: s_cbranch_execnz .LBB50_1 13410; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 13411; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 13412; GFX8-NEXT: s_setpc_b64 s[30:31] 13413; 13414; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13415; GFX7: ; %bb.0: 13416; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13417; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 13418; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 13419; GFX7-NEXT: flat_load_dword v5, v[0:1] 13420; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 13421; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 13422; GFX7-NEXT: s_mov_b64 s[4:5], 0 13423; GFX7-NEXT: 
v_cvt_f32_f16_e32 v2, v3 13424; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13425; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 13426; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 13427; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 13428; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 13429; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start 13430; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 13431; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 13432; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 13433; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 13434; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 13435; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 13436; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 13437; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 13438; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 13439; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 13440; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 13441; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 13442; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 13443; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc 13444; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13445; GFX7-NEXT: buffer_wbinvl1 13446; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 13447; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 13448; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 13449; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 13450; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13451; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 13452; GFX7-NEXT: s_cbranch_execnz .LBB50_1 13453; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 13454; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 13455; GFX7-NEXT: s_setpc_b64 s[30:31] 13456 %gep = getelementptr <2 x half>, ptr %ptr, i64 511 13457 %unused = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 13458 ret void 13459} 13460 13461define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 { 13462; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 13463; GFX12: ; %bb.0: 13464; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13465; GFX12-NEXT: s_wait_expcnt 
0x0 13466; GFX12-NEXT: s_wait_samplecnt 0x0 13467; GFX12-NEXT: s_wait_bvhcnt 0x0 13468; GFX12-NEXT: s_wait_kmcnt 0x0 13469; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 13470; GFX12-NEXT: v_pk_max_num_f16 v4, v2, v2 13471; GFX12-NEXT: s_mov_b32 s0, 0 13472; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start 13473; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 13474; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13475; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 13476; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 13477; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 13478; GFX12-NEXT: s_wait_storecnt 0x0 13479; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 13480; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13481; GFX12-NEXT: global_inv scope:SCOPE_DEV 13482; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 13483; GFX12-NEXT: v_mov_b32_e32 v3, v2 13484; GFX12-NEXT: s_wait_alu 0xfffe 13485; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 13486; GFX12-NEXT: s_wait_alu 0xfffe 13487; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 13488; GFX12-NEXT: s_cbranch_execnz .LBB51_1 13489; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 13490; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 13491; GFX12-NEXT: s_wait_alu 0xfffe 13492; GFX12-NEXT: s_setpc_b64 s[30:31] 13493; 13494; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 13495; GFX940: ; %bb.0: 13496; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13497; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 13498; GFX940-NEXT: s_movk_i32 s0, 0xf800 13499; GFX940-NEXT: s_nop 0 13500; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 13501; GFX940-NEXT: flat_load_dword v3, v[4:5] 13502; GFX940-NEXT: s_mov_b32 s1, -1 13503; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] 13504; GFX940-NEXT: s_mov_b64 s[0:1], 0 13505; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 13506; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start 13507; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 13508; 
GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13509; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 13510; GFX940-NEXT: s_nop 0 13511; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 13512; GFX940-NEXT: buffer_wbl2 sc1 13513; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 13514; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13515; GFX940-NEXT: buffer_inv sc1 13516; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 13517; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 13518; GFX940-NEXT: v_mov_b32_e32 v3, v2 13519; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 13520; GFX940-NEXT: s_cbranch_execnz .LBB51_1 13521; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 13522; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 13523; GFX940-NEXT: s_setpc_b64 s[30:31] 13524; 13525; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 13526; GFX11: ; %bb.0: 13527; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13528; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 13529; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 13530; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 13531; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 13532; GFX11-NEXT: flat_load_b32 v3, v[3:4] 13533; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 13534; GFX11-NEXT: s_mov_b32 s0, 0 13535; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start 13536; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 13537; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13538; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 13539; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 13540; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 13541; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 13542; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 13543; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13544; GFX11-NEXT: buffer_gl1_inv 13545; GFX11-NEXT: buffer_gl0_inv 13546; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 13547; GFX11-NEXT: v_mov_b32_e32 v3, v2 13548; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 13549; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 13550; GFX11-NEXT: 
s_and_not1_b32 exec_lo, exec_lo, s0 13551; GFX11-NEXT: s_cbranch_execnz .LBB51_1 13552; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 13553; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 13554; GFX11-NEXT: s_setpc_b64 s[30:31] 13555; 13556; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 13557; GFX10: ; %bb.0: 13558; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13559; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 13560; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 13561; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 13562; GFX10-NEXT: s_mov_b32 s4, 0 13563; GFX10-NEXT: flat_load_dword v3, v[0:1] 13564; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start 13565; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 13566; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13567; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 13568; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 13569; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 13570; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 13571; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13572; GFX10-NEXT: buffer_gl1_inv 13573; GFX10-NEXT: buffer_gl0_inv 13574; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 13575; GFX10-NEXT: v_mov_b32_e32 v3, v2 13576; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 13577; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 13578; GFX10-NEXT: s_cbranch_execnz .LBB51_1 13579; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 13580; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 13581; GFX10-NEXT: s_setpc_b64 s[30:31] 13582; 13583; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 13584; GFX90A: ; %bb.0: 13585; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13586; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 13587; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 13588; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 13589; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 13590; GFX90A-NEXT: flat_load_dword v1, v[0:1] 13591; GFX90A-NEXT: s_mov_b64 
s[4:5], 0 13592; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 13593; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start 13594; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 13595; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13596; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 13597; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2 13598; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 13599; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13600; GFX90A-NEXT: buffer_wbinvl1 13601; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 13602; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13603; GFX90A-NEXT: v_mov_b32_e32 v1, v0 13604; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 13605; GFX90A-NEXT: s_cbranch_execnz .LBB51_1 13606; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 13607; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 13608; GFX90A-NEXT: s_setpc_b64 s[30:31] 13609; 13610; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 13611; GFX908: ; %bb.0: 13612; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13613; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 13614; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 13615; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 13616; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 13617; GFX908-NEXT: flat_load_dword v1, v[0:1] 13618; GFX908-NEXT: s_mov_b64 s[4:5], 0 13619; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 13620; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start 13621; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 13622; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13623; GFX908-NEXT: v_pk_max_f16 v0, v1, v1 13624; GFX908-NEXT: v_pk_min_f16 v0, v0, v2 13625; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 13626; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13627; GFX908-NEXT: buffer_wbinvl1 13628; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 13629; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13630; GFX908-NEXT: v_mov_b32_e32 v1, v0 13631; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 13632; GFX908-NEXT: s_cbranch_execnz 
.LBB51_1 13633; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 13634; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 13635; GFX908-NEXT: s_setpc_b64 s[30:31] 13636; 13637; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 13638; GFX8: ; %bb.0: 13639; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13640; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 13641; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 13642; GFX8-NEXT: flat_load_dword v3, v[0:1] 13643; GFX8-NEXT: s_mov_b64 s[4:5], 0 13644; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 13645; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 13646; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start 13647; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 13648; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13649; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 13650; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 13651; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 13652; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 13653; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 13654; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 13655; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13656; GFX8-NEXT: buffer_wbinvl1 13657; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 13658; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13659; GFX8-NEXT: v_mov_b32_e32 v3, v2 13660; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 13661; GFX8-NEXT: s_cbranch_execnz .LBB51_1 13662; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 13663; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 13664; GFX8-NEXT: s_setpc_b64 s[30:31] 13665; 13666; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: 13667; GFX7: ; %bb.0: 13668; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13669; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 13670; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 13671; GFX7-NEXT: 
flat_load_dword v5, v[0:1] 13672; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 13673; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 13674; GFX7-NEXT: s_mov_b64 s[4:5], 0 13675; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 13676; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13677; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 13678; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 13679; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 13680; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 13681; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start 13682; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 13683; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 13684; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 13685; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 13686; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 13687; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 13688; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 13689; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 13690; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 13691; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 13692; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 13693; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 13694; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 13695; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc 13696; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13697; GFX7-NEXT: buffer_wbinvl1 13698; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 13699; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 13700; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 13701; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 13702; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13703; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 13704; GFX7-NEXT: s_cbranch_execnz .LBB51_1 13705; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 13706; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 13707; GFX7-NEXT: s_setpc_b64 s[30:31] 13708 %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 13709 %unused = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 13710 ret void 13711} 13712 13713define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 { 13714; GFX12-LABEL: 
flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13715; GFX12: ; %bb.0: 13716; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13717; GFX12-NEXT: s_wait_expcnt 0x0 13718; GFX12-NEXT: s_wait_samplecnt 0x0 13719; GFX12-NEXT: s_wait_bvhcnt 0x0 13720; GFX12-NEXT: s_wait_kmcnt 0x0 13721; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 13722; GFX12-NEXT: v_pk_max_num_f16 v2, v2, v2 13723; GFX12-NEXT: s_mov_b32 s0, 0 13724; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start 13725; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 13726; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13727; GFX12-NEXT: v_mov_b32_e32 v4, v3 13728; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13729; GFX12-NEXT: v_pk_max_num_f16 v3, v4, v4 13730; GFX12-NEXT: v_pk_min_num_f16 v3, v3, v2 13731; GFX12-NEXT: global_wb scope:SCOPE_SYS 13732; GFX12-NEXT: s_wait_storecnt 0x0 13733; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 13734; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13735; GFX12-NEXT: global_inv scope:SCOPE_SYS 13736; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 13737; GFX12-NEXT: s_wait_alu 0xfffe 13738; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 13739; GFX12-NEXT: s_wait_alu 0xfffe 13740; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 13741; GFX12-NEXT: s_cbranch_execnz .LBB52_1 13742; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 13743; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 13744; GFX12-NEXT: v_mov_b32_e32 v0, v3 13745; GFX12-NEXT: s_wait_alu 0xfffe 13746; GFX12-NEXT: s_setpc_b64 s[30:31] 13747; 13748; GFX940-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13749; GFX940: ; %bb.0: 13750; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13751; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 13752; GFX940-NEXT: s_mov_b64 s[0:1], 0 13753; GFX940-NEXT: v_pk_max_f16 v2, v2, v2 13754; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start 13755; GFX940-NEXT: ; =>This Inner Loop 
Header: Depth=1 13756; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13757; GFX940-NEXT: v_mov_b32_e32 v5, v3 13758; GFX940-NEXT: v_pk_max_f16 v3, v5, v5 13759; GFX940-NEXT: s_nop 0 13760; GFX940-NEXT: v_pk_min_f16 v4, v3, v2 13761; GFX940-NEXT: buffer_wbl2 sc0 sc1 13762; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 sc0 sc1 13763; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13764; GFX940-NEXT: buffer_inv sc0 sc1 13765; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 13766; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 13767; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 13768; GFX940-NEXT: s_cbranch_execnz .LBB52_1 13769; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 13770; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 13771; GFX940-NEXT: v_mov_b32_e32 v0, v3 13772; GFX940-NEXT: s_setpc_b64 s[30:31] 13773; 13774; GFX11-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13775; GFX11: ; %bb.0: 13776; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13777; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 13778; GFX11-NEXT: v_pk_max_f16 v2, v2, v2 13779; GFX11-NEXT: s_mov_b32 s0, 0 13780; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start 13781; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 13782; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13783; GFX11-NEXT: v_mov_b32_e32 v4, v3 13784; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 13785; GFX11-NEXT: v_pk_max_f16 v3, v4, v4 13786; GFX11-NEXT: v_pk_min_f16 v3, v3, v2 13787; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 13788; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc 13789; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13790; GFX11-NEXT: buffer_gl1_inv 13791; GFX11-NEXT: buffer_gl0_inv 13792; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 13793; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 13794; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 13795; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 13796; GFX11-NEXT: s_cbranch_execnz .LBB52_1 13797; GFX11-NEXT: 
; %bb.2: ; %atomicrmw.end 13798; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 13799; GFX11-NEXT: v_mov_b32_e32 v0, v3 13800; GFX11-NEXT: s_setpc_b64 s[30:31] 13801; 13802; GFX10-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13803; GFX10: ; %bb.0: 13804; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13805; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 13806; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 13807; GFX10-NEXT: v_pk_max_f16 v1, v2, v2 13808; GFX10-NEXT: s_mov_b32 s4, 0 13809; GFX10-NEXT: flat_load_dword v0, v[3:4] 13810; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start 13811; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 13812; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13813; GFX10-NEXT: v_mov_b32_e32 v6, v0 13814; GFX10-NEXT: v_pk_max_f16 v0, v6, v6 13815; GFX10-NEXT: v_pk_min_f16 v5, v0, v1 13816; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 13817; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 13818; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13819; GFX10-NEXT: buffer_gl1_inv 13820; GFX10-NEXT: buffer_gl0_inv 13821; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 13822; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 13823; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 13824; GFX10-NEXT: s_cbranch_execnz .LBB52_1 13825; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 13826; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 13827; GFX10-NEXT: s_setpc_b64 s[30:31] 13828; 13829; GFX90A-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13830; GFX90A: ; %bb.0: 13831; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13832; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 13833; GFX90A-NEXT: s_mov_b64 s[4:5], 0 13834; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 13835; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start 13836; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 13837; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13838; GFX90A-NEXT: v_mov_b32_e32 v5, v3 13839; GFX90A-NEXT: v_pk_max_f16 v3, v5, v5 13840; 
GFX90A-NEXT: v_pk_min_f16 v4, v3, v2 13841; GFX90A-NEXT: buffer_wbl2 13842; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc 13843; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13844; GFX90A-NEXT: buffer_invl2 13845; GFX90A-NEXT: buffer_wbinvl1 13846; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 13847; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13848; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 13849; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 13850; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 13851; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 13852; GFX90A-NEXT: v_mov_b32_e32 v0, v3 13853; GFX90A-NEXT: s_setpc_b64 s[30:31] 13854; 13855; GFX908-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13856; GFX908: ; %bb.0: 13857; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13858; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 13859; GFX908-NEXT: s_mov_b64 s[4:5], 0 13860; GFX908-NEXT: v_pk_max_f16 v2, v2, v2 13861; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start 13862; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 13863; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13864; GFX908-NEXT: v_mov_b32_e32 v4, v3 13865; GFX908-NEXT: v_pk_max_f16 v3, v4, v4 13866; GFX908-NEXT: v_pk_min_f16 v3, v3, v2 13867; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc 13868; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13869; GFX908-NEXT: buffer_wbinvl1 13870; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 13871; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13872; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 13873; GFX908-NEXT: s_cbranch_execnz .LBB52_1 13874; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 13875; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 13876; GFX908-NEXT: v_mov_b32_e32 v0, v3 13877; GFX908-NEXT: s_setpc_b64 s[30:31] 13878; 13879; GFX8-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13880; GFX8: ; %bb.0: 13881; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13882; GFX8-NEXT: 
v_add_u32_e32 v3, vcc, 0x7fc, v0 13883; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 13884; GFX8-NEXT: flat_load_dword v0, v[3:4] 13885; GFX8-NEXT: s_mov_b64 s[4:5], 0 13886; GFX8-NEXT: v_max_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 13887; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 13888; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start 13889; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 13890; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13891; GFX8-NEXT: v_mov_b32_e32 v6, v0 13892; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 13893; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 13894; GFX8-NEXT: v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 13895; GFX8-NEXT: v_min_f16_e32 v5, v5, v2 13896; GFX8-NEXT: v_or_b32_e32 v5, v5, v0 13897; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 13898; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13899; GFX8-NEXT: buffer_wbinvl1 13900; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 13901; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13902; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 13903; GFX8-NEXT: s_cbranch_execnz .LBB52_1 13904; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 13905; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 13906; GFX8-NEXT: s_setpc_b64 s[30:31] 13907; 13908; GFX7-LABEL: flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13909; GFX7: ; %bb.0: 13910; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13911; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 13912; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 13913; GFX7-NEXT: flat_load_dword v1, v[4:5] 13914; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3 13915; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2 13916; GFX7-NEXT: s_mov_b64 s[4:5], 0 13917; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 13918; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 13919; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13920; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 13921; GFX7-NEXT: v_lshrrev_b32_e32 v1, 
16, v1 13922; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 13923; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start 13924; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 13925; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 13926; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 13927; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1 13928; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 13929; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 13930; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 13931; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 13932; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 13933; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7 13934; GFX7-NEXT: v_or_b32_e32 v7, v0, v1 13935; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6 13936; GFX7-NEXT: v_or_b32_e32 v6, v8, v0 13937; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc 13938; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13939; GFX7-NEXT: buffer_wbinvl1 13940; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6 13941; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6 13942; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 13943; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7 13944; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 13945; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 13946; GFX7-NEXT: s_cbranch_execnz .LBB52_1 13947; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 13948; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 13949; GFX7-NEXT: s_setpc_b64 s[30:31] 13950 %gep = getelementptr <2 x half>, ptr %ptr, i64 511 13951 %result = atomicrmw fmin ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 13952 ret <2 x half> %result 13953} 13954 13955define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 { 13956; GFX12-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13957; GFX12: ; %bb.0: 13958; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13959; GFX12-NEXT: s_wait_expcnt 0x0 13960; GFX12-NEXT: s_wait_samplecnt 0x0 13961; GFX12-NEXT: s_wait_bvhcnt 0x0 13962; GFX12-NEXT: s_wait_kmcnt 0x0 13963; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 13964; GFX12-NEXT: 
v_pk_max_num_f16 v4, v2, v2 13965; GFX12-NEXT: s_mov_b32 s0, 0 13966; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start 13967; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 13968; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13969; GFX12-NEXT: v_pk_max_num_f16 v2, v3, v3 13970; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 13971; GFX12-NEXT: v_pk_min_num_f16 v2, v2, v4 13972; GFX12-NEXT: global_wb scope:SCOPE_SYS 13973; GFX12-NEXT: s_wait_storecnt 0x0 13974; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 13975; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 13976; GFX12-NEXT: global_inv scope:SCOPE_SYS 13977; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 13978; GFX12-NEXT: v_mov_b32_e32 v3, v2 13979; GFX12-NEXT: s_wait_alu 0xfffe 13980; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 13981; GFX12-NEXT: s_wait_alu 0xfffe 13982; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 13983; GFX12-NEXT: s_cbranch_execnz .LBB53_1 13984; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 13985; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 13986; GFX12-NEXT: s_wait_alu 0xfffe 13987; GFX12-NEXT: s_setpc_b64 s[30:31] 13988; 13989; GFX940-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 13990; GFX940: ; %bb.0: 13991; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 13992; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 13993; GFX940-NEXT: s_mov_b64 s[0:1], 0 13994; GFX940-NEXT: v_pk_max_f16 v4, v2, v2 13995; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start 13996; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 13997; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 13998; GFX940-NEXT: v_pk_max_f16 v2, v3, v3 13999; GFX940-NEXT: s_nop 0 14000; GFX940-NEXT: v_pk_min_f16 v2, v2, v4 14001; GFX940-NEXT: buffer_wbl2 sc0 sc1 14002; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 14003; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14004; GFX940-NEXT: buffer_inv sc0 sc1 14005; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 14006; 
GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 14007; GFX940-NEXT: v_mov_b32_e32 v3, v2 14008; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 14009; GFX940-NEXT: s_cbranch_execnz .LBB53_1 14010; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 14011; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 14012; GFX940-NEXT: s_setpc_b64 s[30:31] 14013; 14014; GFX11-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 14015; GFX11: ; %bb.0: 14016; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14017; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 14018; GFX11-NEXT: v_pk_max_f16 v4, v2, v2 14019; GFX11-NEXT: s_mov_b32 s0, 0 14020; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start 14021; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 14022; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14023; GFX11-NEXT: v_pk_max_f16 v2, v3, v3 14024; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 14025; GFX11-NEXT: v_pk_min_f16 v2, v2, v4 14026; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 14027; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc 14028; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14029; GFX11-NEXT: buffer_gl1_inv 14030; GFX11-NEXT: buffer_gl0_inv 14031; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 14032; GFX11-NEXT: v_mov_b32_e32 v3, v2 14033; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 14034; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 14035; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 14036; GFX11-NEXT: s_cbranch_execnz .LBB53_1 14037; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 14038; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 14039; GFX11-NEXT: s_setpc_b64 s[30:31] 14040; 14041; GFX10-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 14042; GFX10: ; %bb.0: 14043; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14044; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 14045; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 14046; GFX10-NEXT: v_pk_max_f16 v4, v2, v2 14047; GFX10-NEXT: s_mov_b32 s4, 0 14048; GFX10-NEXT: 
flat_load_dword v3, v[0:1] 14049; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start 14050; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 14051; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14052; GFX10-NEXT: v_pk_max_f16 v2, v3, v3 14053; GFX10-NEXT: v_pk_min_f16 v2, v2, v4 14054; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 14055; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 14056; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14057; GFX10-NEXT: buffer_gl1_inv 14058; GFX10-NEXT: buffer_gl0_inv 14059; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 14060; GFX10-NEXT: v_mov_b32_e32 v3, v2 14061; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 14062; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 14063; GFX10-NEXT: s_cbranch_execnz .LBB53_1 14064; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 14065; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 14066; GFX10-NEXT: s_setpc_b64 s[30:31] 14067; 14068; GFX90A-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 14069; GFX90A: ; %bb.0: 14070; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14071; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 14072; GFX90A-NEXT: s_mov_b64 s[4:5], 0 14073; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 14074; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start 14075; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 14076; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14077; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 14078; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 14079; GFX90A-NEXT: buffer_wbl2 14080; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 14081; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14082; GFX90A-NEXT: buffer_invl2 14083; GFX90A-NEXT: buffer_wbinvl1 14084; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 14085; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14086; GFX90A-NEXT: v_mov_b32_e32 v3, v2 14087; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 14088; GFX90A-NEXT: s_cbranch_execnz .LBB53_1 14089; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 14090; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 14091; 
GFX90A-NEXT: s_setpc_b64 s[30:31] 14092; 14093; GFX908-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 14094; GFX908: ; %bb.0: 14095; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14096; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 14097; GFX908-NEXT: s_mov_b64 s[4:5], 0 14098; GFX908-NEXT: v_pk_max_f16 v4, v2, v2 14099; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start 14100; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 14101; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14102; GFX908-NEXT: v_pk_max_f16 v2, v3, v3 14103; GFX908-NEXT: v_pk_min_f16 v2, v2, v4 14104; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 14105; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14106; GFX908-NEXT: buffer_wbinvl1 14107; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 14108; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14109; GFX908-NEXT: v_mov_b32_e32 v3, v2 14110; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 14111; GFX908-NEXT: s_cbranch_execnz .LBB53_1 14112; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 14113; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 14114; GFX908-NEXT: s_setpc_b64 s[30:31] 14115; 14116; GFX8-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 14117; GFX8: ; %bb.0: 14118; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14119; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 14120; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 14121; GFX8-NEXT: flat_load_dword v3, v[0:1] 14122; GFX8-NEXT: s_mov_b64 s[4:5], 0 14123; GFX8-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 14124; GFX8-NEXT: v_max_f16_e32 v5, v2, v2 14125; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start 14126; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 14127; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14128; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 14129; GFX8-NEXT: v_max_f16_e32 v6, v3, v3 14130; GFX8-NEXT: 
v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 14131; GFX8-NEXT: v_min_f16_e32 v6, v6, v5 14132; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 14133; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 14134; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14135; GFX8-NEXT: buffer_wbinvl1 14136; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 14137; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14138; GFX8-NEXT: v_mov_b32_e32 v3, v2 14139; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 14140; GFX8-NEXT: s_cbranch_execnz .LBB53_1 14141; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 14142; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 14143; GFX8-NEXT: s_setpc_b64 s[30:31] 14144; 14145; GFX7-LABEL: flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: 14146; GFX7: ; %bb.0: 14147; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14148; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 14149; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 14150; GFX7-NEXT: flat_load_dword v5, v[0:1] 14151; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 14152; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2 14153; GFX7-NEXT: s_mov_b64 s[4:5], 0 14154; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3 14155; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14156; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 14157; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5 14158; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3 14159; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6 14160; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start 14161; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 14162; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 14163; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 14164; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 14165; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 14166; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 14167; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 14168; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 14169; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 14170; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 14171; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 14172; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 14173; 
GFX7-NEXT: v_or_b32_e32 v5, v7, v4 14174; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc 14175; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14176; GFX7-NEXT: buffer_wbinvl1 14177; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 14178; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 14179; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 14180; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 14181; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14182; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 14183; GFX7-NEXT: s_cbranch_execnz .LBB53_1 14184; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 14185; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 14186; GFX7-NEXT: s_setpc_b64 s[30:31] 14187 %gep = getelementptr <2 x half>, ptr %ptr, i64 511 14188 %unused = atomicrmw fmin ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 14189 ret void 14190} 14191 14192; -------------------------------------------------------------------- 14193; <2 x bfloat> 14194; -------------------------------------------------------------------- 14195 14196define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { 14197; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: 14198; GFX12: ; %bb.0: 14199; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14200; GFX12-NEXT: s_wait_expcnt 0x0 14201; GFX12-NEXT: s_wait_samplecnt 0x0 14202; GFX12-NEXT: s_wait_bvhcnt 0x0 14203; GFX12-NEXT: s_wait_kmcnt 0x0 14204; GFX12-NEXT: flat_load_b32 v3, v[0:1] 14205; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14206; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14207; GFX12-NEXT: s_mov_b32 s1, 0 14208; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start 14209; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 14210; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14211; GFX12-NEXT: v_mov_b32_e32 v6, v3 14212; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14213; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14214; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 14215; GFX12-NEXT: 
v_lshlrev_b32_e32 v3, 16, v6 14216; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 14217; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 14218; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 14219; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 14220; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 14221; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14222; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 14223; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 14224; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 14225; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 14226; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14227; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 14228; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 14229; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14230; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 14231; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 14232; GFX12-NEXT: s_wait_storecnt 0x0 14233; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 14234; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14235; GFX12-NEXT: global_inv scope:SCOPE_DEV 14236; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 14237; GFX12-NEXT: s_wait_alu 0xfffe 14238; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 14239; GFX12-NEXT: s_wait_alu 0xfffe 14240; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 14241; GFX12-NEXT: s_cbranch_execnz .LBB54_1 14242; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 14243; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 14244; GFX12-NEXT: v_mov_b32_e32 v0, v3 14245; GFX12-NEXT: s_wait_alu 0xfffe 14246; GFX12-NEXT: s_setpc_b64 s[30:31] 14247; 14248; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: 14249; GFX940: ; %bb.0: 14250; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14251; GFX940-NEXT: flat_load_dword v3, v[0:1] 14252; GFX940-NEXT: s_mov_b64 s[2:3], 0 14253; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 
14254; GFX940-NEXT: s_movk_i32 s4, 0x7fff 14255; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14256; GFX940-NEXT: s_mov_b32 s5, 0x7060302 14257; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start 14258; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 14259; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14260; GFX940-NEXT: v_mov_b32_e32 v7, v3 14261; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 14262; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 14263; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 14264; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 14265; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 14266; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 14267; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 14268; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 14269; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 14270; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 14271; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14272; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 14273; GFX940-NEXT: s_nop 0 14274; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14275; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] 14276; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 14277; GFX940-NEXT: buffer_wbl2 sc1 14278; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 14279; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14280; GFX940-NEXT: buffer_inv sc1 14281; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 14282; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 14283; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 14284; GFX940-NEXT: s_cbranch_execnz .LBB54_1 14285; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 14286; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 14287; GFX940-NEXT: v_mov_b32_e32 v0, v3 14288; GFX940-NEXT: s_setpc_b64 s[30:31] 14289; 14290; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: 14291; GFX11: ; %bb.0: 14292; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14293; GFX11-NEXT: flat_load_b32 v3, v[0:1] 14294; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14295; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14296; GFX11-NEXT: s_mov_b32 s1, 
0 14297; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 14298; GFX11-NEXT: .p2align 6 14299; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start 14300; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 14301; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14302; GFX11-NEXT: v_mov_b32_e32 v6, v3 14303; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14304; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14305; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 14306; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 14307; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 14308; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 14309; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 14310; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 14311; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 14312; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14313; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 14314; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 14315; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 14316; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 14317; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14318; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 14319; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 14320; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14321; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 14322; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 14323; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 14324; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc 14325; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14326; GFX11-NEXT: buffer_gl1_inv 14327; GFX11-NEXT: buffer_gl0_inv 14328; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 14329; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 14330; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 14331; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 14332; GFX11-NEXT: s_cbranch_execnz .LBB54_1 14333; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 14334; GFX11-NEXT: 
s_set_inst_prefetch_distance 0x2 14335; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 14336; GFX11-NEXT: v_mov_b32_e32 v0, v3 14337; GFX11-NEXT: s_setpc_b64 s[30:31] 14338; 14339; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: 14340; GFX10: ; %bb.0: 14341; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14342; GFX10-NEXT: flat_load_dword v3, v[0:1] 14343; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14344; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14345; GFX10-NEXT: s_mov_b32 s5, 0 14346; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start 14347; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 14348; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14349; GFX10-NEXT: v_mov_b32_e32 v6, v3 14350; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6 14351; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14352; GFX10-NEXT: v_min_f32_e32 v3, v3, v4 14353; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 14354; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 14355; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 14356; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3 14357; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 14358; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 14359; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 14360; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 14361; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3 14362; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 14363; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4 14364; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 14365; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 14366; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc 14367; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14368; GFX10-NEXT: buffer_gl1_inv 14369; GFX10-NEXT: buffer_gl0_inv 14370; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 14371; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 14372; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 14373; GFX10-NEXT: s_cbranch_execnz .LBB54_1 14374; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 14375; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 14376; GFX10-NEXT: v_mov_b32_e32 v0, v3 14377; 
GFX10-NEXT: s_setpc_b64 s[30:31] 14378; 14379; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: 14380; GFX90A: ; %bb.0: 14381; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14382; GFX90A-NEXT: flat_load_dword v3, v[0:1] 14383; GFX90A-NEXT: s_mov_b64 s[6:7], 0 14384; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14385; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 14386; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14387; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 14388; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start 14389; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 14390; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14391; GFX90A-NEXT: v_mov_b32_e32 v7, v3 14392; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 14393; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 14394; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 14395; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 14396; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 14397; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 14398; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 14399; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 14400; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 14401; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 14402; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14403; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 14404; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] 14405; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14406; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 14407; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc 14408; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14409; GFX90A-NEXT: buffer_wbinvl1 14410; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 14411; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 14412; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 14413; GFX90A-NEXT: s_cbranch_execnz .LBB54_1 14414; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 14415; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 14416; GFX90A-NEXT: v_mov_b32_e32 v0, v3 14417; GFX90A-NEXT: s_setpc_b64 s[30:31] 14418; 14419; GFX908-LABEL: 
flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: 14420; GFX908: ; %bb.0: 14421; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14422; GFX908-NEXT: flat_load_dword v3, v[0:1] 14423; GFX908-NEXT: s_mov_b64 s[6:7], 0 14424; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14425; GFX908-NEXT: s_movk_i32 s8, 0x7fff 14426; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14427; GFX908-NEXT: s_mov_b32 s9, 0x7060302 14428; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start 14429; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 14430; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14431; GFX908-NEXT: v_mov_b32_e32 v6, v3 14432; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 14433; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14434; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 14435; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 14436; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 14437; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 14438; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 14439; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 14440; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 14441; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 14442; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14443; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 14444; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] 14445; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14446; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 14447; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc 14448; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14449; GFX908-NEXT: buffer_wbinvl1 14450; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 14451; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 14452; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 14453; GFX908-NEXT: s_cbranch_execnz .LBB54_1 14454; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 14455; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 14456; GFX908-NEXT: v_mov_b32_e32 v0, v3 14457; GFX908-NEXT: s_setpc_b64 s[30:31] 14458; 14459; GFX8-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: 14460; GFX8: ; %bb.0: 14461; 
GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14462; GFX8-NEXT: flat_load_dword v3, v[0:1] 14463; GFX8-NEXT: s_mov_b64 s[6:7], 0 14464; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14465; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14466; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start 14467; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 14468; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14469; GFX8-NEXT: v_mov_b32_e32 v6, v3 14470; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6 14471; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14472; GFX8-NEXT: v_min_f32_e32 v3, v3, v4 14473; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 14474; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 14475; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 14476; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 14477; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 14478; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 14479; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 14480; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 14481; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14482; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3 14483; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 14484; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14485; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] 14486; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 14487; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16 14488; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc 14489; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14490; GFX8-NEXT: buffer_wbinvl1 14491; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 14492; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 14493; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 14494; GFX8-NEXT: s_cbranch_execnz .LBB54_1 14495; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 14496; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 14497; GFX8-NEXT: v_mov_b32_e32 v0, v3 14498; GFX8-NEXT: s_setpc_b64 s[30:31] 14499; 14500; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2bf16__amdgpu_no_fine_grained_memory: 14501; GFX7: ; %bb.0: 14502; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14503; GFX7-NEXT: flat_load_dword v5, v[0:1] 
14504; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 14505; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 14506; GFX7-NEXT: s_mov_b64 s[4:5], 0 14507; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 14508; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14509; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 14510; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5 14511; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14512; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start 14513; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 14514; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 14515; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 14516; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 14517; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 14518; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 14519; GFX7-NEXT: v_min_f32_e32 v6, v6, v4 14520; GFX7-NEXT: v_min_f32_e32 v7, v7, v5 14521; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16 14522; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v6 14523; GFX7-NEXT: v_alignbit_b32 v2, v2, v7, 16 14524; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc 14525; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14526; GFX7-NEXT: buffer_wbinvl1 14527; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 14528; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 14529; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14530; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6 14531; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 14532; GFX7-NEXT: s_cbranch_execnz .LBB54_1 14533; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 14534; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 14535; GFX7-NEXT: v_mov_b32_e32 v0, v3 14536; GFX7-NEXT: v_mov_b32_e32 v1, v2 14537; GFX7-NEXT: s_setpc_b64 s[30:31] 14538 %result = atomicrmw fmin ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 14539 ret <2 x bfloat> %result 14540} 14541 14542define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { 14543; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 14544; GFX12: ; 
%bb.0: 14545; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14546; GFX12-NEXT: s_wait_expcnt 0x0 14547; GFX12-NEXT: s_wait_samplecnt 0x0 14548; GFX12-NEXT: s_wait_bvhcnt 0x0 14549; GFX12-NEXT: s_wait_kmcnt 0x0 14550; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 14551; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14552; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14553; GFX12-NEXT: s_mov_b32 s1, 0 14554; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start 14555; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 14556; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14557; GFX12-NEXT: v_mov_b32_e32 v6, v3 14558; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14559; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14560; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 14561; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 14562; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 14563; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 14564; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 14565; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 14566; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 14567; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14568; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 14569; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 14570; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 14571; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 14572; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14573; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 14574; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 14575; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14576; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 14577; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 14578; GFX12-NEXT: s_wait_storecnt 0x0 14579; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 14580; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14581; GFX12-NEXT: global_inv 
scope:SCOPE_DEV 14582; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 14583; GFX12-NEXT: s_wait_alu 0xfffe 14584; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 14585; GFX12-NEXT: s_wait_alu 0xfffe 14586; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 14587; GFX12-NEXT: s_cbranch_execnz .LBB55_1 14588; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 14589; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 14590; GFX12-NEXT: v_mov_b32_e32 v0, v3 14591; GFX12-NEXT: s_wait_alu 0xfffe 14592; GFX12-NEXT: s_setpc_b64 s[30:31] 14593; 14594; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 14595; GFX940: ; %bb.0: 14596; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14597; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 14598; GFX940-NEXT: s_mov_b64 s[2:3], 0 14599; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14600; GFX940-NEXT: s_movk_i32 s4, 0x7fff 14601; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14602; GFX940-NEXT: s_mov_b32 s5, 0x7060302 14603; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start 14604; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 14605; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14606; GFX940-NEXT: v_mov_b32_e32 v7, v3 14607; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 14608; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 14609; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 14610; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 14611; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 14612; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 14613; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 14614; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 14615; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 14616; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 14617; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14618; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 14619; GFX940-NEXT: s_nop 0 14620; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14621; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] 14622; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 14623; GFX940-NEXT: buffer_wbl2 sc1 14624; GFX940-NEXT: 
flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 14625; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14626; GFX940-NEXT: buffer_inv sc1 14627; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 14628; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 14629; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 14630; GFX940-NEXT: s_cbranch_execnz .LBB55_1 14631; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 14632; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 14633; GFX940-NEXT: v_mov_b32_e32 v0, v3 14634; GFX940-NEXT: s_setpc_b64 s[30:31] 14635; 14636; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 14637; GFX11: ; %bb.0: 14638; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14639; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 14640; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14641; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14642; GFX11-NEXT: s_mov_b32 s1, 0 14643; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 14644; GFX11-NEXT: .p2align 6 14645; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start 14646; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 14647; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14648; GFX11-NEXT: v_mov_b32_e32 v6, v3 14649; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14650; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14651; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 14652; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 14653; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 14654; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 14655; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 14656; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 14657; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 14658; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14659; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 14660; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 14661; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 14662; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 14663; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14664; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 14665; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 14666; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14667; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 14668; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 14669; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 14670; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc 14671; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14672; GFX11-NEXT: buffer_gl1_inv 14673; GFX11-NEXT: buffer_gl0_inv 14674; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 14675; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 14676; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 14677; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 14678; GFX11-NEXT: s_cbranch_execnz .LBB55_1 14679; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 14680; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 14681; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 14682; GFX11-NEXT: v_mov_b32_e32 v0, v3 14683; GFX11-NEXT: s_setpc_b64 s[30:31] 14684; 14685; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 14686; GFX10: ; %bb.0: 14687; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14688; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 14689; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 14690; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 14691; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14692; GFX10-NEXT: s_mov_b32 s5, 0 14693; GFX10-NEXT: flat_load_dword v0, v[3:4] 14694; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start 14695; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 14696; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14697; GFX10-NEXT: v_mov_b32_e32 v6, v0 14698; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 14699; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14700; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 14701; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 14702; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 14703; 
GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 14704; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 14705; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 14706; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 14707; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff 14708; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 14709; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 14710; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 14711; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 14712; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 14713; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 14714; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 14715; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14716; GFX10-NEXT: buffer_gl1_inv 14717; GFX10-NEXT: buffer_gl0_inv 14718; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 14719; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 14720; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 14721; GFX10-NEXT: s_cbranch_execnz .LBB55_1 14722; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 14723; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 14724; GFX10-NEXT: s_setpc_b64 s[30:31] 14725; 14726; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 14727; GFX90A: ; %bb.0: 14728; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14729; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 14730; GFX90A-NEXT: s_mov_b64 s[6:7], 0 14731; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14732; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 14733; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14734; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 14735; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start 14736; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 14737; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14738; GFX90A-NEXT: v_mov_b32_e32 v7, v3 14739; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 14740; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 14741; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 14742; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 14743; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 14744; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1 
14745; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 14746; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 14747; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 14748; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 14749; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14750; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 14751; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] 14752; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14753; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 14754; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc 14755; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14756; GFX90A-NEXT: buffer_wbinvl1 14757; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 14758; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 14759; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 14760; GFX90A-NEXT: s_cbranch_execnz .LBB55_1 14761; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 14762; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 14763; GFX90A-NEXT: v_mov_b32_e32 v0, v3 14764; GFX90A-NEXT: s_setpc_b64 s[30:31] 14765; 14766; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 14767; GFX908: ; %bb.0: 14768; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14769; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 14770; GFX908-NEXT: s_mov_b64 s[6:7], 0 14771; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14772; GFX908-NEXT: s_movk_i32 s8, 0x7fff 14773; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14774; GFX908-NEXT: s_mov_b32 s9, 0x7060302 14775; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start 14776; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 14777; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14778; GFX908-NEXT: v_mov_b32_e32 v6, v3 14779; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 14780; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14781; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 14782; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 14783; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 14784; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 14785; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 14786; 
GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 14787; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 14788; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 14789; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14790; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 14791; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] 14792; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14793; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 14794; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc 14795; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14796; GFX908-NEXT: buffer_wbinvl1 14797; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 14798; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 14799; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 14800; GFX908-NEXT: s_cbranch_execnz .LBB55_1 14801; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 14802; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 14803; GFX908-NEXT: v_mov_b32_e32 v0, v3 14804; GFX908-NEXT: s_setpc_b64 s[30:31] 14805; 14806; GFX8-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 14807; GFX8: ; %bb.0: 14808; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14809; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 14810; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 14811; GFX8-NEXT: flat_load_dword v0, v[3:4] 14812; GFX8-NEXT: s_mov_b64 s[6:7], 0 14813; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 14814; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14815; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start 14816; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 14817; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14818; GFX8-NEXT: v_mov_b32_e32 v6, v0 14819; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 14820; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14821; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 14822; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 14823; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 14824; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 14825; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 14826; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 14827; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 
14828; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 14829; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 14830; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 14831; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 14832; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 14833; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 14834; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 14835; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 14836; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 14837; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 14838; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14839; GFX8-NEXT: buffer_wbinvl1 14840; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 14841; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 14842; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 14843; GFX8-NEXT: s_cbranch_execnz .LBB55_1 14844; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 14845; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 14846; GFX8-NEXT: s_setpc_b64 s[30:31] 14847; 14848; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 14849; GFX7: ; %bb.0: 14850; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14851; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 14852; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 14853; GFX7-NEXT: flat_load_dword v0, v[4:5] 14854; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 14855; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 14856; GFX7-NEXT: s_mov_b64 s[4:5], 0 14857; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 14858; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 14859; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14860; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 14861; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 14862; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start 14863; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 14864; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 14865; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 14866; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 14867; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 14868; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 14869; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 
14870; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 14871; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 14872; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 14873; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 14874; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 14875; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14876; GFX7-NEXT: buffer_wbinvl1 14877; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 14878; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 14879; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 14880; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 14881; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 14882; GFX7-NEXT: s_cbranch_execnz .LBB55_1 14883; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 14884; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 14885; GFX7-NEXT: s_setpc_b64 s[30:31] 14886 %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 14887 %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 14888 ret <2 x bfloat> %result 14889} 14890 14891define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { 14892; GFX12-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 14893; GFX12: ; %bb.0: 14894; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14895; GFX12-NEXT: s_wait_expcnt 0x0 14896; GFX12-NEXT: s_wait_samplecnt 0x0 14897; GFX12-NEXT: s_wait_bvhcnt 0x0 14898; GFX12-NEXT: s_wait_kmcnt 0x0 14899; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 14900; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 14901; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14902; GFX12-NEXT: s_mov_b32 s1, 0 14903; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start 14904; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 14905; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14906; GFX12-NEXT: v_mov_b32_e32 v6, v3 14907; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14908; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 14909; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 14910; 
GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 14911; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 14912; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 14913; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 14914; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 14915; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 14916; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14917; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 14918; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 14919; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 14920; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 14921; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 14922; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 14923; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 14924; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 14925; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 14926; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 14927; GFX12-NEXT: s_wait_storecnt 0x0 14928; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 14929; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 14930; GFX12-NEXT: global_inv scope:SCOPE_DEV 14931; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 14932; GFX12-NEXT: s_wait_alu 0xfffe 14933; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 14934; GFX12-NEXT: s_wait_alu 0xfffe 14935; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 14936; GFX12-NEXT: s_cbranch_execnz .LBB56_1 14937; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 14938; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 14939; GFX12-NEXT: v_mov_b32_e32 v0, v3 14940; GFX12-NEXT: s_wait_alu 0xfffe 14941; GFX12-NEXT: s_setpc_b64 s[30:31] 14942; 14943; GFX940-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 14944; GFX940: ; %bb.0: 14945; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14946; GFX940-NEXT: v_mov_b32_e32 v4, v0 14947; GFX940-NEXT: v_mov_b32_e32 v5, v1 14948; 
GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 14949; GFX940-NEXT: s_movk_i32 s0, 0xf800 14950; GFX940-NEXT: s_nop 0 14951; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc 14952; GFX940-NEXT: flat_load_dword v0, v[0:1] 14953; GFX940-NEXT: s_mov_b32 s1, -1 14954; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] 14955; GFX940-NEXT: s_mov_b64 s[2:3], 0 14956; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 14957; GFX940-NEXT: s_movk_i32 s4, 0x7fff 14958; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 14959; GFX940-NEXT: s_mov_b32 s5, 0x7060302 14960; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start 14961; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 14962; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14963; GFX940-NEXT: v_mov_b32_e32 v7, v0 14964; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 14965; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 14966; GFX940-NEXT: v_min_f32_e32 v0, v0, v1 14967; GFX940-NEXT: v_min_f32_e32 v3, v3, v2 14968; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 14969; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 14970; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 14971; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 14972; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 14973; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 14974; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 14975; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 14976; GFX940-NEXT: s_nop 0 14977; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc 14978; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] 14979; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 14980; GFX940-NEXT: buffer_wbl2 sc1 14981; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 14982; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 14983; GFX940-NEXT: buffer_inv sc1 14984; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 14985; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 14986; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 14987; GFX940-NEXT: s_cbranch_execnz .LBB56_1 14988; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 14989; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 14990; 
GFX940-NEXT: s_setpc_b64 s[30:31] 14991; 14992; GFX11-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 14993; GFX11: ; %bb.0: 14994; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 14995; GFX11-NEXT: v_mov_b32_e32 v3, v0 14996; GFX11-NEXT: s_mov_b32 s1, 0 14997; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 14998; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3 14999; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo 15000; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3 15001; GFX11-NEXT: flat_load_b32 v0, v[4:5] 15002; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 15003; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 15004; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15005; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 15006; GFX11-NEXT: .p2align 6 15007; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start 15008; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 15009; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15010; GFX11-NEXT: v_mov_b32_e32 v6, v0 15011; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 15012; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 15013; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 15014; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6 15015; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15016; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 15017; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 15018; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 15019; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 15020; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 15021; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 15022; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 15023; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0 15024; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0 15025; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 15026; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 15027; GFX11-NEXT: v_add3_u32 v7, v7, v0, 
0x7fff 15028; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 15029; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0 15030; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 15031; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 15032; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc 15033; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15034; GFX11-NEXT: buffer_gl1_inv 15035; GFX11-NEXT: buffer_gl0_inv 15036; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 15037; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 15038; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 15039; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 15040; GFX11-NEXT: s_cbranch_execnz .LBB56_1 15041; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 15042; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 15043; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 15044; GFX11-NEXT: s_setpc_b64 s[30:31] 15045; 15046; GFX10-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 15047; GFX10: ; %bb.0: 15048; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15049; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 15050; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 15051; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 15052; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15053; GFX10-NEXT: s_mov_b32 s5, 0 15054; GFX10-NEXT: flat_load_dword v0, v[3:4] 15055; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start 15056; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 15057; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15058; GFX10-NEXT: v_mov_b32_e32 v6, v0 15059; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 15060; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 15061; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 15062; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 15063; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 15064; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 15065; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 15066; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 15067; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 15068; GFX10-NEXT: 
v_add3_u32 v7, v7, v0, 0x7fff 15069; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 15070; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 15071; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 15072; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 15073; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 15074; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 15075; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 15076; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15077; GFX10-NEXT: buffer_gl1_inv 15078; GFX10-NEXT: buffer_gl0_inv 15079; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 15080; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 15081; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 15082; GFX10-NEXT: s_cbranch_execnz .LBB56_1 15083; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 15084; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 15085; GFX10-NEXT: s_setpc_b64 s[30:31] 15086; 15087; GFX90A-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 15088; GFX90A: ; %bb.0: 15089; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15090; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 15091; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 15092; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 15093; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 15094; GFX90A-NEXT: flat_load_dword v0, v[0:1] 15095; GFX90A-NEXT: s_mov_b64 s[6:7], 0 15096; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 15097; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 15098; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15099; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 15100; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start 15101; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 15102; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15103; GFX90A-NEXT: v_mov_b32_e32 v7, v0 15104; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7 15105; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 15106; GFX90A-NEXT: v_min_f32_e32 v0, v0, v1 15107; GFX90A-NEXT: v_min_f32_e32 v3, v3, v2 15108; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 15109; GFX90A-NEXT: 
v_bfe_u32 v9, v3, 16, 1 15110; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 15111; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3 15112; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 15113; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8 15114; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 15115; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 15116; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5] 15117; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc 15118; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9 15119; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc 15120; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15121; GFX90A-NEXT: buffer_wbinvl1 15122; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 15123; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15124; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 15125; GFX90A-NEXT: s_cbranch_execnz .LBB56_1 15126; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 15127; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 15128; GFX90A-NEXT: s_setpc_b64 s[30:31] 15129; 15130; GFX908-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 15131; GFX908: ; %bb.0: 15132; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15133; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 15134; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 15135; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 15136; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 15137; GFX908-NEXT: flat_load_dword v0, v[0:1] 15138; GFX908-NEXT: s_mov_b64 s[6:7], 0 15139; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2 15140; GFX908-NEXT: s_movk_i32 s8, 0x7fff 15141; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15142; GFX908-NEXT: s_mov_b32 s9, 0x7060302 15143; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start 15144; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 15145; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15146; GFX908-NEXT: v_mov_b32_e32 v6, v0 15147; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6 15148; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 15149; GFX908-NEXT: v_min_f32_e32 v0, v0, 
v1 15150; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 15151; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 15152; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1 15153; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 15154; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 15155; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 15156; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 15157; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 15158; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 15159; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 15160; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 15161; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9 15162; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 15163; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15164; GFX908-NEXT: buffer_wbinvl1 15165; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 15166; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15167; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 15168; GFX908-NEXT: s_cbranch_execnz .LBB56_1 15169; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 15170; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 15171; GFX908-NEXT: s_setpc_b64 s[30:31] 15172; 15173; GFX8-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 15174; GFX8: ; %bb.0: 15175; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15176; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0 15177; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc 15178; GFX8-NEXT: flat_load_dword v0, v[3:4] 15179; GFX8-NEXT: s_mov_b64 s[6:7], 0 15180; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 15181; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 15182; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start 15183; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 15184; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15185; GFX8-NEXT: v_mov_b32_e32 v6, v0 15186; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 15187; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 15188; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 15189; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 15190; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 15191; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 
15192; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 15193; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 15194; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 15195; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 15196; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 15197; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 15198; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 15199; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 15200; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 15201; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 15202; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 15203; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 15204; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 15205; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15206; GFX8-NEXT: buffer_wbinvl1 15207; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 15208; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15209; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 15210; GFX8-NEXT: s_cbranch_execnz .LBB56_1 15211; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 15212; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 15213; GFX8-NEXT: s_setpc_b64 s[30:31] 15214; 15215; GFX7-LABEL: flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 15216; GFX7: ; %bb.0: 15217; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15218; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0 15219; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc 15220; GFX7-NEXT: flat_load_dword v0, v[4:5] 15221; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 15222; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 15223; GFX7-NEXT: s_mov_b64 s[4:5], 0 15224; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 15225; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 15226; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15227; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 15228; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 15229; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start 15230; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 15231; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 15232; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 15233; GFX7-NEXT: v_and_b32_e32 v6, 
0xffff0000, v1 15234; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 15235; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 15236; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 15237; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 15238; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 15239; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 15240; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 15241; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 15242; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15243; GFX7-NEXT: buffer_wbinvl1 15244; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 15245; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 15246; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 15247; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 15248; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 15249; GFX7-NEXT: s_cbranch_execnz .LBB56_1 15250; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 15251; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 15252; GFX7-NEXT: s_setpc_b64 s[30:31] 15253 %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 15254 %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 15255 ret <2 x bfloat> %result 15256} 15257 15258define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { 15259; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: 15260; GFX12: ; %bb.0: 15261; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15262; GFX12-NEXT: s_wait_expcnt 0x0 15263; GFX12-NEXT: s_wait_samplecnt 0x0 15264; GFX12-NEXT: s_wait_bvhcnt 0x0 15265; GFX12-NEXT: s_wait_kmcnt 0x0 15266; GFX12-NEXT: flat_load_b32 v3, v[0:1] 15267; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15268; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15269; GFX12-NEXT: s_mov_b32 s1, 0 15270; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start 15271; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 15272; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15273; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15274; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15275; GFX12-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15276; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 15277; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 15278; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15279; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 15280; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 15281; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 15282; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 15283; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 15284; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 15285; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 15286; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 15287; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15288; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 15289; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 15290; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 15291; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 15292; GFX12-NEXT: s_wait_storecnt 0x0 15293; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 15294; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15295; GFX12-NEXT: global_inv scope:SCOPE_DEV 15296; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 15297; GFX12-NEXT: v_mov_b32_e32 v3, v2 15298; GFX12-NEXT: s_wait_alu 0xfffe 15299; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 15300; GFX12-NEXT: s_wait_alu 0xfffe 15301; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 15302; GFX12-NEXT: s_cbranch_execnz .LBB57_1 15303; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 15304; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 15305; GFX12-NEXT: s_wait_alu 0xfffe 15306; GFX12-NEXT: s_setpc_b64 s[30:31] 15307; 15308; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: 15309; GFX940: ; %bb.0: 15310; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15311; GFX940-NEXT: flat_load_dword v3, v[0:1] 15312; GFX940-NEXT: s_mov_b64 s[2:3], 0 15313; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15314; GFX940-NEXT: s_movk_i32 s4, 
0x7fff 15315; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15316; GFX940-NEXT: s_mov_b32 s5, 0x7060302 15317; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start 15318; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 15319; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15320; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15321; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15322; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 15323; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 15324; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 15325; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 15326; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 15327; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 15328; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 15329; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 15330; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15331; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 15332; GFX940-NEXT: s_nop 0 15333; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15334; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] 15335; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 15336; GFX940-NEXT: buffer_wbl2 sc1 15337; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 15338; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15339; GFX940-NEXT: buffer_inv sc1 15340; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15341; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 15342; GFX940-NEXT: v_mov_b32_e32 v3, v2 15343; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 15344; GFX940-NEXT: s_cbranch_execnz .LBB57_1 15345; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 15346; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 15347; GFX940-NEXT: s_setpc_b64 s[30:31] 15348; 15349; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: 15350; GFX11: ; %bb.0: 15351; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15352; GFX11-NEXT: flat_load_b32 v3, v[0:1] 15353; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15354; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15355; GFX11-NEXT: s_mov_b32 s1, 0 15356; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 15357; GFX11-NEXT: 
.p2align 6 15358; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start 15359; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 15360; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15361; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15362; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15363; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15364; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 15365; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 15366; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15367; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 15368; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 15369; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 15370; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 15371; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 15372; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 15373; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 15374; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 15375; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15376; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 15377; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 15378; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 15379; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 15380; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 15381; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 15382; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15383; GFX11-NEXT: buffer_gl1_inv 15384; GFX11-NEXT: buffer_gl0_inv 15385; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 15386; GFX11-NEXT: v_mov_b32_e32 v3, v2 15387; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 15388; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 15389; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 15390; GFX11-NEXT: s_cbranch_execnz .LBB57_1 15391; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 15392; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 15393; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 15394; GFX11-NEXT: s_setpc_b64 s[30:31] 15395; 15396; GFX10-LABEL: 
flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: 15397; GFX10: ; %bb.0: 15398; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15399; GFX10-NEXT: flat_load_dword v3, v[0:1] 15400; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15401; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15402; GFX10-NEXT: s_mov_b32 s5, 0 15403; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start 15404; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 15405; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15406; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15407; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15408; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 15409; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 15410; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 15411; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 15412; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 15413; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 15414; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 15415; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 15416; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 15417; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 15418; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 15419; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 15420; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 15421; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 15422; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15423; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15424; GFX10-NEXT: buffer_gl1_inv 15425; GFX10-NEXT: buffer_gl0_inv 15426; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 15427; GFX10-NEXT: v_mov_b32_e32 v3, v2 15428; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 15429; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 15430; GFX10-NEXT: s_cbranch_execnz .LBB57_1 15431; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 15432; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 15433; GFX10-NEXT: s_setpc_b64 s[30:31] 15434; 15435; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: 15436; GFX90A: ; %bb.0: 15437; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15438; GFX90A-NEXT: 
flat_load_dword v3, v[0:1] 15439; GFX90A-NEXT: s_mov_b64 s[6:7], 0 15440; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15441; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 15442; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15443; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 15444; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start 15445; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 15446; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15447; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15448; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15449; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 15450; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 15451; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 15452; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 15453; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 15454; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 15455; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 15456; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 15457; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15458; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 15459; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 15460; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15461; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 15462; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15463; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15464; GFX90A-NEXT: buffer_wbinvl1 15465; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15466; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15467; GFX90A-NEXT: v_mov_b32_e32 v3, v2 15468; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 15469; GFX90A-NEXT: s_cbranch_execnz .LBB57_1 15470; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 15471; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 15472; GFX90A-NEXT: s_setpc_b64 s[30:31] 15473; 15474; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: 15475; GFX908: ; %bb.0: 15476; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15477; GFX908-NEXT: flat_load_dword v3, v[0:1] 15478; GFX908-NEXT: s_mov_b64 s[6:7], 0 15479; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15480; GFX908-NEXT: 
s_movk_i32 s8, 0x7fff 15481; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15482; GFX908-NEXT: s_mov_b32 s9, 0x7060302 15483; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start 15484; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 15485; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15486; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15487; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15488; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 15489; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 15490; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 15491; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 15492; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 15493; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 15494; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 15495; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 15496; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15497; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 15498; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 15499; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15500; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 15501; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15502; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15503; GFX908-NEXT: buffer_wbinvl1 15504; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15505; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15506; GFX908-NEXT: v_mov_b32_e32 v3, v2 15507; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 15508; GFX908-NEXT: s_cbranch_execnz .LBB57_1 15509; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 15510; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 15511; GFX908-NEXT: s_setpc_b64 s[30:31] 15512; 15513; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: 15514; GFX8: ; %bb.0: 15515; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15516; GFX8-NEXT: flat_load_dword v3, v[0:1] 15517; GFX8-NEXT: s_mov_b64 s[6:7], 0 15518; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15519; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15520; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start 15521; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 15522; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) 15523; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15524; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15525; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 15526; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 15527; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 15528; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 15529; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 15530; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 15531; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 15532; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 15533; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 15534; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15535; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 15536; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 15537; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15538; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 15539; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 15540; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 15541; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15542; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15543; GFX8-NEXT: buffer_wbinvl1 15544; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15545; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15546; GFX8-NEXT: v_mov_b32_e32 v3, v2 15547; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 15548; GFX8-NEXT: s_cbranch_execnz .LBB57_1 15549; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 15550; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 15551; GFX8-NEXT: s_setpc_b64 s[30:31] 15552; 15553; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory: 15554; GFX7: ; %bb.0: 15555; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15556; GFX7-NEXT: flat_load_dword v4, v[0:1] 15557; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 15558; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 15559; GFX7-NEXT: s_mov_b64 s[4:5], 0 15560; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 15561; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 15562; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15563; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 15564; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 15565; 
GFX7-NEXT: .LBB57_1: ; %atomicrmw.start 15566; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 15567; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 15568; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 15569; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 15570; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 15571; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 15572; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 15573; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 15574; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 15575; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 15576; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 15577; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc 15578; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15579; GFX7-NEXT: buffer_wbinvl1 15580; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 15581; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 15582; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 15583; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 15584; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 15585; GFX7-NEXT: s_cbranch_execnz .LBB57_1 15586; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 15587; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 15588; GFX7-NEXT: s_setpc_b64 s[30:31] 15589 %unused = atomicrmw fmin ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 15590 ret void 15591} 15592; Agent-scope seq_cst atomicrmw fmin on <2 x bfloat> through %ptr at element index 511 (byte offset 2044 = 0x7fc), result unused; every check prefix below expects a flat_atomic_cmpswap retry loop, with GFX12/GFX940/GFX11/GFX90A/GFX908 using an offset:2044 operand and GFX10/GFX8/GFX7 adding 0x7fc to the address first. 15593define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { 15594; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 15595; GFX12: ; %bb.0: 15596; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15597; GFX12-NEXT: s_wait_expcnt 0x0 15598; GFX12-NEXT: s_wait_samplecnt 0x0 15599; GFX12-NEXT: s_wait_bvhcnt 0x0 15600; GFX12-NEXT: s_wait_kmcnt 0x0 15601; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 15602; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15603; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15604; GFX12-NEXT: s_mov_b32 s1, 0 15605; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start 15606; GFX12-NEXT: ; =>This Inner Loop 
Header: Depth=1 15607; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15608; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15609; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15610; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15611; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 15612; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 15613; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15614; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 15615; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 15616; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 15617; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 15618; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 15619; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 15620; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 15621; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 15622; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15623; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 15624; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 15625; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 15626; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 15627; GFX12-NEXT: s_wait_storecnt 0x0 15628; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 15629; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15630; GFX12-NEXT: global_inv scope:SCOPE_DEV 15631; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 15632; GFX12-NEXT: v_mov_b32_e32 v3, v2 15633; GFX12-NEXT: s_wait_alu 0xfffe 15634; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 15635; GFX12-NEXT: s_wait_alu 0xfffe 15636; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 15637; GFX12-NEXT: s_cbranch_execnz .LBB58_1 15638; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 15639; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 15640; GFX12-NEXT: s_wait_alu 0xfffe 15641; GFX12-NEXT: s_setpc_b64 s[30:31] 15642; 15643; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 15644; GFX940: ; %bb.0: 15645; GFX940-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 15646; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 15647; GFX940-NEXT: s_mov_b64 s[2:3], 0 15648; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15649; GFX940-NEXT: s_movk_i32 s4, 0x7fff 15650; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15651; GFX940-NEXT: s_mov_b32 s5, 0x7060302 15652; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start 15653; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 15654; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15655; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15656; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15657; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 15658; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 15659; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 15660; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 15661; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 15662; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 15663; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 15664; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 15665; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15666; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 15667; GFX940-NEXT: s_nop 0 15668; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15669; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] 15670; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 15671; GFX940-NEXT: buffer_wbl2 sc1 15672; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 15673; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15674; GFX940-NEXT: buffer_inv sc1 15675; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15676; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 15677; GFX940-NEXT: v_mov_b32_e32 v3, v2 15678; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 15679; GFX940-NEXT: s_cbranch_execnz .LBB58_1 15680; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 15681; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 15682; GFX940-NEXT: s_setpc_b64 s[30:31] 15683; 15684; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 15685; GFX11: ; %bb.0: 15686; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15687; 
GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 15688; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15689; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15690; GFX11-NEXT: s_mov_b32 s1, 0 15691; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 15692; GFX11-NEXT: .p2align 6 15693; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start 15694; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 15695; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15696; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15697; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15698; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15699; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 15700; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 15701; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15702; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 15703; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 15704; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 15705; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 15706; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 15707; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 15708; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 15709; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 15710; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15711; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 15712; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 15713; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 15714; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 15715; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 15716; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc 15717; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15718; GFX11-NEXT: buffer_gl1_inv 15719; GFX11-NEXT: buffer_gl0_inv 15720; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 15721; GFX11-NEXT: v_mov_b32_e32 v3, v2 15722; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 15723; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 15724; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 15725; GFX11-NEXT: s_cbranch_execnz .LBB58_1 
15726; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 15727; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 15728; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 15729; GFX11-NEXT: s_setpc_b64 s[30:31] 15730; 15731; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 15732; GFX10: ; %bb.0: 15733; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15734; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 15735; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 15736; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15737; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15738; GFX10-NEXT: s_mov_b32 s5, 0 15739; GFX10-NEXT: flat_load_dword v3, v[0:1] 15740; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start 15741; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 15742; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15743; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15744; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15745; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 15746; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 15747; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 15748; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 15749; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 15750; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 15751; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 15752; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 15753; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 15754; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 15755; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 15756; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 15757; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 15758; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 15759; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15760; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15761; GFX10-NEXT: buffer_gl1_inv 15762; GFX10-NEXT: buffer_gl0_inv 15763; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 15764; GFX10-NEXT: v_mov_b32_e32 v3, v2 15765; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 15766; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 15767; GFX10-NEXT: 
s_cbranch_execnz .LBB58_1 15768; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 15769; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 15770; GFX10-NEXT: s_setpc_b64 s[30:31] 15771; 15772; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 15773; GFX90A: ; %bb.0: 15774; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15775; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 15776; GFX90A-NEXT: s_mov_b64 s[6:7], 0 15777; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15778; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 15779; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15780; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 15781; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start 15782; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 15783; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15784; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15785; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15786; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 15787; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 15788; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 15789; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 15790; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 15791; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 15792; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 15793; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 15794; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15795; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 15796; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 15797; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15798; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 15799; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 15800; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15801; GFX90A-NEXT: buffer_wbinvl1 15802; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15803; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15804; GFX90A-NEXT: v_mov_b32_e32 v3, v2 15805; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 15806; GFX90A-NEXT: s_cbranch_execnz .LBB58_1 15807; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 15808; GFX90A-NEXT: s_or_b64 
exec, exec, s[6:7] 15809; GFX90A-NEXT: s_setpc_b64 s[30:31] 15810; 15811; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 15812; GFX908: ; %bb.0: 15813; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15814; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 15815; GFX908-NEXT: s_mov_b64 s[6:7], 0 15816; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15817; GFX908-NEXT: s_movk_i32 s8, 0x7fff 15818; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15819; GFX908-NEXT: s_mov_b32 s9, 0x7060302 15820; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start 15821; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 15822; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15823; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15824; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15825; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 15826; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 15827; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 15828; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 15829; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 15830; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 15831; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 15832; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 15833; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15834; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 15835; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 15836; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15837; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 15838; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 15839; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15840; GFX908-NEXT: buffer_wbinvl1 15841; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15842; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15843; GFX908-NEXT: v_mov_b32_e32 v3, v2 15844; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 15845; GFX908-NEXT: s_cbranch_execnz .LBB58_1 15846; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 15847; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 15848; GFX908-NEXT: s_setpc_b64 s[30:31] 15849; 15850; GFX8-LABEL: 
flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 15851; GFX8: ; %bb.0: 15852; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15853; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 15854; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 15855; GFX8-NEXT: flat_load_dword v3, v[0:1] 15856; GFX8-NEXT: s_mov_b64 s[6:7], 0 15857; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15858; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15859; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start 15860; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 15861; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15862; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15863; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15864; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 15865; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 15866; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 15867; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 15868; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 15869; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 15870; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 15871; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 15872; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 15873; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 15874; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 15875; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 15876; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 15877; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 15878; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 15879; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 15880; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 15881; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15882; GFX8-NEXT: buffer_wbinvl1 15883; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 15884; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 15885; GFX8-NEXT: v_mov_b32_e32 v3, v2 15886; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 15887; GFX8-NEXT: s_cbranch_execnz .LBB58_1 15888; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 15889; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 15890; GFX8-NEXT: s_setpc_b64 s[30:31] 15891; 15892; GFX7-LABEL: 
flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 15893; GFX7: ; %bb.0: 15894; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15895; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 15896; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 15897; GFX7-NEXT: flat_load_dword v4, v[0:1] 15898; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 15899; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 15900; GFX7-NEXT: s_mov_b64 s[4:5], 0 15901; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 15902; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 15903; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15904; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 15905; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 15906; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start 15907; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 15908; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 15909; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 15910; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 15911; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 15912; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 15913; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 15914; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 15915; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 15916; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 15917; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 15918; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc 15919; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 15920; GFX7-NEXT: buffer_wbinvl1 15921; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 15922; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 15923; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 15924; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 15925; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 15926; GFX7-NEXT: s_cbranch_execnz .LBB58_1 15927; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 15928; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 15929; GFX7-NEXT: s_setpc_b64 s[30:31] 15930 %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 15931 %unused = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 15932 ret 
void 15933} 15934; Agent-scope seq_cst atomicrmw fmin on <2 x bfloat> through %ptr at element index -512 (byte offset -2048), result unused; every check prefix below expects a flat_atomic_cmpswap retry loop, but only the GFX12 checks carry the offset on the memory instructions (offset:-2048) while the other prefixes expect explicit address arithmetic with 0xfffff800 beforehand. 15935define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { 15936; GFX12-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 15937; GFX12: ; %bb.0: 15938; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15939; GFX12-NEXT: s_wait_expcnt 0x0 15940; GFX12-NEXT: s_wait_samplecnt 0x0 15941; GFX12-NEXT: s_wait_bvhcnt 0x0 15942; GFX12-NEXT: s_wait_kmcnt 0x0 15943; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 15944; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15945; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15946; GFX12-NEXT: s_mov_b32 s1, 0 15947; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start 15948; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 15949; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15950; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 15951; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 15952; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15953; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 15954; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 15955; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15956; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 15957; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 15958; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 15959; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 15960; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 15961; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 15962; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 15963; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 15964; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 15965; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 15966; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 15967; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 15968; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 15969; GFX12-NEXT: s_wait_storecnt 0x0 15970; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV 15971; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 15972; GFX12-NEXT: global_inv scope:SCOPE_DEV 15973; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 15974; GFX12-NEXT: v_mov_b32_e32 v3, v2 15975; GFX12-NEXT: s_wait_alu 0xfffe 15976; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 15977; GFX12-NEXT: s_wait_alu 0xfffe 15978; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 15979; GFX12-NEXT: s_cbranch_execnz .LBB59_1 15980; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 15981; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 15982; GFX12-NEXT: s_wait_alu 0xfffe 15983; GFX12-NEXT: s_setpc_b64 s[30:31] 15984; 15985; GFX940-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 15986; GFX940: ; %bb.0: 15987; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15988; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 15989; GFX940-NEXT: s_movk_i32 s0, 0xf800 15990; GFX940-NEXT: s_nop 0 15991; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 15992; GFX940-NEXT: flat_load_dword v3, v[4:5] 15993; GFX940-NEXT: s_mov_b32 s1, -1 15994; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] 15995; GFX940-NEXT: s_mov_b64 s[2:3], 0 15996; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 15997; GFX940-NEXT: s_movk_i32 s4, 0x7fff 15998; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 15999; GFX940-NEXT: s_mov_b32 s5, 0x7060302 16000; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start 16001; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 16002; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16003; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 16004; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 16005; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 16006; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 16007; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 16008; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 16009; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 16010; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 16011; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 16012; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 16013; 
GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 16014; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 16015; GFX940-NEXT: s_nop 0 16016; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 16017; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] 16018; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 16019; GFX940-NEXT: buffer_wbl2 sc1 16020; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 16021; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16022; GFX940-NEXT: buffer_inv sc1 16023; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 16024; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 16025; GFX940-NEXT: v_mov_b32_e32 v3, v2 16026; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 16027; GFX940-NEXT: s_cbranch_execnz .LBB59_1 16028; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 16029; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 16030; GFX940-NEXT: s_setpc_b64 s[30:31] 16031; 16032; GFX11-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 16033; GFX11: ; %bb.0: 16034; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16035; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 16036; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo 16037; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 16038; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 16039; GFX11-NEXT: flat_load_b32 v3, v[3:4] 16040; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16041; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 16042; GFX11-NEXT: s_mov_b32 s1, 0 16043; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 16044; GFX11-NEXT: .p2align 6 16045; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start 16046; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 16047; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16048; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 16049; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 16050; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 16051; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 16052; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 16053; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 16054; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 16055; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 16056; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 16057; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 16058; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 16059; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 16060; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 16061; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 16062; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 16063; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 16064; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 16065; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 16066; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 16067; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 16068; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc 16069; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16070; GFX11-NEXT: buffer_gl1_inv 16071; GFX11-NEXT: buffer_gl0_inv 16072; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 16073; GFX11-NEXT: v_mov_b32_e32 v3, v2 16074; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 16075; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 16076; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 16077; GFX11-NEXT: s_cbranch_execnz .LBB59_1 16078; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 16079; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 16080; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 16081; GFX11-NEXT: s_setpc_b64 s[30:31] 16082; 16083; GFX10-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 16084; GFX10: ; %bb.0: 16085; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16086; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 16087; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo 16088; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16089; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 16090; GFX10-NEXT: s_mov_b32 s5, 0 16091; GFX10-NEXT: flat_load_dword v3, v[0:1] 16092; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start 16093; 
GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 16094; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16095; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 16096; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 16097; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 16098; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 16099; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 16100; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 16101; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 16102; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 16103; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 16104; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 16105; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 16106; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 16107; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 16108; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 16109; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 16110; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 16111; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16112; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16113; GFX10-NEXT: buffer_gl1_inv 16114; GFX10-NEXT: buffer_gl0_inv 16115; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 16116; GFX10-NEXT: v_mov_b32_e32 v3, v2 16117; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 16118; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 16119; GFX10-NEXT: s_cbranch_execnz .LBB59_1 16120; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 16121; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 16122; GFX10-NEXT: s_setpc_b64 s[30:31] 16123; 16124; GFX90A-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 16125; GFX90A: ; %bb.0: 16126; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16127; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 16128; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc 16129; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 16130; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 16131; GFX90A-NEXT: flat_load_dword v1, v[0:1] 16132; GFX90A-NEXT: s_mov_b64 s[6:7], 0 16133; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2 16134; 
GFX90A-NEXT: s_movk_i32 s8, 0x7fff 16135; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16136; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 16137; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start 16138; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 16139; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16140; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 16141; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 16142; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 16143; GFX90A-NEXT: v_min_f32_e32 v6, v6, v2 16144; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 16145; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 16146; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 16147; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 16148; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 16149; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 16150; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 16151; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 16152; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 16153; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 16154; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 16155; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 16156; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16157; GFX90A-NEXT: buffer_wbinvl1 16158; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 16159; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 16160; GFX90A-NEXT: v_mov_b32_e32 v1, v0 16161; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 16162; GFX90A-NEXT: s_cbranch_execnz .LBB59_1 16163; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 16164; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 16165; GFX90A-NEXT: s_setpc_b64 s[30:31] 16166; 16167; GFX908-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 16168; GFX908: ; %bb.0: 16169; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16170; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0 16171; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc 16172; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 16173; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc 16174; GFX908-NEXT: 
flat_load_dword v1, v[0:1] 16175; GFX908-NEXT: s_mov_b64 s[6:7], 0 16176; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2 16177; GFX908-NEXT: s_movk_i32 s8, 0x7fff 16178; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16179; GFX908-NEXT: s_mov_b32 s9, 0x7060302 16180; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start 16181; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 16182; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16183; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1 16184; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 16185; GFX908-NEXT: v_min_f32_e32 v0, v0, v5 16186; GFX908-NEXT: v_min_f32_e32 v6, v6, v2 16187; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1 16188; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 16189; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0 16190; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 16191; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8 16192; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 16193; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 16194; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 16195; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 16196; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 16197; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9 16198; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc 16199; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16200; GFX908-NEXT: buffer_wbinvl1 16201; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 16202; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 16203; GFX908-NEXT: v_mov_b32_e32 v1, v0 16204; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 16205; GFX908-NEXT: s_cbranch_execnz .LBB59_1 16206; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 16207; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 16208; GFX908-NEXT: s_setpc_b64 s[30:31] 16209; 16210; GFX8-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 16211; GFX8: ; %bb.0: 16212; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16213; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0 16214; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 16215; GFX8-NEXT: flat_load_dword v3, 
v[0:1] 16216; GFX8-NEXT: s_mov_b64 s[6:7], 0 16217; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16218; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 16219; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start 16220; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 16221; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16222; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 16223; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 16224; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 16225; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 16226; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 16227; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 16228; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 16229; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 16230; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 16231; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 16232; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 16233; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 16234; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 16235; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 16236; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 16237; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 16238; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 16239; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 16240; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16241; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16242; GFX8-NEXT: buffer_wbinvl1 16243; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 16244; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 16245; GFX8-NEXT: v_mov_b32_e32 v3, v2 16246; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 16247; GFX8-NEXT: s_cbranch_execnz .LBB59_1 16248; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 16249; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 16250; GFX8-NEXT: s_setpc_b64 s[30:31] 16251; 16252; GFX7-LABEL: flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: 16253; GFX7: ; %bb.0: 16254; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16255; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0 16256; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc 16257; GFX7-NEXT: flat_load_dword v4, v[0:1] 
16258; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 16259; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 16260; GFX7-NEXT: s_mov_b64 s[4:5], 0 16261; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 16262; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 16263; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16264; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 16265; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 16266; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start 16267; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 16268; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 16269; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 16270; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 16271; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 16272; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 16273; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 16274; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 16275; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 16276; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 16277; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 16278; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc 16279; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16280; GFX7-NEXT: buffer_wbinvl1 16281; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 16282; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 16283; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16284; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 16285; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 16286; GFX7-NEXT: s_cbranch_execnz .LBB59_1 16287; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 16288; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 16289; GFX7-NEXT: s_setpc_b64 s[30:31] 16290 %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 16291 %unused = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 16292 ret void 16293} 16294 16295define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { 16296; GFX12-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16297; GFX12: ; %bb.0: 16298; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 16299; GFX12-NEXT: s_wait_expcnt 0x0 16300; GFX12-NEXT: s_wait_samplecnt 0x0 16301; GFX12-NEXT: s_wait_bvhcnt 0x0 16302; GFX12-NEXT: s_wait_kmcnt 0x0 16303; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 16304; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16305; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16306; GFX12-NEXT: s_mov_b32 s1, 0 16307; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start 16308; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 16309; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 16310; GFX12-NEXT: v_mov_b32_e32 v6, v3 16311; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16312; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 16313; GFX12-NEXT: v_min_num_f32_e32 v5, v5, v2 16314; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 16315; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 16316; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 16317; GFX12-NEXT: v_min_num_f32_e32 v3, v3, v4 16318; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 16319; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 16320; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 16321; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 16322; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 16323; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 16324; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 16325; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 16326; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 16327; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 16328; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16329; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 16330; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 16331; GFX12-NEXT: global_wb scope:SCOPE_SYS 16332; GFX12-NEXT: s_wait_storecnt 0x0 16333; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 16334; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 16335; GFX12-NEXT: 
global_inv scope:SCOPE_SYS 16336; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 16337; GFX12-NEXT: s_wait_alu 0xfffe 16338; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 16339; GFX12-NEXT: s_wait_alu 0xfffe 16340; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 16341; GFX12-NEXT: s_cbranch_execnz .LBB60_1 16342; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 16343; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 16344; GFX12-NEXT: v_mov_b32_e32 v0, v3 16345; GFX12-NEXT: s_wait_alu 0xfffe 16346; GFX12-NEXT: s_setpc_b64 s[30:31] 16347; 16348; GFX940-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16349; GFX940: ; %bb.0: 16350; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16351; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 16352; GFX940-NEXT: s_mov_b64 s[2:3], 0 16353; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16354; GFX940-NEXT: s_movk_i32 s4, 0x7fff 16355; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16356; GFX940-NEXT: s_mov_b32 s5, 0x7060302 16357; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start 16358; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 16359; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16360; GFX940-NEXT: v_mov_b32_e32 v7, v3 16361; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 16362; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 16363; GFX940-NEXT: v_min_f32_e32 v3, v3, v4 16364; GFX940-NEXT: v_min_f32_e32 v5, v5, v2 16365; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 16366; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 16367; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 16368; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 16369; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 16370; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 16371; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 16372; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 16373; GFX940-NEXT: s_nop 0 16374; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 16375; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] 16376; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 16377; GFX940-NEXT: buffer_wbl2 sc0 sc1 16378; 
GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 sc0 sc1 16379; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16380; GFX940-NEXT: buffer_inv sc0 sc1 16381; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 16382; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 16383; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 16384; GFX940-NEXT: s_cbranch_execnz .LBB60_1 16385; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 16386; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 16387; GFX940-NEXT: v_mov_b32_e32 v0, v3 16388; GFX940-NEXT: s_setpc_b64 s[30:31] 16389; 16390; GFX11-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16391; GFX11: ; %bb.0: 16392; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16393; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 16394; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16395; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16396; GFX11-NEXT: s_mov_b32 s1, 0 16397; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 16398; GFX11-NEXT: .p2align 6 16399; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start 16400; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 16401; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16402; GFX11-NEXT: v_mov_b32_e32 v6, v3 16403; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16404; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 16405; GFX11-NEXT: v_min_f32_e32 v5, v5, v2 16406; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 16407; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 16408; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 16409; GFX11-NEXT: v_min_f32_e32 v3, v3, v4 16410; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 16411; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 16412; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 16413; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 16414; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 16415; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 16416; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3 16417; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 16418; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 16419; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff 16420; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 16421; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 16422; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 16423; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 16424; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc 16425; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16426; GFX11-NEXT: buffer_gl1_inv 16427; GFX11-NEXT: buffer_gl0_inv 16428; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 16429; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 16430; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 16431; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 16432; GFX11-NEXT: s_cbranch_execnz .LBB60_1 16433; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 16434; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 16435; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 16436; GFX11-NEXT: v_mov_b32_e32 v0, v3 16437; GFX11-NEXT: s_setpc_b64 s[30:31] 16438; 16439; GFX10-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16440; GFX10: ; %bb.0: 16441; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16442; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0 16443; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo 16444; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 16445; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16446; GFX10-NEXT: s_mov_b32 s5, 0 16447; GFX10-NEXT: flat_load_dword v0, v[3:4] 16448; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start 16449; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 16450; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16451; GFX10-NEXT: v_mov_b32_e32 v6, v0 16452; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6 16453; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 16454; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 16455; GFX10-NEXT: v_min_f32_e32 v5, v5, v2 16456; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1 
16457; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1 16458; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 16459; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5 16460; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 16461; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff 16462; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff 16463; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0 16464; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo 16465; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4 16466; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302 16467; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 16468; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 16469; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16470; GFX10-NEXT: buffer_gl1_inv 16471; GFX10-NEXT: buffer_gl0_inv 16472; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 16473; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 16474; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 16475; GFX10-NEXT: s_cbranch_execnz .LBB60_1 16476; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 16477; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 16478; GFX10-NEXT: s_setpc_b64 s[30:31] 16479; 16480; GFX90A-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16481; GFX90A: ; %bb.0: 16482; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16483; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 16484; GFX90A-NEXT: s_mov_b64 s[6:7], 0 16485; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16486; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 16487; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16488; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 16489; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start 16490; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 16491; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16492; GFX90A-NEXT: v_mov_b32_e32 v7, v3 16493; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7 16494; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 16495; GFX90A-NEXT: v_min_f32_e32 v3, v3, v4 16496; GFX90A-NEXT: v_min_f32_e32 v5, v5, v2 16497; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 16498; GFX90A-NEXT: v_bfe_u32 v9, v5, 
16, 1 16499; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3 16500; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5 16501; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 16502; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8 16503; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 16504; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 16505; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5] 16506; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 16507; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9 16508; GFX90A-NEXT: buffer_wbl2 16509; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc 16510; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16511; GFX90A-NEXT: buffer_invl2 16512; GFX90A-NEXT: buffer_wbinvl1 16513; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 16514; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 16515; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 16516; GFX90A-NEXT: s_cbranch_execnz .LBB60_1 16517; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 16518; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 16519; GFX90A-NEXT: v_mov_b32_e32 v0, v3 16520; GFX90A-NEXT: s_setpc_b64 s[30:31] 16521; 16522; GFX908-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16523; GFX908: ; %bb.0: 16524; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16525; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 16526; GFX908-NEXT: s_mov_b64 s[6:7], 0 16527; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16528; GFX908-NEXT: s_movk_i32 s8, 0x7fff 16529; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16530; GFX908-NEXT: s_mov_b32 s9, 0x7060302 16531; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start 16532; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 16533; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16534; GFX908-NEXT: v_mov_b32_e32 v6, v3 16535; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6 16536; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 16537; GFX908-NEXT: v_min_f32_e32 v3, v3, v4 16538; GFX908-NEXT: v_min_f32_e32 v5, v5, v2 16539; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1 16540; GFX908-NEXT: v_bfe_u32 v9, 
v5, 16, 1 16541; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3 16542; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 16543; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8 16544; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8 16545; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 16546; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 16547; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5] 16548; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 16549; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9 16550; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc 16551; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16552; GFX908-NEXT: buffer_wbinvl1 16553; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 16554; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 16555; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 16556; GFX908-NEXT: s_cbranch_execnz .LBB60_1 16557; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 16558; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 16559; GFX908-NEXT: v_mov_b32_e32 v0, v3 16560; GFX908-NEXT: s_setpc_b64 s[30:31] 16561; 16562; GFX8-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16563; GFX8: ; %bb.0: 16564; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16565; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0 16566; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 16567; GFX8-NEXT: flat_load_dword v0, v[3:4] 16568; GFX8-NEXT: s_mov_b64 s[6:7], 0 16569; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 16570; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 16571; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start 16572; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 16573; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16574; GFX8-NEXT: v_mov_b32_e32 v6, v0 16575; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6 16576; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 16577; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 16578; GFX8-NEXT: v_min_f32_e32 v5, v5, v2 16579; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 16580; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 16581; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 16582; GFX8-NEXT: v_add_u32_e32 
v9, vcc, v9, v5 16583; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 16584; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 16585; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 16586; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 16587; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 16588; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 16589; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc 16590; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] 16591; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 16592; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16 16593; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc 16594; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16595; GFX8-NEXT: buffer_wbinvl1 16596; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 16597; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 16598; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 16599; GFX8-NEXT: s_cbranch_execnz .LBB60_1 16600; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 16601; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 16602; GFX8-NEXT: s_setpc_b64 s[30:31] 16603; 16604; GFX7-LABEL: flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16605; GFX7: ; %bb.0: 16606; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16607; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0 16608; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 16609; GFX7-NEXT: flat_load_dword v0, v[4:5] 16610; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v3 16611; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v2 16612; GFX7-NEXT: s_mov_b64 s[4:5], 0 16613; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 16614; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 16615; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16616; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 16617; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 16618; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start 16619; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 16620; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 16621; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 16622; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 16623; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 16624; GFX7-NEXT: 
v_lshrrev_b32_e32 v1, 16, v1 16625; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 16626; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 16627; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 16628; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6 16629; GFX7-NEXT: v_alignbit_b32 v0, v0, v7, 16 16630; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc 16631; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16632; GFX7-NEXT: buffer_wbinvl1 16633; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 16634; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 16635; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16636; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 16637; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 16638; GFX7-NEXT: s_cbranch_execnz .LBB60_1 16639; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 16640; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 16641; GFX7-NEXT: s_setpc_b64 s[30:31] 16642 %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 16643 %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 16644 ret <2 x bfloat> %result 16645} 16646 16647define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 { 16648; GFX12-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16649; GFX12: ; %bb.0: 16650; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 16651; GFX12-NEXT: s_wait_expcnt 0x0 16652; GFX12-NEXT: s_wait_samplecnt 0x0 16653; GFX12-NEXT: s_wait_bvhcnt 0x0 16654; GFX12-NEXT: s_wait_kmcnt 0x0 16655; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044 16656; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16657; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 16658; GFX12-NEXT: s_mov_b32 s1, 0 16659; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start 16660; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 16661; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 16662; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 16663; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 16664; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) 16665; GFX12-NEXT: v_min_num_f32_e32 v2, v2, v4 16666; GFX12-NEXT: v_min_num_f32_e32 v6, v6, v5 16667; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 16668; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 16669; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 16670; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 16671; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 16672; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 16673; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 16674; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 16675; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 16676; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 16677; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 16678; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 16679; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 16680; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 16681; GFX12-NEXT: global_wb scope:SCOPE_SYS 16682; GFX12-NEXT: s_wait_storecnt 0x0 16683; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS 16684; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 16685; GFX12-NEXT: global_inv scope:SCOPE_SYS 16686; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 16687; GFX12-NEXT: v_mov_b32_e32 v3, v2 16688; GFX12-NEXT: s_wait_alu 0xfffe 16689; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 16690; GFX12-NEXT: s_wait_alu 0xfffe 16691; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 16692; GFX12-NEXT: s_cbranch_execnz .LBB61_1 16693; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 16694; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 16695; GFX12-NEXT: s_wait_alu 0xfffe 16696; GFX12-NEXT: s_setpc_b64 s[30:31] 16697; 16698; GFX940-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16699; GFX940: ; %bb.0: 16700; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16701; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2044 16702; GFX940-NEXT: s_mov_b64 s[2:3], 0 16703; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 
16704; GFX940-NEXT: s_movk_i32 s4, 0x7fff 16705; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 16706; GFX940-NEXT: s_mov_b32 s5, 0x7060302 16707; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start 16708; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 16709; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16710; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 16711; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 16712; GFX940-NEXT: v_min_f32_e32 v2, v2, v4 16713; GFX940-NEXT: v_min_f32_e32 v6, v6, v5 16714; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 16715; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 16716; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 16717; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 16718; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 16719; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 16720; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 16721; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 16722; GFX940-NEXT: s_nop 0 16723; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 16724; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] 16725; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 16726; GFX940-NEXT: buffer_wbl2 sc0 sc1 16727; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 sc0 sc1 16728; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16729; GFX940-NEXT: buffer_inv sc0 sc1 16730; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 16731; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 16732; GFX940-NEXT: v_mov_b32_e32 v3, v2 16733; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] 16734; GFX940-NEXT: s_cbranch_execnz .LBB61_1 16735; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 16736; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] 16737; GFX940-NEXT: s_setpc_b64 s[30:31] 16738; 16739; GFX11-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16740; GFX11: ; %bb.0: 16741; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16742; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044 16743; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16744; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 16745; GFX11-NEXT: 
s_mov_b32 s1, 0 16746; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 16747; GFX11-NEXT: .p2align 6 16748; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start 16749; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 16750; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16751; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 16752; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 16753; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 16754; GFX11-NEXT: v_min_f32_e32 v2, v2, v4 16755; GFX11-NEXT: v_min_f32_e32 v6, v6, v5 16756; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 16757; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 16758; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 16759; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 16760; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 16761; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 16762; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 16763; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 16764; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2 16765; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 16766; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 16767; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 16768; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 16769; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 16770; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 16771; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc 16772; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16773; GFX11-NEXT: buffer_gl1_inv 16774; GFX11-NEXT: buffer_gl0_inv 16775; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 16776; GFX11-NEXT: v_mov_b32_e32 v3, v2 16777; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 16778; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 16779; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 16780; GFX11-NEXT: s_cbranch_execnz .LBB61_1 16781; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 16782; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 16783; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 16784; GFX11-NEXT: 
s_setpc_b64 s[30:31] 16785; 16786; GFX10-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16787; GFX10: ; %bb.0: 16788; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16789; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0 16790; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 16791; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16792; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 16793; GFX10-NEXT: s_mov_b32 s5, 0 16794; GFX10-NEXT: flat_load_dword v3, v[0:1] 16795; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start 16796; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 16797; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16798; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 16799; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 16800; GFX10-NEXT: v_min_f32_e32 v2, v2, v4 16801; GFX10-NEXT: v_min_f32_e32 v6, v6, v5 16802; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1 16803; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 16804; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 16805; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6 16806; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 16807; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff 16808; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff 16809; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2 16810; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo 16811; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4 16812; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 16813; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 16814; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16815; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16816; GFX10-NEXT: buffer_gl1_inv 16817; GFX10-NEXT: buffer_gl0_inv 16818; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 16819; GFX10-NEXT: v_mov_b32_e32 v3, v2 16820; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 16821; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 16822; GFX10-NEXT: s_cbranch_execnz .LBB61_1 16823; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end 16824; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 16825; GFX10-NEXT: s_setpc_b64 s[30:31] 16826; 16827; 
GFX90A-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16828; GFX90A: ; %bb.0: 16829; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16830; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044 16831; GFX90A-NEXT: s_mov_b64 s[6:7], 0 16832; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16833; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 16834; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 16835; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 16836; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start 16837; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 16838; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16839; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 16840; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 16841; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 16842; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 16843; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 16844; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 16845; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 16846; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 16847; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 16848; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 16849; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 16850; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 16851; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 16852; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 16853; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 16854; GFX90A-NEXT: buffer_wbl2 16855; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 16856; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16857; GFX90A-NEXT: buffer_invl2 16858; GFX90A-NEXT: buffer_wbinvl1 16859; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 16860; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 16861; GFX90A-NEXT: v_mov_b32_e32 v3, v2 16862; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 16863; GFX90A-NEXT: s_cbranch_execnz .LBB61_1 16864; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 16865; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] 16866; GFX90A-NEXT: s_setpc_b64 s[30:31] 16867; 16868; GFX908-LABEL: 
flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16869; GFX908: ; %bb.0: 16870; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16871; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044 16872; GFX908-NEXT: s_mov_b64 s[6:7], 0 16873; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16874; GFX908-NEXT: s_movk_i32 s8, 0x7fff 16875; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 16876; GFX908-NEXT: s_mov_b32 s9, 0x7060302 16877; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start 16878; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 16879; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16880; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3 16881; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 16882; GFX908-NEXT: v_min_f32_e32 v2, v2, v4 16883; GFX908-NEXT: v_min_f32_e32 v6, v6, v5 16884; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1 16885; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1 16886; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2 16887; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6 16888; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8 16889; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8 16890; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 16891; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 16892; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 16893; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 16894; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9 16895; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc 16896; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16897; GFX908-NEXT: buffer_wbinvl1 16898; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 16899; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 16900; GFX908-NEXT: v_mov_b32_e32 v3, v2 16901; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] 16902; GFX908-NEXT: s_cbranch_execnz .LBB61_1 16903; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 16904; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] 16905; GFX908-NEXT: s_setpc_b64 s[30:31] 16906; 16907; GFX8-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16908; GFX8: ; 
%bb.0: 16909; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16910; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0 16911; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 16912; GFX8-NEXT: flat_load_dword v3, v[0:1] 16913; GFX8-NEXT: s_mov_b64 s[6:7], 0 16914; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 16915; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 16916; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start 16917; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 16918; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16919; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 16920; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 16921; GFX8-NEXT: v_min_f32_e32 v2, v2, v4 16922; GFX8-NEXT: v_min_f32_e32 v6, v6, v5 16923; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 16924; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1 16925; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 16926; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 16927; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 16928; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 16929; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6 16930; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 16931; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2 16932; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 16933; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc 16934; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] 16935; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 16936; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16 16937; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 16938; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16939; GFX8-NEXT: buffer_wbinvl1 16940; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 16941; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] 16942; GFX8-NEXT: v_mov_b32_e32 v3, v2 16943; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] 16944; GFX8-NEXT: s_cbranch_execnz .LBB61_1 16945; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 16946; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] 16947; GFX8-NEXT: s_setpc_b64 s[30:31] 16948; 16949; GFX7-LABEL: flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: 16950; GFX7: ; %bb.0: 16951; 
GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 16952; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0 16953; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 16954; GFX7-NEXT: flat_load_dword v4, v[0:1] 16955; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 16956; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2 16957; GFX7-NEXT: s_mov_b64 s[4:5], 0 16958; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 16959; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 16960; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16961; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 16962; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 16963; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start 16964; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 16965; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 16966; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 16967; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 16968; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 16969; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 16970; GFX7-NEXT: v_min_f32_e32 v6, v6, v2 16971; GFX7-NEXT: v_min_f32_e32 v7, v7, v3 16972; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 16973; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 16974; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 16975; GFX7-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc 16976; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 16977; GFX7-NEXT: buffer_wbinvl1 16978; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 16979; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 16980; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 16981; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 16982; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] 16983; GFX7-NEXT: s_cbranch_execnz .LBB61_1 16984; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end 16985; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] 16986; GFX7-NEXT: s_setpc_b64 s[30:31] 16987 %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 16988 %unused = atomicrmw fmin ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 16989 ret void 16990} 16991 16992attributes #0 = { nounwind } 16993attributes #1 = { nounwind 
"denormal-fp-math-f32"="preserve-sign,preserve-sign" } 16994 16995!0 = !{} 16996