; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s

; TODO: Delete this and add run lines to use *-atomicrmw-fmax.ll tests

; Overview: every function below performs a single `atomicrmw fmax ... seq_cst`
; and is followed by the assembly expected from llc with -global-isel for each
; of the eight -mcpu values listed in the run lines above (gfx1200, gfx940,
; gfx1100, gfx1010, gfx90a, gfx908, tonga, hawaii).
; The per-target assertion comments are generated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.

; Local memory, f32, result used: fmax on an addrspace(3) pointer whose old
; value is returned. Every target is expected to emit one returning DS max
; instruction (ds_max_rtn_f32, or ds_max_num_rtn_f32 for gfx1200) with no loop.
; The tonga and hawaii expectations additionally set m0 to -1 first.
define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-LABEL: local_atomic_fmax_ret_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f32:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_ret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_ret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmax_ret_f32:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_ret_f32:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmax_ret_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmax_ret_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_max_rtn_f32 v0, v0, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw fmax ptr addrspace(3) %ptr, float %val seq_cst
  ret float %result
}

; Local memory, f32, result unused: same operation as above but the old value
; is discarded, so the expectations use the non-returning form (ds_max_f32, or
; ds_max_num_f32 for gfx1200).
define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr, float %val) {
; GFX12-LABEL: local_atomic_fmax_noret_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_f32 v0, v1
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f32:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_max_f32 v0, v1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_noret_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_max_f32 v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_noret_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_max_f32 v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmax_noret_f32:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ds_max_f32 v0, v1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_noret_f32:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_max_f32 v0, v1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmax_noret_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_max_f32 v0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmax_noret_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_max_f32 v0, v1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %unused = atomicrmw fmax ptr addrspace(3) %ptr, double %val seq_cst
  ret void
}

; Local memory, f64, result used: every target is expected to emit one
; returning 64-bit DS max (ds_max_rtn_f64, or ds_max_num_rtn_f64 for gfx1200).
; The gfx940 and gfx90a expectations first copy the value from v[1:2] into
; v[4:5] before issuing the DS instruction.
define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-LABEL: local_atomic_fmax_ret_f64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_ret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v4, v1
; GFX940-NEXT: v_mov_b32_e32 v5, v2
; GFX940-NEXT: ds_max_rtn_f64 v[0:1], v0, v[4:5]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_ret_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_ret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmax_ret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v4, v1
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: ds_max_rtn_f64 v[0:1], v0, v[4:5]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_ret_f64:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2]
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmax_ret_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmax_ret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_max_rtn_f64 v[0:1], v0, v[1:2]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw fmax ptr addrspace(3) %ptr, double %val seq_cst
  ret double %result
}

; Local memory, f64, result unused: non-returning counterpart of the test
; above (ds_max_f64, or ds_max_num_f64 for gfx1200). The gfx940 and gfx90a
; expectations again copy the value into v[4:5] first.
define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr, double %val) {
; GFX12-LABEL: local_atomic_fmax_noret_f64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: ds_max_num_f64 v0, v[1:2]
; GFX12-NEXT: s_wait_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: local_atomic_fmax_noret_f64:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v4, v1
; GFX940-NEXT: v_mov_b32_e32 v5, v2
; GFX940-NEXT: ds_max_f64 v0, v[4:5]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: local_atomic_fmax_noret_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_max_f64 v0, v[1:2]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: local_atomic_fmax_noret_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_max_f64 v0, v[1:2]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: local_atomic_fmax_noret_f64:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v4, v1
; GFX90A-NEXT: v_mov_b32_e32 v5, v2
; GFX90A-NEXT: ds_max_f64 v0, v[4:5]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: local_atomic_fmax_noret_f64:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: ds_max_f64 v0, v[1:2]
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: local_atomic_fmax_noret_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_max_f64 v0, v[1:2]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: local_atomic_fmax_noret_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: ds_max_f64 v0, v[1:2]
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %unused = atomicrmw fmax ptr addrspace(3) %ptr, double %val seq_cst
  ret void
}

; Global memory, f32, agent scope, result used; the atomicrmw carries
; !amdgpu.no.fine.grained.memory. The gfx1200, gfx1100, gfx1010 and hawaii
; expectations contain a single returning atomic max instruction, while the
; gfx940, gfx90a, gfx908 and tonga expectations contain a compare-and-swap
; loop (block %atomicrmw.start) built around a cmpswap instruction.
define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) {
; GFX12-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: v_max_f32_e32 v2, v2, v2
; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: v_max_f32_e32 v3, v5, v5
; GFX940-NEXT: v_max_f32_e32 v4, v3, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB4_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_fmax v0, v[0:1], v2, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5
; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2
; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB4_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: v_max_f32_e32 v3, v4, v4
; GFX908-NEXT: v_max_f32_e32 v3, v3, v2
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB4_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4
; GFX8-NEXT: v_max_f32_e32 v3, v3, v2
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB4_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret float %result
}

; Global memory, f32, agent scope, result unused. Same split as the returning
; variant: a single non-returning atomic max in the gfx1200, gfx1100, gfx1010
; and hawaii expectations, and a compare-and-swap loop in the gfx940, gfx90a,
; gfx908 and tonga expectations.
define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) {
; GFX12-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: v_max_f32_e32 v4, v2, v2
; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_max_f32_e32 v2, v3, v3
; GFX940-NEXT: v_max_f32_e32 v2, v2, v4
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB5_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_fmax v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4
; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f32_e32 v2, v3, v3
; GFX908-NEXT: v_max_f32_e32 v2, v2, v4
; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB5_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2
; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3
; GFX8-NEXT: v_max_f32_e32 v2, v2, v4
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB5_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_atomic_fmax v2, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %unused = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret void
}

; Global memory, f64, agent scope, result used. The set of targets with a
; single atomic instruction differs from the f32 case: the gfx940, gfx1010,
; gfx90a and hawaii expectations contain one returning 64-bit atomic max,
; while the gfx1200, gfx1100, gfx908 and tonga expectations contain a
; 64-bit compare-and-swap loop (block %atomicrmw.start).
define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) {
; GFX12-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB6_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB6_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v0, v4
; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7]
; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret double %result
}

; Global memory, f64, agent scope, result unused. Same split as the returning
; f64 variant: one non-returning 64-bit atomic max in the gfx940, gfx1010,
; gfx90a and hawaii expectations, and a 64-bit compare-and-swap loop in the
; gfx1200, gfx1100, gfx908 and tonga expectations.
define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) {
; GFX12-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB7_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[2:3], off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[2:3], off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX908-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX908-NEXT: v_mov_b32_e32 v5, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v4, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7]
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX8-NEXT: v_mov_b32_e32 v5, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: buffer_atomic_fmax_x2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_setpc_b64 s[30:31]
  %unused = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret void
}

define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) {
; GFX12-LABEL:
flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: 904; GFX12: ; %bb.0: 905; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 906; GFX12-NEXT: s_wait_expcnt 0x0 907; GFX12-NEXT: s_wait_samplecnt 0x0 908; GFX12-NEXT: s_wait_bvhcnt 0x0 909; GFX12-NEXT: s_wait_kmcnt 0x0 910; GFX12-NEXT: s_wait_storecnt 0x0 911; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV 912; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 913; GFX12-NEXT: global_inv scope:SCOPE_DEV 914; GFX12-NEXT: s_setpc_b64 s[30:31] 915; 916; GFX940-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: 917; GFX940: ; %bb.0: 918; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 919; GFX940-NEXT: flat_load_dword v3, v[0:1] 920; GFX940-NEXT: s_mov_b64 s[0:1], 0 921; GFX940-NEXT: v_max_f32_e32 v2, v2, v2 922; GFX940-NEXT: .LBB8_1: ; %atomicrmw.start 923; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 924; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 925; GFX940-NEXT: v_mov_b32_e32 v5, v3 926; GFX940-NEXT: v_max_f32_e32 v3, v5, v5 927; GFX940-NEXT: v_max_f32_e32 v4, v3, v2 928; GFX940-NEXT: buffer_wbl2 sc1 929; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 930; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 931; GFX940-NEXT: buffer_inv sc1 932; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 933; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 934; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 935; GFX940-NEXT: s_cbranch_execnz .LBB8_1 936; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 937; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 938; GFX940-NEXT: v_mov_b32_e32 v0, v3 939; GFX940-NEXT: s_setpc_b64 s[30:31] 940; 941; GFX11-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: 942; GFX11: ; %bb.0: 943; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 944; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 945; GFX11-NEXT: flat_atomic_max_f32 v0, v[0:1], v2 glc 946; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 947; GFX11-NEXT: buffer_gl1_inv 948; GFX11-NEXT: buffer_gl0_inv 949; 
GFX11-NEXT: s_setpc_b64 s[30:31] 950; 951; GFX10-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: 952; GFX10: ; %bb.0: 953; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 954; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 955; GFX10-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc 956; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 957; GFX10-NEXT: buffer_gl1_inv 958; GFX10-NEXT: buffer_gl0_inv 959; GFX10-NEXT: s_setpc_b64 s[30:31] 960; 961; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: 962; GFX90A: ; %bb.0: 963; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 964; GFX90A-NEXT: flat_load_dword v3, v[0:1] 965; GFX90A-NEXT: s_mov_b64 s[4:5], 0 966; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 967; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start 968; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 969; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 970; GFX90A-NEXT: v_mov_b32_e32 v5, v3 971; GFX90A-NEXT: v_max_f32_e32 v3, v5, v5 972; GFX90A-NEXT: v_max_f32_e32 v4, v3, v2 973; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc 974; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 975; GFX90A-NEXT: buffer_wbinvl1 976; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 977; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 978; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 979; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 980; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 981; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 982; GFX90A-NEXT: v_mov_b32_e32 v0, v3 983; GFX90A-NEXT: s_setpc_b64 s[30:31] 984; 985; GFX908-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: 986; GFX908: ; %bb.0: 987; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 988; GFX908-NEXT: flat_load_dword v3, v[0:1] 989; GFX908-NEXT: s_mov_b64 s[4:5], 0 990; GFX908-NEXT: v_max_f32_e32 v2, v2, v2 991; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start 992; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 993; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 994; GFX908-NEXT: v_mov_b32_e32 v4, v3 995; 
GFX908-NEXT: v_max_f32_e32 v3, v4, v4 996; GFX908-NEXT: v_max_f32_e32 v3, v3, v2 997; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 998; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 999; GFX908-NEXT: buffer_wbinvl1 1000; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1001; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1002; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1003; GFX908-NEXT: s_cbranch_execnz .LBB8_1 1004; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1005; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1006; GFX908-NEXT: v_mov_b32_e32 v0, v3 1007; GFX908-NEXT: s_setpc_b64 s[30:31] 1008; 1009; GFX8-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: 1010; GFX8: ; %bb.0: 1011; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1012; GFX8-NEXT: flat_load_dword v3, v[0:1] 1013; GFX8-NEXT: s_mov_b64 s[4:5], 0 1014; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 1015; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start 1016; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1017; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1018; GFX8-NEXT: v_mov_b32_e32 v4, v3 1019; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v4 1020; GFX8-NEXT: v_max_f32_e32 v3, v3, v2 1021; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc 1022; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1023; GFX8-NEXT: buffer_wbinvl1 1024; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 1025; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1026; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1027; GFX8-NEXT: s_cbranch_execnz .LBB8_1 1028; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1029; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1030; GFX8-NEXT: v_mov_b32_e32 v0, v3 1031; GFX8-NEXT: s_setpc_b64 s[30:31] 1032; 1033; GFX7-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: 1034; GFX7: ; %bb.0: 1035; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1036; GFX7-NEXT: flat_atomic_fmax v0, v[0:1], v2 glc 1037; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1038; GFX7-NEXT: buffer_wbinvl1 1039; GFX7-NEXT: s_setpc_b64 s[30:31] 1040 %result = atomicrmw fmax ptr 
%ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 1041 ret float %result 1042} 1043 1044define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) { 1045; GFX12-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: 1046; GFX12: ; %bb.0: 1047; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1048; GFX12-NEXT: s_wait_expcnt 0x0 1049; GFX12-NEXT: s_wait_samplecnt 0x0 1050; GFX12-NEXT: s_wait_bvhcnt 0x0 1051; GFX12-NEXT: s_wait_kmcnt 0x0 1052; GFX12-NEXT: s_wait_storecnt 0x0 1053; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV 1054; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 1055; GFX12-NEXT: global_inv scope:SCOPE_DEV 1056; GFX12-NEXT: s_setpc_b64 s[30:31] 1057; 1058; GFX940-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: 1059; GFX940: ; %bb.0: 1060; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1061; GFX940-NEXT: flat_load_dword v3, v[0:1] 1062; GFX940-NEXT: s_mov_b64 s[0:1], 0 1063; GFX940-NEXT: v_max_f32_e32 v4, v2, v2 1064; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start 1065; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 1066; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1067; GFX940-NEXT: v_max_f32_e32 v2, v3, v3 1068; GFX940-NEXT: v_max_f32_e32 v2, v2, v4 1069; GFX940-NEXT: buffer_wbl2 sc1 1070; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 1071; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1072; GFX940-NEXT: buffer_inv sc1 1073; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 1074; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1075; GFX940-NEXT: v_mov_b32_e32 v3, v2 1076; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] 1077; GFX940-NEXT: s_cbranch_execnz .LBB9_1 1078; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 1079; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] 1080; GFX940-NEXT: s_setpc_b64 s[30:31] 1081; 1082; GFX11-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: 1083; GFX11: ; %bb.0: 1084; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 1085; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1086; GFX11-NEXT: flat_atomic_max_f32 v[0:1], v2 1087; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1088; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1089; GFX11-NEXT: buffer_gl1_inv 1090; GFX11-NEXT: buffer_gl0_inv 1091; GFX11-NEXT: s_setpc_b64 s[30:31] 1092; 1093; GFX10-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: 1094; GFX10: ; %bb.0: 1095; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1096; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1097; GFX10-NEXT: flat_atomic_fmax v[0:1], v2 1098; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1099; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1100; GFX10-NEXT: buffer_gl1_inv 1101; GFX10-NEXT: buffer_gl0_inv 1102; GFX10-NEXT: s_setpc_b64 s[30:31] 1103; 1104; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: 1105; GFX90A: ; %bb.0: 1106; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1107; GFX90A-NEXT: flat_load_dword v3, v[0:1] 1108; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1109; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 1110; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start 1111; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1112; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1113; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 1114; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 1115; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 1116; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1117; GFX90A-NEXT: buffer_wbinvl1 1118; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 1119; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1120; GFX90A-NEXT: v_mov_b32_e32 v3, v2 1121; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1122; GFX90A-NEXT: s_cbranch_execnz .LBB9_1 1123; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1124; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1125; GFX90A-NEXT: s_setpc_b64 s[30:31] 1126; 1127; GFX908-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: 1128; GFX908: ; %bb.0: 1129; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1130; GFX908-NEXT: flat_load_dword v3, 
v[0:1] 1131; GFX908-NEXT: s_mov_b64 s[4:5], 0 1132; GFX908-NEXT: v_max_f32_e32 v4, v2, v2 1133; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start 1134; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1135; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1136; GFX908-NEXT: v_max_f32_e32 v2, v3, v3 1137; GFX908-NEXT: v_max_f32_e32 v2, v2, v4 1138; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 1139; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1140; GFX908-NEXT: buffer_wbinvl1 1141; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 1142; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1143; GFX908-NEXT: v_mov_b32_e32 v3, v2 1144; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1145; GFX908-NEXT: s_cbranch_execnz .LBB9_1 1146; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1147; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1148; GFX908-NEXT: s_setpc_b64 s[30:31] 1149; 1150; GFX8-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: 1151; GFX8: ; %bb.0: 1152; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1153; GFX8-NEXT: flat_load_dword v3, v[0:1] 1154; GFX8-NEXT: s_mov_b64 s[4:5], 0 1155; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2 1156; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start 1157; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1158; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1159; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3 1160; GFX8-NEXT: v_max_f32_e32 v2, v2, v4 1161; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc 1162; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1163; GFX8-NEXT: buffer_wbinvl1 1164; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 1165; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1166; GFX8-NEXT: v_mov_b32_e32 v3, v2 1167; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1168; GFX8-NEXT: s_cbranch_execnz .LBB9_1 1169; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1170; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1171; GFX8-NEXT: s_setpc_b64 s[30:31] 1172; 1173; GFX7-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: 1174; GFX7: ; %bb.0: 1175; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 1176; GFX7-NEXT: flat_atomic_fmax v[0:1], v2 1177; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1178; GFX7-NEXT: buffer_wbinvl1 1179; GFX7-NEXT: s_setpc_b64 s[30:31] 1180 %unused = atomicrmw fmax ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 1181 ret void 1182} 1183 1184define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) { 1185; GFX12-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: 1186; GFX12: ; %bb.0: 1187; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1188; GFX12-NEXT: s_wait_expcnt 0x0 1189; GFX12-NEXT: s_wait_samplecnt 0x0 1190; GFX12-NEXT: s_wait_bvhcnt 0x0 1191; GFX12-NEXT: s_wait_kmcnt 0x0 1192; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] 1193; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] 1194; GFX12-NEXT: s_mov_b32 s0, 0 1195; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start 1196; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1197; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1198; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 1199; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1200; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7] 1201; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3] 1202; GFX12-NEXT: s_wait_storecnt 0x0 1203; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 1204; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1205; GFX12-NEXT: global_inv scope:SCOPE_DEV 1206; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 1207; GFX12-NEXT: s_wait_alu 0xfffe 1208; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 1209; GFX12-NEXT: s_wait_alu 0xfffe 1210; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 1211; GFX12-NEXT: s_cbranch_execnz .LBB10_1 1212; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1213; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 1214; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 1215; GFX12-NEXT: s_wait_alu 0xfffe 1216; GFX12-NEXT: s_setpc_b64 s[30:31] 
1217; 1218; GFX940-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: 1219; GFX940: ; %bb.0: 1220; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1221; GFX940-NEXT: buffer_wbl2 sc1 1222; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 1223; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1224; GFX940-NEXT: buffer_inv sc1 1225; GFX940-NEXT: s_setpc_b64 s[30:31] 1226; 1227; GFX11-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: 1228; GFX11: ; %bb.0: 1229; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1230; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] 1231; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 1232; GFX11-NEXT: s_mov_b32 s0, 0 1233; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start 1234; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1235; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1236; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4 1237; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1238; GFX11-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] 1239; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] 1240; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1241; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc 1242; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1243; GFX11-NEXT: buffer_gl1_inv 1244; GFX11-NEXT: buffer_gl0_inv 1245; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7] 1246; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 1247; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1248; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 1249; GFX11-NEXT: s_cbranch_execnz .LBB10_1 1250; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1251; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 1252; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 1253; GFX11-NEXT: s_setpc_b64 s[30:31] 1254; 1255; GFX10-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: 1256; GFX10: ; %bb.0: 1257; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1258; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1259; 
GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc 1260; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1261; GFX10-NEXT: buffer_gl1_inv 1262; GFX10-NEXT: buffer_gl0_inv 1263; GFX10-NEXT: s_setpc_b64 s[30:31] 1264; 1265; GFX90A-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: 1266; GFX90A: ; %bb.0: 1267; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1268; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc 1269; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1270; GFX90A-NEXT: buffer_wbinvl1 1271; GFX90A-NEXT: s_setpc_b64 s[30:31] 1272; 1273; GFX908-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: 1274; GFX908: ; %bb.0: 1275; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1276; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 1277; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 1278; GFX908-NEXT: s_mov_b64 s[4:5], 0 1279; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start 1280; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1281; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1282; GFX908-NEXT: v_mov_b32_e32 v7, v5 1283; GFX908-NEXT: v_mov_b32_e32 v6, v4 1284; GFX908-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] 1285; GFX908-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] 1286; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 1287; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1288; GFX908-NEXT: buffer_wbinvl1 1289; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 1290; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1291; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1292; GFX908-NEXT: s_cbranch_execnz .LBB10_1 1293; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1294; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1295; GFX908-NEXT: v_mov_b32_e32 v0, v4 1296; GFX908-NEXT: v_mov_b32_e32 v1, v5 1297; GFX908-NEXT: s_setpc_b64 s[30:31] 1298; 1299; GFX8-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: 1300; GFX8: ; %bb.0: 1301; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1302; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 
1303; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 1304; GFX8-NEXT: flat_load_dword v4, v[0:1] 1305; GFX8-NEXT: flat_load_dword v5, v[5:6] 1306; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] 1307; GFX8-NEXT: s_mov_b64 s[4:5], 0 1308; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start 1309; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1310; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1311; GFX8-NEXT: v_mov_b32_e32 v7, v5 1312; GFX8-NEXT: v_mov_b32_e32 v6, v4 1313; GFX8-NEXT: v_max_f64 v[4:5], v[6:7], v[6:7] 1314; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[2:3] 1315; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc 1316; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1317; GFX8-NEXT: buffer_wbinvl1 1318; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] 1319; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1320; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1321; GFX8-NEXT: s_cbranch_execnz .LBB10_1 1322; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1323; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1324; GFX8-NEXT: v_mov_b32_e32 v0, v4 1325; GFX8-NEXT: v_mov_b32_e32 v1, v5 1326; GFX8-NEXT: s_setpc_b64 s[30:31] 1327; 1328; GFX7-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: 1329; GFX7: ; %bb.0: 1330; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1331; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[0:1], v[2:3] glc 1332; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1333; GFX7-NEXT: buffer_wbinvl1 1334; GFX7-NEXT: s_setpc_b64 s[30:31] 1335 %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 1336 ret double %result 1337} 1338 1339define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) { 1340; GFX12-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: 1341; GFX12: ; %bb.0: 1342; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1343; GFX12-NEXT: s_wait_expcnt 0x0 1344; GFX12-NEXT: s_wait_samplecnt 0x0 1345; GFX12-NEXT: s_wait_bvhcnt 0x0 1346; GFX12-NEXT: 
s_wait_kmcnt 0x0 1347; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] 1348; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3] 1349; GFX12-NEXT: s_mov_b32 s0, 0 1350; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start 1351; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1352; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1353; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] 1354; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1355; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7] 1356; GFX12-NEXT: s_wait_storecnt 0x0 1357; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV 1358; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1359; GFX12-NEXT: global_inv scope:SCOPE_DEV 1360; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] 1361; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 1362; GFX12-NEXT: s_wait_alu 0xfffe 1363; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 1364; GFX12-NEXT: s_wait_alu 0xfffe 1365; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 1366; GFX12-NEXT: s_cbranch_execnz .LBB11_1 1367; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1368; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 1369; GFX12-NEXT: s_wait_alu 0xfffe 1370; GFX12-NEXT: s_setpc_b64 s[30:31] 1371; 1372; GFX940-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: 1373; GFX940: ; %bb.0: 1374; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1375; GFX940-NEXT: buffer_wbl2 sc1 1376; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] 1377; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1378; GFX940-NEXT: buffer_inv sc1 1379; GFX940-NEXT: s_setpc_b64 s[30:31] 1380; 1381; GFX11-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: 1382; GFX11: ; %bb.0: 1383; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1384; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] 1385; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] 1386; GFX11-NEXT: s_mov_b32 s0, 0 1387; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start 1388; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1389; 
GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1390; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] 1391; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1392; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] 1393; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1394; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] glc 1395; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1396; GFX11-NEXT: buffer_gl1_inv 1397; GFX11-NEXT: buffer_gl0_inv 1398; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] 1399; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 1400; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 1401; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1402; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 1403; GFX11-NEXT: s_cbranch_execnz .LBB11_1 1404; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1405; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 1406; GFX11-NEXT: s_setpc_b64 s[30:31] 1407; 1408; GFX10-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: 1409; GFX10: ; %bb.0: 1410; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1411; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1412; GFX10-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3] 1413; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1414; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1415; GFX10-NEXT: buffer_gl1_inv 1416; GFX10-NEXT: buffer_gl0_inv 1417; GFX10-NEXT: s_setpc_b64 s[30:31] 1418; 1419; GFX90A-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: 1420; GFX90A: ; %bb.0: 1421; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1422; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] 1423; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1424; GFX90A-NEXT: buffer_wbinvl1 1425; GFX90A-NEXT: s_setpc_b64 s[30:31] 1426; 1427; GFX908-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: 1428; GFX908: ; %bb.0: 1429; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1430; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 1431; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] 1432; GFX908-NEXT: s_mov_b64 s[4:5], 0 1433; 
GFX908-NEXT: .LBB11_1: ; %atomicrmw.start 1434; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1435; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1436; GFX908-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] 1437; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] 1438; GFX908-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc 1439; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1440; GFX908-NEXT: buffer_wbinvl1 1441; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] 1442; GFX908-NEXT: v_mov_b32_e32 v5, v3 1443; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1444; GFX908-NEXT: v_mov_b32_e32 v4, v2 1445; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1446; GFX908-NEXT: s_cbranch_execnz .LBB11_1 1447; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1448; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1449; GFX908-NEXT: s_setpc_b64 s[30:31] 1450; 1451; GFX8-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: 1452; GFX8: ; %bb.0: 1453; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1454; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 1455; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 1456; GFX8-NEXT: flat_load_dword v4, v[0:1] 1457; GFX8-NEXT: flat_load_dword v5, v[5:6] 1458; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3] 1459; GFX8-NEXT: s_mov_b64 s[4:5], 0 1460; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start 1461; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1462; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1463; GFX8-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] 1464; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] 1465; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc 1466; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1467; GFX8-NEXT: buffer_wbinvl1 1468; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] 1469; GFX8-NEXT: v_mov_b32_e32 v5, v3 1470; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1471; GFX8-NEXT: v_mov_b32_e32 v4, v2 1472; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1473; GFX8-NEXT: s_cbranch_execnz .LBB11_1 1474; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1475; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 
1476; GFX8-NEXT: s_setpc_b64 s[30:31] 1477; 1478; GFX7-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: 1479; GFX7: ; %bb.0: 1480; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1481; GFX7-NEXT: flat_atomic_fmax_x2 v[0:1], v[2:3] 1482; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1483; GFX7-NEXT: buffer_wbinvl1 1484; GFX7-NEXT: s_setpc_b64 s[30:31] 1485 %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 1486 ret void 1487} 1488 ; Buffer fat pointer (ptr addrspace(7), passed inreg): agent-scope seq_cst atomicrmw fmax on float with !amdgpu.no.fine.grained.memory, returning the old value. ; Per the checks below, the GFX12, GFX11, GFX10 and GFX7 runs select one returning buffer float-max atomic, while the GFX940, GFX90A, GFX908 and GFX8 runs expand to a buffer_load_dword plus buffer_atomic_cmpswap retry loop. ; The check lines are autogenerated by utils/update_llc_test_checks.py, so regenerate them instead of editing by hand. 1489define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) { 1490; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: 1491; GFX12: ; %bb.0: 1492; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1493; GFX12-NEXT: s_wait_expcnt 0x0 1494; GFX12-NEXT: s_wait_samplecnt 0x0 1495; GFX12-NEXT: s_wait_bvhcnt 0x0 1496; GFX12-NEXT: s_wait_kmcnt 0x0 1497; GFX12-NEXT: v_mov_b32_e32 v1, s16 1498; GFX12-NEXT: s_wait_storecnt 0x0 1499; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN 1500; GFX12-NEXT: s_wait_loadcnt 0x0 1501; GFX12-NEXT: global_inv scope:SCOPE_DEV 1502; GFX12-NEXT: s_setpc_b64 s[30:31] 1503; 1504; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: 1505; GFX940: ; %bb.0: 1506; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1507; GFX940-NEXT: v_mov_b32_e32 v2, s16 1508; GFX940-NEXT: v_mov_b32_e32 v1, v0 1509; GFX940-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen 1510; GFX940-NEXT: s_mov_b64 s[4:5], 0 1511; GFX940-NEXT: v_max_f32_e32 v3, v1, v1 1512; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start 1513; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 1514; GFX940-NEXT: s_waitcnt vmcnt(0) 1515; GFX940-NEXT: v_mov_b32_e32 v5, v0 1516; GFX940-NEXT: v_max_f32_e32 v0, v5, v5 1517; GFX940-NEXT: v_max_f32_e32 v4, v0, v3 1518; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] 1519; 
GFX940-NEXT: buffer_wbl2 sc1 1520; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[0:3], 0 offen sc0 1521; GFX940-NEXT: s_waitcnt vmcnt(0) 1522; GFX940-NEXT: buffer_inv sc1 1523; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1524; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1525; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] 1526; GFX940-NEXT: s_cbranch_execnz .LBB12_1 1527; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 1528; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] 1529; GFX940-NEXT: s_setpc_b64 s[30:31] 1530; 1531; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: 1532; GFX11: ; %bb.0: 1533; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1534; GFX11-NEXT: v_mov_b32_e32 v1, s16 1535; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1536; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc 1537; GFX11-NEXT: s_waitcnt vmcnt(0) 1538; GFX11-NEXT: buffer_gl1_inv 1539; GFX11-NEXT: buffer_gl0_inv 1540; GFX11-NEXT: s_setpc_b64 s[30:31] 1541; 1542; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: 1543; GFX10: ; %bb.0: 1544; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1545; GFX10-NEXT: v_mov_b32_e32 v1, s20 1546; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1547; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen glc 1548; GFX10-NEXT: s_waitcnt vmcnt(0) 1549; GFX10-NEXT: buffer_gl1_inv 1550; GFX10-NEXT: buffer_gl0_inv 1551; GFX10-NEXT: s_setpc_b64 s[30:31] 1552; 1553; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: 1554; GFX90A: ; %bb.0: 1555; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1556; GFX90A-NEXT: v_mov_b32_e32 v2, s20 1557; GFX90A-NEXT: v_mov_b32_e32 v1, v0 1558; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen 1559; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1560; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1 1561; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start 1562; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1563; GFX90A-NEXT: s_waitcnt vmcnt(0) 1564; 
GFX90A-NEXT: v_mov_b32_e32 v5, v0 1565; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 1566; GFX90A-NEXT: v_max_f32_e32 v4, v0, v3 1567; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] 1568; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc 1569; GFX90A-NEXT: s_waitcnt vmcnt(0) 1570; GFX90A-NEXT: buffer_wbinvl1 1571; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1572; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1573; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1574; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 1575; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1576; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1577; GFX90A-NEXT: s_setpc_b64 s[30:31] 1578; 1579; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: 1580; GFX908: ; %bb.0: 1581; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1582; GFX908-NEXT: v_mov_b32_e32 v2, s20 1583; GFX908-NEXT: v_mov_b32_e32 v1, v0 1584; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen 1585; GFX908-NEXT: s_mov_b64 s[4:5], 0 1586; GFX908-NEXT: v_max_f32_e32 v3, v1, v1 1587; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start 1588; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1589; GFX908-NEXT: s_waitcnt vmcnt(0) 1590; GFX908-NEXT: v_mov_b32_e32 v5, v0 1591; GFX908-NEXT: v_max_f32_e32 v0, v5, v5 1592; GFX908-NEXT: v_max_f32_e32 v4, v0, v3 1593; GFX908-NEXT: v_mov_b32_e32 v0, v4 1594; GFX908-NEXT: v_mov_b32_e32 v1, v5 1595; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc 1596; GFX908-NEXT: s_waitcnt vmcnt(0) 1597; GFX908-NEXT: buffer_wbinvl1 1598; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1599; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1600; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1601; GFX908-NEXT: s_cbranch_execnz .LBB12_1 1602; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1603; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1604; GFX908-NEXT: s_setpc_b64 s[30:31] 1605; 1606; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: 1607; GFX8: ; 
%bb.0: 1608; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1609; GFX8-NEXT: v_mov_b32_e32 v2, s20 1610; GFX8-NEXT: v_mov_b32_e32 v1, v0 1611; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen 1612; GFX8-NEXT: s_mov_b64 s[4:5], 0 1613; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1 1614; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start 1615; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1616; GFX8-NEXT: s_waitcnt vmcnt(0) 1617; GFX8-NEXT: v_mov_b32_e32 v5, v0 1618; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5 1619; GFX8-NEXT: v_max_f32_e32 v4, v0, v3 1620; GFX8-NEXT: v_mov_b32_e32 v0, v4 1621; GFX8-NEXT: v_mov_b32_e32 v1, v5 1622; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[16:19], 0 offen glc 1623; GFX8-NEXT: s_waitcnt vmcnt(0) 1624; GFX8-NEXT: buffer_wbinvl1 1625; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 1626; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1627; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1628; GFX8-NEXT: s_cbranch_execnz .LBB12_1 1629; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1630; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1631; GFX8-NEXT: s_setpc_b64 s[30:31] 1632; 1633; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory: 1634; GFX7: ; %bb.0: 1635; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1636; GFX7-NEXT: v_mov_b32_e32 v1, s20 1637; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen glc 1638; GFX7-NEXT: s_waitcnt vmcnt(0) 1639; GFX7-NEXT: buffer_wbinvl1 1640; GFX7-NEXT: s_setpc_b64 s[30:31] 1641 %result = atomicrmw fmax ptr addrspace(7) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 1642 ret float %result 1643} 1644 ; Buffer fat pointer (ptr addrspace(7), passed inreg): agent-scope seq_cst atomicrmw fmax on float with !amdgpu.no.fine.grained.memory, where the atomic result is unused and the function returns void. ; Per the checks below, the GFX12, GFX11, GFX10 and GFX7 runs select the non-returning buffer float-max atomic, while the GFX940, GFX90A, GFX908 and GFX8 runs still expand to a buffer_atomic_cmpswap retry loop. 1645define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) { 1646; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: 1647; GFX12: ; %bb.0: 1648; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1649; GFX12-NEXT: s_wait_expcnt 0x0 1650; GFX12-NEXT: s_wait_samplecnt 0x0 1651; GFX12-NEXT: 
s_wait_bvhcnt 0x0 1652; GFX12-NEXT: s_wait_kmcnt 0x0 1653; GFX12-NEXT: v_mov_b32_e32 v1, s16 1654; GFX12-NEXT: s_wait_storecnt 0x0 1655; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen 1656; GFX12-NEXT: s_wait_storecnt 0x0 1657; GFX12-NEXT: global_inv scope:SCOPE_DEV 1658; GFX12-NEXT: s_setpc_b64 s[30:31] 1659; 1660; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: 1661; GFX940: ; %bb.0: 1662; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1663; GFX940-NEXT: v_mov_b32_e32 v2, s16 1664; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen 1665; GFX940-NEXT: s_mov_b64 s[4:5], 0 1666; GFX940-NEXT: v_max_f32_e32 v3, v0, v0 1667; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start 1668; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 1669; GFX940-NEXT: s_waitcnt vmcnt(0) 1670; GFX940-NEXT: v_max_f32_e32 v0, v1, v1 1671; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 1672; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1] 1673; GFX940-NEXT: buffer_wbl2 sc1 1674; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0 1675; GFX940-NEXT: s_waitcnt vmcnt(0) 1676; GFX940-NEXT: buffer_inv sc1 1677; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 1678; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1679; GFX940-NEXT: v_mov_b32_e32 v1, v4 1680; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5] 1681; GFX940-NEXT: s_cbranch_execnz .LBB13_1 1682; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end 1683; GFX940-NEXT: s_or_b64 exec, exec, s[4:5] 1684; GFX940-NEXT: s_setpc_b64 s[30:31] 1685; 1686; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: 1687; GFX11: ; %bb.0: 1688; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1689; GFX11-NEXT: v_mov_b32_e32 v1, s16 1690; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1691; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen 1692; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1693; GFX11-NEXT: buffer_gl1_inv 1694; GFX11-NEXT: buffer_gl0_inv 1695; GFX11-NEXT: s_setpc_b64 s[30:31] 1696; 
1697; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: 1698; GFX10: ; %bb.0: 1699; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1700; GFX10-NEXT: v_mov_b32_e32 v1, s20 1701; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1702; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen 1703; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1704; GFX10-NEXT: buffer_gl1_inv 1705; GFX10-NEXT: buffer_gl0_inv 1706; GFX10-NEXT: s_setpc_b64 s[30:31] 1707; 1708; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: 1709; GFX90A: ; %bb.0: 1710; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1711; GFX90A-NEXT: v_mov_b32_e32 v2, s20 1712; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 1713; GFX90A-NEXT: s_mov_b64 s[4:5], 0 1714; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 1715; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start 1716; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 1717; GFX90A-NEXT: s_waitcnt vmcnt(0) 1718; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 1719; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 1720; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] 1721; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc 1722; GFX90A-NEXT: s_waitcnt vmcnt(0) 1723; GFX90A-NEXT: buffer_wbinvl1 1724; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 1725; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1726; GFX90A-NEXT: v_mov_b32_e32 v1, v4 1727; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] 1728; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 1729; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 1730; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] 1731; GFX90A-NEXT: s_setpc_b64 s[30:31] 1732; 1733; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: 1734; GFX908: ; %bb.0: 1735; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1736; GFX908-NEXT: v_mov_b32_e32 v2, s20 1737; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 1738; GFX908-NEXT: s_mov_b64 s[4:5], 0 1739; GFX908-NEXT: 
v_max_f32_e32 v3, v0, v0 1740; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start 1741; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 1742; GFX908-NEXT: s_waitcnt vmcnt(0) 1743; GFX908-NEXT: v_max_f32_e32 v0, v1, v1 1744; GFX908-NEXT: v_max_f32_e32 v0, v0, v3 1745; GFX908-NEXT: v_mov_b32_e32 v5, v1 1746; GFX908-NEXT: v_mov_b32_e32 v4, v0 1747; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc 1748; GFX908-NEXT: s_waitcnt vmcnt(0) 1749; GFX908-NEXT: buffer_wbinvl1 1750; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 1751; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1752; GFX908-NEXT: v_mov_b32_e32 v1, v4 1753; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1754; GFX908-NEXT: s_cbranch_execnz .LBB13_1 1755; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1756; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1757; GFX908-NEXT: s_setpc_b64 s[30:31] 1758; 1759; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: 1760; GFX8: ; %bb.0: 1761; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1762; GFX8-NEXT: v_mov_b32_e32 v2, s20 1763; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen 1764; GFX8-NEXT: s_mov_b64 s[4:5], 0 1765; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0 1766; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start 1767; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1768; GFX8-NEXT: s_waitcnt vmcnt(0) 1769; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 1770; GFX8-NEXT: v_max_f32_e32 v0, v0, v3 1771; GFX8-NEXT: v_mov_b32_e32 v5, v1 1772; GFX8-NEXT: v_mov_b32_e32 v4, v0 1773; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[16:19], 0 offen glc 1774; GFX8-NEXT: s_waitcnt vmcnt(0) 1775; GFX8-NEXT: buffer_wbinvl1 1776; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 1777; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1778; GFX8-NEXT: v_mov_b32_e32 v1, v4 1779; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1780; GFX8-NEXT: s_cbranch_execnz .LBB13_1 1781; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1782; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1783; GFX8-NEXT: s_setpc_b64 s[30:31] 1784; 1785; 
GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory: 1786; GFX7: ; %bb.0: 1787; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1788; GFX7-NEXT: v_mov_b32_e32 v1, s20 1789; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[16:19], 0 offen 1790; GFX7-NEXT: s_waitcnt vmcnt(0) 1791; GFX7-NEXT: buffer_wbinvl1 1792; GFX7-NEXT: s_setpc_b64 s[30:31] 1793 %unused = atomicrmw fmax ptr addrspace(7) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 1794 ret void 1795} 1796 ; Buffer fat pointer (ptr addrspace(7), passed inreg): agent-scope seq_cst atomicrmw fmax on double with !amdgpu.no.fine.grained.memory, returning the old value. ; Per the checks below, the GFX940 and GFX90A runs select buffer_atomic_max_f64 and the GFX10 and GFX7 runs select buffer_atomic_fmax_x2, while the GFX12, GFX11, GFX908 and GFX8 runs expand to a 64-bit buffer load plus buffer_atomic_cmpswap retry loop. 1797define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) { 1798; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: 1799; GFX12: ; %bb.0: 1800; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1801; GFX12-NEXT: s_wait_expcnt 0x0 1802; GFX12-NEXT: s_wait_samplecnt 0x0 1803; GFX12-NEXT: s_wait_bvhcnt 0x0 1804; GFX12-NEXT: s_wait_kmcnt 0x0 1805; GFX12-NEXT: v_mov_b32_e32 v6, s16 1806; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 1807; GFX12-NEXT: s_mov_b32 s4, 0 1808; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen 1809; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] 1810; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start 1811; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1812; GFX12-NEXT: s_wait_loadcnt 0x0 1813; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 1814; GFX12-NEXT: s_wait_storecnt 0x0 1815; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1816; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] 1817; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] 1818; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1819; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 1820; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 1821; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN 1822; GFX12-NEXT: s_wait_loadcnt 0x0 1823; 
GFX12-NEXT: global_inv scope:SCOPE_DEV 1824; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 1825; GFX12-NEXT: s_wait_alu 0xfffe 1826; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 1827; GFX12-NEXT: s_wait_alu 0xfffe 1828; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 1829; GFX12-NEXT: s_cbranch_execnz .LBB14_1 1830; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 1831; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 1832; GFX12-NEXT: s_wait_alu 0xfffe 1833; GFX12-NEXT: s_setpc_b64 s[30:31] 1834; 1835; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: 1836; GFX940: ; %bb.0: 1837; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1838; GFX940-NEXT: v_mov_b32_e32 v2, s16 1839; GFX940-NEXT: buffer_wbl2 sc1 1840; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0 1841; GFX940-NEXT: s_waitcnt vmcnt(0) 1842; GFX940-NEXT: buffer_inv sc1 1843; GFX940-NEXT: s_setpc_b64 s[30:31] 1844; 1845; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: 1846; GFX11: ; %bb.0: 1847; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1848; GFX11-NEXT: v_mov_b32_e32 v6, s16 1849; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 1850; GFX11-NEXT: s_mov_b32 s4, 0 1851; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen 1852; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 1853; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start 1854; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 1855; GFX11-NEXT: s_waitcnt vmcnt(0) 1856; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 1857; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 1858; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1859; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] 1860; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] 1861; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1862; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 1863; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 1864; GFX11-NEXT: 
buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc 1865; GFX11-NEXT: s_waitcnt vmcnt(0) 1866; GFX11-NEXT: buffer_gl1_inv 1867; GFX11-NEXT: buffer_gl0_inv 1868; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] 1869; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 1870; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1871; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 1872; GFX11-NEXT: s_cbranch_execnz .LBB14_1 1873; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 1874; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 1875; GFX11-NEXT: s_setpc_b64 s[30:31] 1876; 1877; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: 1878; GFX10: ; %bb.0: 1879; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1880; GFX10-NEXT: v_mov_b32_e32 v2, s20 1881; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1882; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen glc 1883; GFX10-NEXT: s_waitcnt vmcnt(0) 1884; GFX10-NEXT: buffer_gl1_inv 1885; GFX10-NEXT: buffer_gl0_inv 1886; GFX10-NEXT: s_setpc_b64 s[30:31] 1887; 1888; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: 1889; GFX90A: ; %bb.0: 1890; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1891; GFX90A-NEXT: v_mov_b32_e32 v2, s20 1892; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[16:19], 0 offen glc 1893; GFX90A-NEXT: s_waitcnt vmcnt(0) 1894; GFX90A-NEXT: buffer_wbinvl1 1895; GFX90A-NEXT: s_setpc_b64 s[30:31] 1896; 1897; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: 1898; GFX908: ; %bb.0: 1899; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1900; GFX908-NEXT: v_mov_b32_e32 v6, s20 1901; GFX908-NEXT: v_mov_b32_e32 v2, v0 1902; GFX908-NEXT: v_mov_b32_e32 v3, v1 1903; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen 1904; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 1905; GFX908-NEXT: s_mov_b64 s[4:5], 0 1906; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start 1907; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 
1908; GFX908-NEXT: s_waitcnt vmcnt(0) 1909; GFX908-NEXT: v_mov_b32_e32 v10, v1 1910; GFX908-NEXT: v_mov_b32_e32 v9, v0 1911; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] 1912; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] 1913; GFX908-NEXT: v_mov_b32_e32 v0, v7 1914; GFX908-NEXT: v_mov_b32_e32 v1, v8 1915; GFX908-NEXT: v_mov_b32_e32 v2, v9 1916; GFX908-NEXT: v_mov_b32_e32 v3, v10 1917; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 1918; GFX908-NEXT: s_waitcnt vmcnt(0) 1919; GFX908-NEXT: buffer_wbinvl1 1920; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 1921; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 1922; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 1923; GFX908-NEXT: s_cbranch_execnz .LBB14_1 1924; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 1925; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 1926; GFX908-NEXT: s_setpc_b64 s[30:31] 1927; 1928; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: 1929; GFX8: ; %bb.0: 1930; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1931; GFX8-NEXT: v_mov_b32_e32 v6, s20 1932; GFX8-NEXT: v_mov_b32_e32 v2, v0 1933; GFX8-NEXT: v_mov_b32_e32 v3, v1 1934; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen 1935; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] 1936; GFX8-NEXT: s_mov_b64 s[4:5], 0 1937; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start 1938; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 1939; GFX8-NEXT: s_waitcnt vmcnt(0) 1940; GFX8-NEXT: v_mov_b32_e32 v10, v1 1941; GFX8-NEXT: v_mov_b32_e32 v9, v0 1942; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] 1943; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] 1944; GFX8-NEXT: v_mov_b32_e32 v0, v7 1945; GFX8-NEXT: v_mov_b32_e32 v1, v8 1946; GFX8-NEXT: v_mov_b32_e32 v2, v9 1947; GFX8-NEXT: v_mov_b32_e32 v3, v10 1948; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc 1949; GFX8-NEXT: s_waitcnt vmcnt(0) 1950; GFX8-NEXT: buffer_wbinvl1 1951; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] 1952; GFX8-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] 1953; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 1954; GFX8-NEXT: s_cbranch_execnz .LBB14_1 1955; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 1956; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 1957; GFX8-NEXT: s_setpc_b64 s[30:31] 1958; 1959; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: 1960; GFX7: ; %bb.0: 1961; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1962; GFX7-NEXT: v_mov_b32_e32 v2, s20 1963; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen glc 1964; GFX7-NEXT: s_waitcnt vmcnt(0) 1965; GFX7-NEXT: buffer_wbinvl1 1966; GFX7-NEXT: s_setpc_b64 s[30:31] 1967 %result = atomicrmw fmax ptr addrspace(7) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 1968 ret double %result 1969} 1970 ; Buffer fat pointer (ptr addrspace(7), passed inreg): agent-scope seq_cst atomicrmw fmax on double with !amdgpu.no.fine.grained.memory, where the atomic result is unused and the function returns void. ; Per the checks below, the GFX940 and GFX90A runs select the non-returning buffer_atomic_max_f64 and the GFX10 and GFX7 runs select the non-returning buffer_atomic_fmax_x2, while the GFX12, GFX11, GFX908 and GFX8 runs still expand to a 64-bit buffer_atomic_cmpswap retry loop. 1971define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) { 1972; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: 1973; GFX12: ; %bb.0: 1974; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1975; GFX12-NEXT: s_wait_expcnt 0x0 1976; GFX12-NEXT: s_wait_samplecnt 0x0 1977; GFX12-NEXT: s_wait_bvhcnt 0x0 1978; GFX12-NEXT: s_wait_kmcnt 0x0 1979; GFX12-NEXT: v_mov_b32_e32 v6, s16 1980; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] 1981; GFX12-NEXT: s_mov_b32 s4, 0 1982; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen 1983; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start 1984; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 1985; GFX12-NEXT: s_wait_loadcnt 0x0 1986; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3] 1987; GFX12-NEXT: s_wait_storecnt 0x0 1988; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1989; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] 1990; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 1991; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 1992; GFX12-NEXT: buffer_atomic_cmpswap_b64 
v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN 1993; GFX12-NEXT: s_wait_loadcnt 0x0 1994; GFX12-NEXT: global_inv scope:SCOPE_DEV 1995; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] 1996; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 1997; GFX12-NEXT: s_wait_alu 0xfffe 1998; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 1999; GFX12-NEXT: s_wait_alu 0xfffe 2000; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 2001; GFX12-NEXT: s_cbranch_execnz .LBB15_1 2002; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end 2003; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 2004; GFX12-NEXT: s_wait_alu 0xfffe 2005; GFX12-NEXT: s_setpc_b64 s[30:31] 2006; 2007; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: 2008; GFX940: ; %bb.0: 2009; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2010; GFX940-NEXT: v_mov_b32_e32 v2, s16 2011; GFX940-NEXT: buffer_wbl2 sc1 2012; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen 2013; GFX940-NEXT: s_waitcnt vmcnt(0) 2014; GFX940-NEXT: buffer_inv sc1 2015; GFX940-NEXT: s_setpc_b64 s[30:31] 2016; 2017; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: 2018; GFX11: ; %bb.0: 2019; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2020; GFX11-NEXT: v_mov_b32_e32 v6, s16 2021; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 2022; GFX11-NEXT: s_mov_b32 s4, 0 2023; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen 2024; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start 2025; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 2026; GFX11-NEXT: s_waitcnt vmcnt(0) 2027; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 2028; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2029; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2030; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] 2031; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 2032; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 2033; GFX11-NEXT: buffer_atomic_cmpswap_b64 
v[7:10], v6, s[0:3], 0 offen glc 2034; GFX11-NEXT: s_waitcnt vmcnt(0) 2035; GFX11-NEXT: buffer_gl1_inv 2036; GFX11-NEXT: buffer_gl0_inv 2037; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] 2038; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8 2039; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 2040; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2041; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 2042; GFX11-NEXT: s_cbranch_execnz .LBB15_1 2043; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end 2044; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 2045; GFX11-NEXT: s_setpc_b64 s[30:31] 2046; 2047; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: 2048; GFX10: ; %bb.0: 2049; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2050; GFX10-NEXT: v_mov_b32_e32 v2, s20 2051; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2052; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen 2053; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2054; GFX10-NEXT: buffer_gl1_inv 2055; GFX10-NEXT: buffer_gl0_inv 2056; GFX10-NEXT: s_setpc_b64 s[30:31] 2057; 2058; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: 2059; GFX90A: ; %bb.0: 2060; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2061; GFX90A-NEXT: v_mov_b32_e32 v2, s20 2062; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[16:19], 0 offen 2063; GFX90A-NEXT: s_waitcnt vmcnt(0) 2064; GFX90A-NEXT: buffer_wbinvl1 2065; GFX90A-NEXT: s_setpc_b64 s[30:31] 2066; 2067; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: 2068; GFX908: ; %bb.0: 2069; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2070; GFX908-NEXT: v_mov_b32_e32 v6, s20 2071; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen 2072; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 2073; GFX908-NEXT: s_mov_b64 s[4:5], 0 2074; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start 2075; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 2076; GFX908-NEXT: s_waitcnt vmcnt(0) 
2077; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 2078; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] 2079; GFX908-NEXT: v_mov_b32_e32 v10, v3 2080; GFX908-NEXT: v_mov_b32_e32 v9, v2 2081; GFX908-NEXT: v_mov_b32_e32 v8, v1 2082; GFX908-NEXT: v_mov_b32_e32 v7, v0 2083; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc 2084; GFX908-NEXT: s_waitcnt vmcnt(0) 2085; GFX908-NEXT: buffer_wbinvl1 2086; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] 2087; GFX908-NEXT: v_mov_b32_e32 v2, v7 2088; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2089; GFX908-NEXT: v_mov_b32_e32 v3, v8 2090; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] 2091; GFX908-NEXT: s_cbranch_execnz .LBB15_1 2092; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end 2093; GFX908-NEXT: s_or_b64 exec, exec, s[4:5] 2094; GFX908-NEXT: s_setpc_b64 s[30:31] 2095; 2096; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: 2097; GFX8: ; %bb.0: 2098; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2099; GFX8-NEXT: v_mov_b32_e32 v6, s20 2100; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen 2101; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] 2102; GFX8-NEXT: s_mov_b64 s[4:5], 0 2103; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start 2104; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 2105; GFX8-NEXT: s_waitcnt vmcnt(0) 2106; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] 2107; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] 2108; GFX8-NEXT: v_mov_b32_e32 v10, v3 2109; GFX8-NEXT: v_mov_b32_e32 v9, v2 2110; GFX8-NEXT: v_mov_b32_e32 v8, v1 2111; GFX8-NEXT: v_mov_b32_e32 v7, v0 2112; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc 2113; GFX8-NEXT: s_waitcnt vmcnt(0) 2114; GFX8-NEXT: buffer_wbinvl1 2115; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] 2116; GFX8-NEXT: v_mov_b32_e32 v2, v7 2117; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] 2118; GFX8-NEXT: v_mov_b32_e32 v3, v8 2119; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] 2120; GFX8-NEXT: s_cbranch_execnz .LBB15_1 
2121; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end 2122; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] 2123; GFX8-NEXT: s_setpc_b64 s[30:31] 2124; 2125; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory: 2126; GFX7: ; %bb.0: 2127; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2128; GFX7-NEXT: v_mov_b32_e32 v2, s20 2129; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[16:19], 0 offen 2130; GFX7-NEXT: s_waitcnt vmcnt(0) 2131; GFX7-NEXT: buffer_wbinvl1 2132; GFX7-NEXT: s_setpc_b64 s[30:31] 2133 %unused = atomicrmw fmax ptr addrspace(7) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 2134 ret void 2135} 2136 ; Metadata operands used by the atomicrmw instructions above. !0 is the empty node attached as !amdgpu.no.fine.grained.memory, and !1 is the node attached as !noalias.addrspace on the flat f64 test. 2137!0 = !{} 2138!1 = !{i32 5, i32 6} 2139