1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 2; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX803 %s 3; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX906 %s 4; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX908 %s 5; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX90A %s 6; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX940 %s 7; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX10 %s 8; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX11 %s 9; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=COMMON,GFX12 %s 10 11;--------------------------------------------------------------------- 12; TODO: atomicrmw xchg 13;--------------------------------------------------------------------- 14 15; ; xchg is supported over PCIe, so no expansion is necessary 16; define <2 x half> @test_atomicrmw_xchg_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) { 17; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst 18; ret <2 x half> %res 19; } 20 21; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. 22; define <2 x half> @test_atomicrmw_xchg_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 23; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 24; ret <2 x half> %res 25; } 26 27; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. 28; define <2 x half> @test_atomicrmw_xchg_v2f16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 29; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 30; ret <2 x half> %res 31; } 32 33; ; xchg is supported over PCIe, so no expansion is necessary. Metadata should be ignored. 34; define <2 x half> @test_atomicrmw_xchg_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 35; %res = atomicrmw xchg ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 36; ret <2 x half> %res 37; } 38 39;--------------------------------------------------------------------- 40; atomicrmw fadd 41;--------------------------------------------------------------------- 42 43define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) { 44; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( 45; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { 46; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 47; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] 48; GFX803: atomicrmw.start: 49; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 50; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 51; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 52; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 53; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 54; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 55; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 56; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 57; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 58; GFX803: atomicrmw.end: 59; GFX803-NEXT: ret <2 x half> [[TMP5]] 60; 61; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( 62; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { 63; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 64; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] 65; GFX906: atomicrmw.start: 66; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 67; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 68; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 69; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 70; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 71; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 72; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 73; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 74; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 75; GFX906: atomicrmw.end: 76; GFX906-NEXT: ret <2 x half> [[TMP5]] 77; 78; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( 79; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { 80; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 81; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] 82; GFX908: atomicrmw.start: 83; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 84; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 85; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 86; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 87; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 88; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 89; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 90; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 91; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 92; GFX908: atomicrmw.end: 93; GFX908-NEXT: ret <2 x half> [[TMP5]] 94; 95; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( 96; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { 97; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 98; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] 99; GFX90A: atomicrmw.start: 100; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 101; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 102; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 103; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 104; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 105; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 106; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 107; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 108; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 109; GFX90A: atomicrmw.end: 110; GFX90A-NEXT: ret <2 x half> [[RES]] 111; 112; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( 113; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { 114; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4 115; GFX940-NEXT: ret <2 x half> [[RES]] 116; 117; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( 118; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { 119; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 120; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] 121; GFX10: atomicrmw.start: 122; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 123; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 124; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 125; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 126; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 127; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 128; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 129; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 130; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 131; GFX10: atomicrmw.end: 132; GFX10-NEXT: ret <2 x half> [[TMP5]] 133; 134; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( 135; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { 136; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 137; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] 138; GFX11: atomicrmw.start: 139; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 140; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 141; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 142; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 143; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 144; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 145; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 146; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 147; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 148; GFX11: atomicrmw.end: 149; GFX11-NEXT: ret <2 x half> [[TMP5]] 150; 151; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( 152; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { 153; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4 154; GFX12-NEXT: ret <2 x half> [[RES]] 155; 156 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst 157 ret <2 x half> %res 158} 159 160define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 161; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( 162; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 163; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 164; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] 165; GFX803: atomicrmw.start: 166; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 167; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 168; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 169; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 170; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] 171; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 172; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 173; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 174; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 175; GFX803: atomicrmw.end: 176; GFX803-NEXT: ret <2 x half> [[TMP5]] 177; 178; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( 179; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 180; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 181; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] 182; GFX906: atomicrmw.start: 183; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 184; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 185; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 186; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 187; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] 188; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 189; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 190; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 191; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 192; GFX906: atomicrmw.end: 193; GFX906-NEXT: ret <2 x half> [[TMP5]] 194; 195; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( 196; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 197; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 198; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] 199; GFX908: atomicrmw.start: 200; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 201; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 202; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 203; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 204; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] 205; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 206; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 207; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 208; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 209; GFX908: atomicrmw.end: 210; GFX908-NEXT: ret <2 x half> [[TMP5]] 211; 212; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( 213; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 214; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] 215; GFX90A-NEXT: ret <2 x half> [[RES]] 216; 217; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( 218; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 219; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] 220; GFX940-NEXT: ret <2 x half> [[RES]] 221; 222; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( 223; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 224; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 225; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] 226; GFX10: atomicrmw.start: 227; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 228; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 229; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 230; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 231; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] 232; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 233; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 234; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 235; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 236; GFX10: atomicrmw.end: 237; GFX10-NEXT: ret <2 x half> [[TMP5]] 238; 239; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( 240; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 241; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 242; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] 243; GFX11: atomicrmw.start: 244; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 245; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 246; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 247; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 248; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] 249; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 250; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 251; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 252; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 253; GFX11: atomicrmw.end: 254; GFX11-NEXT: ret <2 x half> [[TMP5]] 255; 256; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( 257; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 258; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] 259; GFX12-NEXT: ret <2 x half> [[RES]] 260; 261 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 262 ret <2 x half> %res 263} 264 265define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 266; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( 267; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 268; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 269; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] 270; GFX803: atomicrmw.start: 271; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 272; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 273; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 274; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 275; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 276; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 277; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 278; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 279; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 280; GFX803: atomicrmw.end: 281; GFX803-NEXT: ret <2 x half> [[TMP5]] 282; 283; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( 284; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 285; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 286; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] 287; GFX906: atomicrmw.start: 288; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 289; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 290; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 291; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 292; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 293; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 294; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 295; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 296; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 297; GFX906: atomicrmw.end: 298; GFX906-NEXT: ret <2 x half> [[TMP5]] 299; 300; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( 301; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 302; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 303; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] 304; GFX908: atomicrmw.start: 305; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 306; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 307; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 308; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 309; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 310; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 311; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 312; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 313; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 314; GFX908: atomicrmw.end: 315; GFX908-NEXT: ret <2 x half> [[TMP5]] 316; 317; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( 318; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 319; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 320; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] 321; GFX90A: atomicrmw.start: 322; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 323; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 324; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 325; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 326; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 327; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 328; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 329; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 330; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 331; GFX90A: atomicrmw.end: 332; GFX90A-NEXT: ret <2 x half> [[RES]] 333; 334; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( 335; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 336; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 337; GFX940-NEXT: ret <2 x half> [[RES]] 338; 339; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( 340; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 341; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 342; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] 343; GFX10: atomicrmw.start: 344; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 345; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 346; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 347; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 348; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 349; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 350; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 351; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 352; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 353; GFX10: atomicrmw.end: 354; GFX10-NEXT: ret <2 x half> [[TMP5]] 355; 356; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( 357; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 358; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 359; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] 360; GFX11: atomicrmw.start: 361; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 362; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 363; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 364; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 365; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 366; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 367; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 368; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 369; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 370; GFX11: atomicrmw.end: 371; GFX11-NEXT: ret <2 x half> [[TMP5]] 372; 373; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( 374; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 375; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 376; GFX12-NEXT: ret <2 x half> [[RES]] 377; 378 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 379 ret <2 x half> %res 380} 381 382define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 383; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 384; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 385; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 386; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] 387; GFX803: atomicrmw.start: 388; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 389; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 390; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 391; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 392; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 393; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 394; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 395; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 396; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 397; GFX803: atomicrmw.end: 398; GFX803-NEXT: ret <2 x half> [[TMP5]] 399; 400; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 401; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 402; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 403; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] 404; GFX906: atomicrmw.start: 405; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 406; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 407; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 408; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 409; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 410; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 411; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 412; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 413; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 414; GFX906: atomicrmw.end: 415; GFX906-NEXT: ret <2 x half> [[TMP5]] 416; 417; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 418; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 419; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 420; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] 421; GFX908: atomicrmw.start: 422; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 423; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 424; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 425; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 426; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 427; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 428; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 429; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 430; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 431; GFX908: atomicrmw.end: 432; GFX908-NEXT: ret <2 x half> [[TMP5]] 433; 434; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 435; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 436; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 437; GFX90A-NEXT: ret <2 x half> [[RES]] 438; 439; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 440; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 441; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 442; GFX940-NEXT: ret <2 x half> [[RES]] 443; 444; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 445; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 446; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 447; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] 448; GFX10: atomicrmw.start: 449; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 450; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 451; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 452; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 453; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 454; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 455; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 456; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 457; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 458; GFX10: atomicrmw.end: 459; GFX10-NEXT: ret <2 x half> [[TMP5]] 460; 461; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 462; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 463; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 464; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] 465; GFX11: atomicrmw.start: 466; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 467; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 468; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 469; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 470; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 471; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 472; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 473; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 474; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 475; GFX11: atomicrmw.end: 476; GFX11-NEXT: ret <2 x half> [[TMP5]] 477; 478; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 479; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 480; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 481; GFX12-NEXT: ret <2 x half> [[RES]] 482; 483 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 484 ret <2 x half> %res 485} 486 487define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 { 488; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( 489; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { 490; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 491; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] 492; GFX803: atomicrmw.start: 493; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 494; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 495; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 496; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 497; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 498; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 499; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 500; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 501; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 502; GFX803: atomicrmw.end: 503; GFX803-NEXT: ret <2 x half> [[TMP5]] 504; 505; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( 506; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { 507; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 508; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] 509; GFX906: atomicrmw.start: 510; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 511; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 512; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 513; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 514; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 515; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 516; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 517; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 518; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 519; GFX906: atomicrmw.end: 520; GFX906-NEXT: ret <2 x half> [[TMP5]] 521; 522; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( 523; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { 524; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 525; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] 526; GFX908: atomicrmw.start: 527; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 528; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 529; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 530; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 531; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 532; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 533; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 534; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 535; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 536; GFX908: atomicrmw.end: 537; GFX908-NEXT: ret <2 x half> [[TMP5]] 538; 539; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( 540; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { 541; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 542; GFX90A-NEXT: ret <2 x half> [[RES]] 543; 544; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( 545; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { 546; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 547; GFX940-NEXT: ret <2 x half> [[RES]] 548; 549; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( 550; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { 551; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 552; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] 553; GFX10: atomicrmw.start: 554; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 555; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 556; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 557; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 558; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 559; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 560; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 561; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 562; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 563; GFX10: atomicrmw.end: 564; GFX10-NEXT: ret <2 x half> [[TMP5]] 565; 566; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( 567; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { 568; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 569; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] 570; GFX11: atomicrmw.start: 571; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 572; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 573; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 574; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 575; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 576; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 577; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 578; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 579; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 580; GFX11: atomicrmw.end: 581; GFX11-NEXT: ret <2 x half> [[TMP5]] 582; 583; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( 584; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { 585; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 586; GFX12-NEXT: ret <2 x half> [[RES]] 587; 588 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 589 ret <2 x half> %res 590} 591 592define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 { 593; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( 594; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { 595; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 596; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] 597; GFX803: atomicrmw.start: 598; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 599; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 600; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 601; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 602; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 603; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 604; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 605; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 606; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 607; GFX803: atomicrmw.end: 608; GFX803-NEXT: ret <2 x half> [[TMP5]] 609; 610; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( 611; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { 612; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 613; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] 614; GFX906: atomicrmw.start: 615; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 616; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 617; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 618; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 619; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 620; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 621; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 622; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 623; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 624; GFX906: atomicrmw.end: 625; GFX906-NEXT: ret <2 x half> [[TMP5]] 626; 627; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( 628; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { 629; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 630; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] 631; GFX908: atomicrmw.start: 632; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 633; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 634; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 635; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 636; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 637; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 638; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 639; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 640; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 641; GFX908: atomicrmw.end: 642; GFX908-NEXT: ret <2 x half> [[TMP5]] 643; 644; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( 645; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { 646; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 647; GFX90A-NEXT: ret <2 x half> [[RES]] 648; 649; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( 650; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { 651; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 652; GFX940-NEXT: ret <2 x half> [[RES]] 653; 654; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( 655; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { 656; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 657; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] 658; GFX10: atomicrmw.start: 659; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 660; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 661; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 662; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 663; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 664; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 665; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 666; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 667; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 668; GFX10: atomicrmw.end: 669; GFX10-NEXT: ret <2 x half> [[TMP5]] 670; 671; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( 672; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { 673; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 674; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] 675; GFX11: atomicrmw.start: 676; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 677; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 678; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 679; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 680; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 681; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 682; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 683; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 684; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 685; GFX11: atomicrmw.end: 686; GFX11-NEXT: ret <2 x half> [[TMP5]] 687; 688; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( 689; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { 690; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 691; GFX12-NEXT: ret <2 x half> [[RES]] 692; 693 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 694 ret <2 x half> %res 695} 696 697define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) { 698; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( 699; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 700; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 701; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] 702; GFX803: atomicrmw.start: 703; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 704; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 705; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 706; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 707; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 708; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 709; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 710; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 711; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 712; GFX803: atomicrmw.end: 713; GFX803-NEXT: ret <2 x half> [[TMP5]] 714; 715; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( 716; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 717; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 718; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] 719; GFX906: atomicrmw.start: 720; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 721; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 722; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 723; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 724; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 725; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 726; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 727; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 728; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 729; GFX906: atomicrmw.end: 730; GFX906-NEXT: ret <2 x half> [[TMP5]] 731; 732; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( 733; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 734; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 735; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] 736; GFX908: atomicrmw.start: 737; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 738; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 739; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 740; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 741; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 742; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 743; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 744; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 745; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 746; GFX908: atomicrmw.end: 747; GFX908-NEXT: ret <2 x half> [[TMP5]] 748; 749; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( 750; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 751; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 752; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] 753; GFX90A: atomicrmw.start: 754; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 755; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 756; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 757; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 758; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 759; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 760; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 761; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 762; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 763; GFX90A: atomicrmw.end: 764; GFX90A-NEXT: ret <2 x half> [[RES]] 765; 766; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( 767; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 768; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] 769; GFX940-NEXT: ret <2 x half> [[RES]] 770; 771; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( 772; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 773; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 774; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] 775; GFX10: atomicrmw.start: 776; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 777; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 778; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 779; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 780; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 781; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 782; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 783; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 784; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 785; GFX10: atomicrmw.end: 786; GFX10-NEXT: ret <2 x half> [[TMP5]] 787; 788; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( 789; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 790; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 791; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] 792; GFX11: atomicrmw.start: 793; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 794; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 795; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 796; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 797; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 798; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 799; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 800; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 801; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 802; GFX11: atomicrmw.end: 803; GFX11-NEXT: ret <2 x half> [[TMP5]] 804; 805; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( 806; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 807; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] 808; GFX12-NEXT: ret <2 x half> [[RES]] 809; 810 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 811 ret <2 x half> %res 812} 813 814define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 815; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( 816; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 817; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 818; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] 819; GFX803: atomicrmw.start: 820; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 821; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 822; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 823; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 824; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] 825; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 826; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 827; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 828; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 829; GFX803: atomicrmw.end: 830; GFX803-NEXT: ret <2 x half> [[TMP5]] 831; 832; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( 833; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 834; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 835; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] 836; GFX906: atomicrmw.start: 837; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 838; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 839; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 840; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 841; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] 842; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 843; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 844; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 845; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 846; GFX906: atomicrmw.end: 847; GFX906-NEXT: ret <2 x half> [[TMP5]] 848; 849; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( 850; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 851; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 852; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] 853; GFX908: atomicrmw.start: 854; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 855; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 856; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 857; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 858; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] 859; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 860; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 861; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 862; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 863; GFX908: atomicrmw.end: 864; GFX908-NEXT: ret <2 x half> [[TMP5]] 865; 866; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( 867; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 868; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] 869; GFX90A-NEXT: ret <2 x half> [[RES]] 870; 871; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( 872; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 873; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] 874; GFX940-NEXT: ret <2 x half> [[RES]] 875; 876; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( 877; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 878; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 879; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] 880; GFX10: atomicrmw.start: 881; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 882; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 883; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 884; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 885; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] 886; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 887; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 888; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 889; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 890; GFX10: atomicrmw.end: 891; GFX10-NEXT: ret <2 x half> [[TMP5]] 892; 893; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( 894; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 895; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 896; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] 897; GFX11: atomicrmw.start: 898; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 899; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 900; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 901; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 902; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] 903; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 904; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 905; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 906; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 907; GFX11: atomicrmw.end: 908; GFX11-NEXT: ret <2 x half> [[TMP5]] 909; 910; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( 911; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 912; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] 913; GFX12-NEXT: ret <2 x half> [[RES]] 914; 915 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 916 ret <2 x half> %res 917} 918 919define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 920; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( 921; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 922; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 923; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] 924; GFX803: atomicrmw.start: 925; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 926; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 927; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 928; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 929; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 930; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 931; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 932; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 933; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 934; GFX803: atomicrmw.end: 935; GFX803-NEXT: ret <2 x half> [[TMP5]] 936; 937; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( 938; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 939; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 940; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] 941; GFX906: atomicrmw.start: 942; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 943; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 944; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 945; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 946; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 947; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 948; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 949; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 950; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 951; GFX906: atomicrmw.end: 952; GFX906-NEXT: ret <2 x half> [[TMP5]] 953; 954; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( 955; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 956; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 957; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] 958; GFX908: atomicrmw.start: 959; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 960; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 961; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 962; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 963; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 964; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 965; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 966; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 967; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 968; GFX908: atomicrmw.end: 969; GFX908-NEXT: ret <2 x half> [[TMP5]] 970; 971; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( 972; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 973; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 974; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] 975; GFX90A: atomicrmw.start: 976; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 977; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 978; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 979; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 980; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 981; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 982; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 983; GFX90A-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 984; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 985; GFX90A: atomicrmw.end: 986; GFX90A-NEXT: ret <2 x half> [[RES]] 987; 988; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( 989; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 990; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] 991; GFX940-NEXT: ret <2 x half> [[RES]] 992; 993; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( 994; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 995; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 996; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] 997; GFX10: atomicrmw.start: 998; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 999; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1000; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1001; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1002; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 1003; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1004; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1005; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1006; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1007; GFX10: atomicrmw.end: 1008; GFX10-NEXT: ret <2 x half> [[TMP5]] 1009; 1010; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( 1011; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1012; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1013; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] 1014; GFX11: atomicrmw.start: 1015; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1016; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1017; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1018; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1019; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 1020; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1021; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1022; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1023; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1024; GFX11: atomicrmw.end: 1025; GFX11-NEXT: ret <2 x half> [[TMP5]] 1026; 1027; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( 1028; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1029; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] 1030; GFX12-NEXT: ret <2 x half> [[RES]] 1031; 1032 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 1033 ret <2 x half> %res 1034} 1035 1036define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1037; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 1038; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1039; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1040; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] 1041; GFX803: atomicrmw.start: 1042; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1043; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1044; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1045; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1046; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1047; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1048; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1049; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1050; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1051; GFX803: atomicrmw.end: 1052; GFX803-NEXT: ret <2 x half> [[TMP5]] 1053; 1054; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 1055; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1056; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1057; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] 1058; GFX906: atomicrmw.start: 1059; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1060; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1061; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1062; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1063; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1064; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1065; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1066; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1067; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1068; GFX906: atomicrmw.end: 1069; GFX906-NEXT: ret <2 x half> [[TMP5]] 1070; 1071; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 1072; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1073; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1074; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] 1075; GFX908: atomicrmw.start: 1076; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1077; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1078; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1079; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1080; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1081; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1082; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1083; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1084; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1085; GFX908: atomicrmw.end: 1086; GFX908-NEXT: ret <2 x half> [[TMP5]] 1087; 1088; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 1089; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1090; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] 1091; GFX90A-NEXT: ret <2 x half> [[RES]] 1092; 1093; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 1094; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1095; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] 1096; GFX940-NEXT: ret <2 x half> [[RES]] 1097; 1098; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 1099; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1100; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1101; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] 1102; GFX10: atomicrmw.start: 1103; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1104; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1105; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1106; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1107; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1108; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1109; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1110; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1111; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1112; GFX10: atomicrmw.end: 1113; GFX10-NEXT: ret <2 x half> [[TMP5]] 1114; 1115; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 1116; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1117; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1118; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] 1119; GFX11: atomicrmw.start: 1120; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1121; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1122; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1123; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1124; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1125; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1126; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1127; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1128; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1129; GFX11: atomicrmw.end: 1130; GFX11-NEXT: ret <2 x half> [[TMP5]] 1131; 1132; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 1133; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1134; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] 1135; GFX12-NEXT: ret <2 x half> [[RES]] 1136; 1137 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 1138 ret <2 x half> %res 1139} 1140 1141define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 { 1142; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( 1143; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { 1144; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1145; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] 1146; GFX803: atomicrmw.start: 1147; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1148; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1149; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1150; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1151; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1152; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1153; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1154; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1155; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1156; GFX803: atomicrmw.end: 1157; GFX803-NEXT: ret <2 x half> [[TMP5]] 1158; 1159; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( 1160; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { 1161; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1162; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] 1163; GFX906: atomicrmw.start: 1164; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1165; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1166; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1167; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1168; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1169; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1170; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1171; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1172; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1173; GFX906: atomicrmw.end: 1174; GFX906-NEXT: ret <2 x half> [[TMP5]] 1175; 1176; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( 1177; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { 1178; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1179; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] 1180; GFX908: atomicrmw.start: 1181; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1182; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1183; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1184; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1185; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1186; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1187; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1188; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1189; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1190; GFX908: atomicrmw.end: 1191; GFX908-NEXT: ret <2 x half> [[TMP5]] 1192; 1193; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( 1194; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { 1195; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] 1196; GFX90A-NEXT: ret <2 x half> [[RES]] 1197; 1198; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( 1199; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { 1200; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] 1201; GFX940-NEXT: ret <2 x half> [[RES]] 1202; 1203; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( 1204; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { 1205; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1206; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] 1207; GFX10: atomicrmw.start: 1208; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1209; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1210; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1211; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1212; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1213; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1214; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1215; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1216; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1217; GFX10: atomicrmw.end: 1218; GFX10-NEXT: ret <2 x half> [[TMP5]] 1219; 1220; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( 1221; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { 1222; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1223; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] 1224; GFX11: atomicrmw.start: 1225; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1226; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1227; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1228; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1229; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1230; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1231; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1232; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1233; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1234; GFX11: atomicrmw.end: 1235; GFX11-NEXT: ret <2 x half> [[TMP5]] 1236; 1237; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( 1238; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { 1239; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] 1240; GFX12-NEXT: ret <2 x half> [[RES]] 1241; 1242 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 1243 ret <2 x half> %res 1244} 1245 1246define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 { 1247; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( 1248; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { 1249; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1250; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] 1251; GFX803: atomicrmw.start: 1252; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1253; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1254; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1255; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1256; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1257; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1258; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1259; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1260; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1261; GFX803: atomicrmw.end: 1262; GFX803-NEXT: ret <2 x half> [[TMP5]] 1263; 1264; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( 1265; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { 1266; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1267; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] 1268; GFX906: atomicrmw.start: 1269; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1270; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1271; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1272; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1273; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1274; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1275; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1276; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1277; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1278; GFX906: atomicrmw.end: 1279; GFX906-NEXT: ret <2 x half> [[TMP5]] 1280; 1281; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( 1282; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { 1283; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1284; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] 1285; GFX908: atomicrmw.start: 1286; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1287; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1288; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1289; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1290; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1291; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1292; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1293; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1294; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1295; GFX908: atomicrmw.end: 1296; GFX908-NEXT: ret <2 x half> [[TMP5]] 1297; 1298; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( 1299; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { 1300; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] 1301; GFX90A-NEXT: ret <2 x half> [[RES]] 1302; 1303; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( 1304; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { 1305; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] 1306; GFX940-NEXT: ret <2 x half> [[RES]] 1307; 1308; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( 1309; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { 1310; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1311; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] 1312; GFX10: atomicrmw.start: 1313; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1314; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1315; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1316; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1317; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1318; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1319; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1320; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1321; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1322; GFX10: atomicrmw.end: 1323; GFX10-NEXT: ret <2 x half> [[TMP5]] 1324; 1325; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( 1326; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { 1327; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1328; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] 1329; GFX11: atomicrmw.start: 1330; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1331; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] 1332; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1333; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1334; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1335; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1336; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1337; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1338; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1339; GFX11: atomicrmw.end: 1340; GFX11-NEXT: ret <2 x half> [[TMP5]] 1341; 1342; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( 1343; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { 1344; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] 1345; GFX12-NEXT: ret <2 x half> [[RES]] 1346; 1347 %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 1348 ret <2 x half> %res 1349} 1350 1351;--------------------------------------------------------------------- 1352; atomicrmw fsub 1353;--------------------------------------------------------------------- 1354 1355define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) { 1356; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent( 1357; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { 1358; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1359; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1360; COMMON: atomicrmw.start: 1361; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 1362; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] 1363; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1364; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1365; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 1366; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1367; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1368; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1369; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1370; COMMON: atomicrmw.end: 1371; COMMON-NEXT: ret <2 x half> [[RES]] 1372; 1373 %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst 1374 ret <2 x half> %res 1375} 1376 1377define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1378; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory( 1379; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1380; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1381; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1382; COMMON: atomicrmw.start: 1383; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 1384; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] 1385; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1386; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1387; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] 1388; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1389; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1390; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1391; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1392; COMMON: atomicrmw.end: 1393; COMMON-NEXT: ret <2 x half> [[RES]] 1394; 1395 %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 1396 ret <2 x half> %res 1397} 1398 1399define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1400; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_remote_memory( 1401; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1402; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1403; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1404; COMMON: atomicrmw.start: 1405; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 1406; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] 1407; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1408; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1409; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 1410; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1411; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1412; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1413; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1414; COMMON: atomicrmw.end: 1415; COMMON-NEXT: ret <2 x half> [[RES]] 1416; 1417 %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 1418 ret <2 x half> %res 1419} 1420 1421define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1422; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 1423; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1424; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1425; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1426; COMMON: atomicrmw.start: 1427; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 1428; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] 1429; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1430; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1431; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1432; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1433; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1434; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1435; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1436; COMMON: atomicrmw.end: 1437; COMMON-NEXT: ret <2 x half> [[RES]] 1438; 1439 %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 1440 ret <2 x half> %res 1441} 1442 1443define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) { 1444; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode( 1445; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1446; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1447; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1448; COMMON: atomicrmw.start: 1449; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1450; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] 1451; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1452; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1453; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 1454; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1455; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1456; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1457; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1458; COMMON: atomicrmw.end: 1459; COMMON-NEXT: ret <2 x half> [[TMP5]] 1460; 1461 %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 1462 ret <2 x half> %res 1463} 1464 1465define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1466; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( 1467; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1468; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1469; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1470; COMMON: atomicrmw.start: 1471; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1472; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] 1473; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1474; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1475; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] 1476; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1477; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1478; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1479; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1480; COMMON: atomicrmw.end: 1481; COMMON-NEXT: ret <2 x half> [[TMP5]] 1482; 1483 %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 1484 ret <2 x half> %res 1485} 1486 1487define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1488; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( 1489; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1490; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1491; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1492; COMMON: atomicrmw.start: 1493; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1494; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] 1495; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1496; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1497; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 1498; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1499; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1500; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1501; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1502; COMMON: atomicrmw.end: 1503; COMMON-NEXT: ret <2 x half> [[TMP5]] 1504; 1505 %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 1506 ret <2 x half> %res 1507} 1508 1509define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1510; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 1511; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1512; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1513; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1514; COMMON: atomicrmw.start: 1515; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] 1516; COMMON-NEXT: [[NEW:%.*]] = fsub <2 x half> [[LOADED]], [[VALUE]] 1517; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 1518; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1519; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1520; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 1521; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 1522; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1523; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1524; COMMON: atomicrmw.end: 1525; COMMON-NEXT: ret <2 x half> [[TMP5]] 1526; 1527 %res = atomicrmw fsub ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 1528 ret <2 x half> %res 1529} 1530 1531;--------------------------------------------------------------------- 1532; atomicrmw fmax 1533;--------------------------------------------------------------------- 1534 1535define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) { 1536; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent( 1537; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1538; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1539; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1540; COMMON: atomicrmw.start: 1541; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 1542; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1543; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1544; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1545; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 1546; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1547; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1548; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1549; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1550; COMMON: atomicrmw.end: 1551; COMMON-NEXT: ret <2 x half> [[RES]] 1552; 1553 %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst 1554 ret <2 x half> %res 1555} 1556 1557define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1558; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory( 1559; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1560; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1561; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1562; COMMON: atomicrmw.start: 1563; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 1564; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1565; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1566; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1567; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] 1568; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1569; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1570; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1571; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1572; COMMON: atomicrmw.end: 1573; COMMON-NEXT: ret <2 x half> [[RES]] 1574; 1575 %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 1576 ret <2 x half> %res 1577} 1578 1579define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1580; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_remote_memory( 1581; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1582; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1583; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1584; COMMON: atomicrmw.start: 1585; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 1586; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1587; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1588; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1589; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 1590; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1591; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1592; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1593; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1594; COMMON: atomicrmw.end: 1595; COMMON-NEXT: ret <2 x half> [[RES]] 1596; 1597 %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 1598 ret <2 x half> %res 1599} 1600 1601define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1602; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 1603; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1604; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1605; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1606; COMMON: atomicrmw.start: 1607; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 1608; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1609; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1610; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1611; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1612; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1613; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1614; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1615; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1616; COMMON: atomicrmw.end: 1617; COMMON-NEXT: ret <2 x half> [[RES]] 1618; 1619 %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 1620 ret <2 x half> %res 1621} 1622 1623define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) { 1624; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode( 1625; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1626; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1627; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1628; COMMON: atomicrmw.start: 1629; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 1630; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1631; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1632; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1633; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 1634; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1635; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1636; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1637; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1638; COMMON: atomicrmw.end: 1639; COMMON-NEXT: ret <2 x half> [[TMP6]] 1640; 1641 %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 1642 ret <2 x half> %res 1643} 1644 1645define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1646; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( 1647; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1648; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1649; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1650; COMMON: atomicrmw.start: 1651; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 1652; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1653; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1654; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1655; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] 1656; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1657; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1658; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1659; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1660; COMMON: atomicrmw.end: 1661; COMMON-NEXT: ret <2 x half> [[TMP6]] 1662; 1663 %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 1664 ret <2 x half> %res 1665} 1666 1667define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1668; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( 1669; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1670; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1671; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1672; COMMON: atomicrmw.start: 1673; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 1674; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1675; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1676; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1677; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 1678; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1679; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1680; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1681; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1682; COMMON: atomicrmw.end: 1683; COMMON-NEXT: ret <2 x half> [[TMP6]] 1684; 1685 %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 1686 ret <2 x half> %res 1687} 1688 1689define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1690; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmax_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 1691; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1692; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1693; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1694; COMMON: atomicrmw.start: 1695; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 1696; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.maxnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1697; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1698; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1699; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1700; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1701; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1702; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1703; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1704; COMMON: atomicrmw.end: 1705; COMMON-NEXT: ret <2 x half> [[TMP6]] 1706; 1707 %res = atomicrmw fmax ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 1708 ret <2 x half> %res 1709} 1710 1711;--------------------------------------------------------------------- 1712; atomicrmw fmin 1713;--------------------------------------------------------------------- 1714 1715define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) { 1716; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent( 1717; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1718; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1719; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1720; COMMON: atomicrmw.start: 1721; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 1722; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1723; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1724; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1725; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 1726; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1727; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1728; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1729; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1730; COMMON: atomicrmw.end: 1731; COMMON-NEXT: ret <2 x half> [[RES]] 1732; 1733 %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst 1734 ret <2 x half> %res 1735} 1736 1737define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1738; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory( 1739; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1740; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1741; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1742; COMMON: atomicrmw.start: 1743; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 1744; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1745; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1746; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1747; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] 1748; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1749; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1750; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1751; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1752; COMMON: atomicrmw.end: 1753; COMMON-NEXT: ret <2 x half> [[RES]] 1754; 1755 %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 1756 ret <2 x half> %res 1757} 1758 1759define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1760; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_remote_memory( 1761; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1762; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1763; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1764; COMMON: atomicrmw.start: 1765; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 1766; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1767; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1768; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1769; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 1770; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1771; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1772; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1773; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1774; COMMON: atomicrmw.end: 1775; COMMON-NEXT: ret <2 x half> [[RES]] 1776; 1777 %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 1778 ret <2 x half> %res 1779} 1780 1781define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1782; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 1783; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1784; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1785; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1786; COMMON: atomicrmw.start: 1787; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] 1788; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1789; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1790; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1791; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1792; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1793; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1794; COMMON-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1795; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1796; COMMON: atomicrmw.end: 1797; COMMON-NEXT: ret <2 x half> [[RES]] 1798; 1799 %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 1800 ret <2 x half> %res 1801} 1802 1803define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) { 1804; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode( 1805; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1806; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1807; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1808; COMMON: atomicrmw.start: 1809; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 1810; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1811; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1812; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1813; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 1814; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1815; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1816; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1817; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1818; COMMON: atomicrmw.end: 1819; COMMON-NEXT: ret <2 x half> [[TMP6]] 1820; 1821 %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 1822 ret <2 x half> %res 1823} 1824 1825define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1826; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( 1827; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1828; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1829; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1830; COMMON: atomicrmw.start: 1831; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 1832; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1833; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1834; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1835; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] 1836; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1837; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1838; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1839; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1840; COMMON: atomicrmw.end: 1841; COMMON-NEXT: ret <2 x half> [[TMP6]] 1842; 1843 %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 1844 ret <2 x half> %res 1845} 1846 1847define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1848; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( 1849; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1850; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1851; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1852; COMMON: atomicrmw.start: 1853; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 1854; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1855; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1856; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1857; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] 1858; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1859; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1860; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1861; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1862; COMMON: atomicrmw.end: 1863; COMMON-NEXT: ret <2 x half> [[TMP6]] 1864; 1865 %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 1866 ret <2 x half> %res 1867} 1868 1869define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { 1870; COMMON-LABEL: define <2 x half> @test_atomicrmw_fmin_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( 1871; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { 1872; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 1873; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] 1874; COMMON: atomicrmw.start: 1875; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] 1876; COMMON-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[LOADED]], <2 x half> [[VALUE]]) 1877; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[TMP2]] to i32 1878; COMMON-NEXT: [[TMP4:%.*]] = bitcast <2 x half> [[LOADED]] to i32 1879; COMMON-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] 1880; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 1881; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 1882; COMMON-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to <2 x half> 1883; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] 1884; COMMON: atomicrmw.end: 1885; COMMON-NEXT: ret <2 x half> [[TMP6]] 1886; 1887 %res = atomicrmw fmin ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 1888 ret <2 x half> %res 1889} 1890 1891attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" } 1892attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } 1893 1894!0 = !{} 1895;. 1896; GFX803: [[META0]] = !{} 1897;. 1898; GFX906: [[META0]] = !{} 1899;. 1900; GFX908: [[META0]] = !{} 1901;. 1902; GFX90A: [[META0]] = !{} 1903;. 1904; GFX940: [[META0]] = !{} 1905;. 1906; GFX10: [[META0]] = !{} 1907;. 1908; GFX11: [[META0]] = !{} 1909;. 1910; GFX12: [[META0]] = !{} 1911;. 1912