; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GFX68,VERDE %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GFX68,GFX8 %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GFX12 %s

; Instruction-selection tests for the llvm.amdgcn.raw.buffer.store.* intrinsics.
; Every call in this file has the operand shape (data, <4 x i32> rsrc, i32, i32, i32):
;  - the first i32 shows up in the expected assembly either as an immediate
;    "offset:N" (constant) or as an "offen" VGPR operand (variable);
;  - the second i32 is always 0 in this file;
;  - the last i32 selects the cache-policy modifiers: the checks below expect
;    none for 0, "glc" for 1, "slc" for 2 and "glc slc" for 3.
; NOTE(review): only the last function carries checks under the GFX12 prefix,
; and that function has no checks under the other prefixes - confirm that this
; asymmetry is intentional before regenerating the assertions.

; Three <4 x float> stores at offset 0 whose last operand is 0, 1 and 2.
define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
; GFX68-LABEL: buffer_store:
; GFX68:       ; %bb.0: ; %main_body
; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX68-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc
; GFX68-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 glc
; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 slc
; GFX11-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1)
  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 2)
  ret void
}

; A constant 42 in the first i32 operand is expected as the immediate offset:42.
define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
; GFX68-LABEL: buffer_store_immoffs:
; GFX68:       ; %bb.0: ; %main_body
; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_immoffs:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:42
; GFX11-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0)
  ret void
}

; A variable first i32 operand is expected to select the "offen" form with the
; offset held in v4.
define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
; GFX68-LABEL: buffer_store_ofs:
; GFX68:       ; %bb.0: ; %main_body
; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_ofs:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], 0 offen
; GFX11-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0)
  ret void
}

; Store, then load into the same registers v[0:3], then store the loaded value.
; The verde checks expect an extra "s_waitcnt expcnt(0)" between the first store
; and the load; the tonga and gfx1100 checks do not.
; Ideally, the register allocator would avoid the wait here
define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
; VERDE-LABEL: buffer_store_wait:
; VERDE:       ; %bb.0: ; %main_body
; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
; VERDE-NEXT:    s_waitcnt expcnt(0)
; VERDE-NEXT:    buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen
; VERDE-NEXT:    s_waitcnt vmcnt(0)
; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen
; VERDE-NEXT:    s_endpgm
;
; GFX8-LABEL: buffer_store_wait:
; GFX8:       ; %bb.0: ; %main_body
; GFX8-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
; GFX8-NEXT:    buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_wait:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], 0 offen
; GFX11-NEXT:    buffer_load_b128 v[0:3], v5, s[0:3], 0 offen
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    buffer_store_b128 v[0:3], v6, s[0:3], 0 offen
; GFX11-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0)
  %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i32 0)
  ret void
}

; Single float store with a variable offset: expected as a one-dword store.
define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) {
; GFX68-LABEL: buffer_store_x1:
; GFX68:       ; %bb.0: ; %main_body
; GFX68-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_x1:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
; GFX11-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
  ret void
}

; <2 x float> store with a variable offset: expected as a two-dword store.
define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %offset) #0 {
; GFX68-LABEL: buffer_store_x2:
; GFX68:       ; %bb.0: ; %main_body
; GFX68-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_x2:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
; GFX11-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
  ret void
}

; Six float stores at %a+4, +8, +12, +16 and %a+28, +32. The checks expect them
; to be combined into one four-dword store at offset:4 and one two-dword store
; at offset:28, both using %a as the offen register.
define amdgpu_ps void @buffer_store_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
; GFX68-LABEL: buffer_store_x1_offen_merged_and:
; GFX68:       ; %bb.0:
; GFX68-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
; GFX68-NEXT:    buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_x1_offen_merged_and:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
; GFX11-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28
; GFX11-NEXT:    s_endpgm
  %a1 = add i32 %a, 4
  %a2 = add i32 %a, 8
  %a3 = add i32 %a, 12
  %a4 = add i32 %a, 16
  %a5 = add i32 %a, 28
  %a6 = add i32 %a, 32
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %a3, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %a4, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 %a5, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 %a6, i32 0, i32 0)
  ret void
}

; Same store pattern as the previous function, but the base offset is
; %inp << 6, so its low six bits are known to be zero.
; NOTE(review): the "_or" suffix presumably refers to the adds being
; representable as ORs on such a base - confirm against the combiner.
define amdgpu_ps void @buffer_store_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
; GFX68-LABEL: buffer_store_x1_offen_merged_or:
; GFX68:       ; %bb.0:
; GFX68-NEXT:    v_lshlrev_b32_e32 v0, 6, v0
; GFX68-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
; GFX68-NEXT:    buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_x1_offen_merged_or:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 6, v0
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
; GFX11-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28
; GFX11-NEXT:    s_endpgm
  %a = shl i32 %inp, 6
  %a1 = add i32 %a, 4
  %a2 = add i32 %a, 8
  %a3 = add i32 %a, 12
  %a4 = add i32 %a, 16
  %a5 = add i32 %a, 28
  %a6 = add i32 %a, 32
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %a3, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %a4, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 %a5, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 %a6, i32 0, i32 0)
  ret void
}

; Same offsets again, but the last operand is 0, 0, 1, 1, 3, 3. The checks
; expect only stores with an equal last operand to be paired, giving three
; two-dword stores with no modifier, "glc" and "glc slc" respectively.
define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
; GFX68-LABEL: buffer_store_x1_offen_merged_glc_slc:
; GFX68:       ; %bb.0:
; GFX68-NEXT:    buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4
; GFX68-NEXT:    buffer_store_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc
; GFX68-NEXT:    buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_x1_offen_merged_glc_slc:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    buffer_store_b64 v[1:2], v0, s[0:3], 0 offen offset:4
; GFX11-NEXT:    buffer_store_b64 v[3:4], v0, s[0:3], 0 offen offset:12 glc
; GFX11-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc
; GFX11-NEXT:    s_endpgm
  %a1 = add i32 %a, 4
  %a2 = add i32 %a, 8
  %a3 = add i32 %a, 12
  %a4 = add i32 %a, 16
  %a5 = add i32 %a, 28
  %a6 = add i32 %a, 32
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 %a3, i32 0, i32 1)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 %a4, i32 0, i32 1)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 %a5, i32 0, i32 3)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 %a6, i32 0, i32 3)
  ret void
}

; Two <2 x float> stores at %a+4 and %a+12: expected as one four-dword store
; at offset:4.
define amdgpu_ps void @buffer_store_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) {
; GFX68-LABEL: buffer_store_x2_offen_merged_and:
; GFX68:       ; %bb.0:
; GFX68-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_x2_offen_merged_and:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
; GFX11-NEXT:    s_endpgm
  %a1 = add i32 %a, 4
  %a2 = add i32 %a, 12
  call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
  ret void
}

; As above, with the base offset computed as %inp << 4.
define amdgpu_ps void @buffer_store_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, <2 x float> %v1, <2 x float> %v2) {
; GFX68-LABEL: buffer_store_x2_offen_merged_or:
; GFX68:       ; %bb.0:
; GFX68-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX68-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_x2_offen_merged_or:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
; GFX11-NEXT:    s_endpgm
  %a = shl i32 %inp, 4
  %a1 = add i32 %a, 4
  %a2 = add i32 %a, 12
  call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 %a2, i32 0, i32 0)
  ret void
}

; Six float stores at the constant offsets 4, 8, 12, 16, 28 and 32: expected as
; one four-dword store at offset:4 plus one two-dword store at offset:28.
define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
; GFX68-LABEL: buffer_store_x1_offset_merged:
; GFX68:       ; %bb.0:
; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4
; GFX68-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_x1_offset_merged:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4
; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28
; GFX11-NEXT:    s_endpgm
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 0)
  ret void
}

; Two <2 x float> stores at the constant offsets 4 and 12: expected as one
; four-dword store at offset:4.
define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) {
; GFX68-LABEL: buffer_store_x2_offset_merged:
; GFX68:       ; %bb.0:
; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_x2_offset_merged:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4
; GFX11-NEXT:    s_endpgm
  call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
  ret void
}

; Integer variants (<4 x i32>, <2 x i32>, i32) with last operand 0, 1 and 2:
; expected as four-, two- and one-dword stores with no modifier, glc and slc.
define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) {
; GFX68-LABEL: buffer_store_int:
; GFX68:       ; %bb.0: ; %main_body
; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX68-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc
; GFX68-NEXT:    buffer_store_dword v6, off, s[0:3], 0 slc
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_int:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0 glc
; GFX11-NEXT:    buffer_store_b32 v6, off, s[0:3], 0 slc
; GFX11-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 1)
  call void @llvm.amdgcn.raw.buffer.store.i32(i32 %3, <4 x i32> %0, i32 0, i32 0, i32 2)
  ret void
}

; float -> unsigned i32 -> i8, stored through the i8 intrinsic: expected as a
; conversion followed by a byte store.
define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) {
; GFX68-LABEL: raw_buffer_store_byte:
; GFX68:       ; %bb.0: ; %main_body
; GFX68-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX68-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: raw_buffer_store_byte:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
main_body:
  %v2 = fptoui float %v1 to i32
  %v3 = trunc i32 %v2 to i8
  call void @llvm.amdgcn.raw.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
  ret void
}

; float -> unsigned i32 -> i16, stored through the i16 intrinsic: expected as a
; conversion followed by a short store.
define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) {
; GFX68-LABEL: raw_buffer_store_short:
; GFX68:       ; %bb.0: ; %main_body
; GFX68-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX68-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: raw_buffer_store_short:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
main_body:
  %v2 = fptoui float %v1 to i32
  %v3 = trunc i32 %v2 to i16
  call void @llvm.amdgcn.raw.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
  ret void
}

; i32 truncated to i16 and bitcast to half: expected as a plain short store
; with no extra instructions for the trunc/bitcast.
define amdgpu_ps void @raw_buffer_store_f16(<4 x i32> inreg %rsrc, i32 %v1) {
; GFX68-LABEL: raw_buffer_store_f16:
; GFX68:       ; %bb.0: ; %main_body
; GFX68-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: raw_buffer_store_f16:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
main_body:
  %trunc = trunc i32 %v1 to i16
  %cast = bitcast i16 %trunc to half
  call void @llvm.amdgcn.raw.buffer.store.f16(half %cast, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
  ret void
}

; <2 x half> store. The verde checks expect each element to be converted with
; v_cvt_f16_f32 and packed with shift/or before a one-dword store; the tonga and
; gfx1100 checks expect the one-dword store alone.
define amdgpu_ps void @buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %offset) {
; VERDE-LABEL: buffer_store_v2f16:
; VERDE:       ; %bb.0: ; %main_body
; VERDE-NEXT:    v_cvt_f16_f32_e32 v1, v1
; VERDE-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VERDE-NEXT:    v_or_b32_e32 v0, v0, v1
; VERDE-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
; VERDE-NEXT:    s_endpgm
;
; GFX8-LABEL: buffer_store_v2f16:
; GFX8:       ; %bb.0: ; %main_body
; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_v2f16:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
; GFX11-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
  ret void
}

; <4 x half> store: same split as the <2 x half> case, with a two-dword store.
define amdgpu_ps void @buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %data, i32 %offset) #0 {
; VERDE-LABEL: buffer_store_v4f16:
; VERDE:       ; %bb.0: ; %main_body
; VERDE-NEXT:    v_cvt_f16_f32_e32 v3, v3
; VERDE-NEXT:    v_cvt_f16_f32_e32 v2, v2
; VERDE-NEXT:    v_cvt_f16_f32_e32 v5, v1
; VERDE-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; VERDE-NEXT:    v_or_b32_e32 v1, v2, v1
; VERDE-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
; VERDE-NEXT:    v_or_b32_e32 v0, v0, v2
; VERDE-NEXT:    buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 offen
; VERDE-NEXT:    s_endpgm
;
; GFX8-LABEL: buffer_store_v4f16:
; GFX8:       ; %bb.0: ; %main_body
; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_v4f16:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
; GFX11-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
  ret void
}

; i32 truncated to i16 and stored: expected as a plain short store.
define amdgpu_ps void @raw_buffer_store_i16(<4 x i32> inreg %rsrc, i32 %v1) {
; GFX68-LABEL: raw_buffer_store_i16:
; GFX68:       ; %bb.0: ; %main_body
; GFX68-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: raw_buffer_store_i16:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
main_body:
  %trunc = trunc i32 %v1 to i16
  call void @llvm.amdgcn.raw.buffer.store.i16(i16 %trunc, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
  ret void
}

; <2 x i16> store. The verde checks expect the two elements to be packed with
; shift/and/or before a one-dword store; the tonga and gfx1100 checks expect the
; store alone.
define amdgpu_ps void @buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %data, i32 %offset) {
; VERDE-LABEL: buffer_store_v2i16:
; VERDE:       ; %bb.0: ; %main_body
; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VERDE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; VERDE-NEXT:    v_or_b32_e32 v0, v0, v1
; VERDE-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
; VERDE-NEXT:    s_endpgm
;
; GFX8-LABEL: buffer_store_v2i16:
; GFX8:       ; %bb.0: ; %main_body
; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_v2i16:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
; GFX11-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
  ret void
}

; <4 x i16> store: same split as the <2 x i16> case, with a two-dword store.
define amdgpu_ps void @buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %data, i32 %offset) #0 {
; VERDE-LABEL: buffer_store_v4i16:
; VERDE:       ; %bb.0: ; %main_body
; VERDE-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; VERDE-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VERDE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; VERDE-NEXT:    v_or_b32_e32 v2, v2, v3
; VERDE-NEXT:    v_or_b32_e32 v1, v0, v1
; VERDE-NEXT:    buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen
; VERDE-NEXT:    s_endpgm
;
; GFX8-LABEL: buffer_store_v4i16:
; GFX8:       ; %bb.0: ; %main_body
; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
; GFX8-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_store_v4i16:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
; GFX11-NEXT:    s_endpgm
main_body:
  call void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
  ret void
}

; Body and expectations are identical to @buffer_store_x1_offset_merged; it
; serves as the merged baseline for the two swizzled variants that follow.
define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
; GFX68-LABEL: raw_buffer_store_x1_offset_merged:
; GFX68:       ; %bb.0:
; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4
; GFX68-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: raw_buffer_store_x1_offset_merged:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4
; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28
; GFX11-NEXT:    s_endpgm
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 0)
  ret void
}

; Same constant offsets with the last operand set to 8. The verde/tonga and
; gfx1100 checks expect six separate one-dword stores, i.e. no merging.
; NOTE(review): per the function name, 8 is presumably the swizzle bit on
; targets before gfx12 - confirm against the intrinsic documentation.
define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
; GFX68-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12:
; GFX68:       ; %bb.0:
; GFX68-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX68-NEXT:    buffer_store_dword v1, off, s[0:3], 0 offset:8
; GFX68-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:12
; GFX68-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:16
; GFX68-NEXT:    buffer_store_dword v4, off, s[0:3], 0 offset:28
; GFX68-NEXT:    buffer_store_dword v5, off, s[0:3], 0 offset:32
; GFX68-NEXT:    s_endpgm
;
; GFX11-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x5
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0 offset:4
; GFX11-NEXT:    buffer_store_b32 v1, off, s[0:3], 0 offset:8
; GFX11-NEXT:    buffer_store_b32 v2, off, s[0:3], 0 offset:12
; GFX11-NEXT:    buffer_store_b32 v3, off, s[0:3], 0 offset:16
; GFX11-NEXT:    buffer_store_b32 v4, off, s[0:3], 0 offset:28
; GFX11-NEXT:    buffer_store_b32 v5, off, s[0:3], 0 offset:32
; GFX11-NEXT:    s_endpgm
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 8)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 8)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 8)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 8)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 8)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 8)
  ret void
}

; Same constant offsets with the last operand set to 64. Only the gfx1200 run
; has checks here; they expect six separate one-dword stores that use "null"
; where the other targets print 0.
; NOTE(review): per the function name, 64 is presumably the swizzle bit on
; gfx12 - confirm against the intrinsic documentation.
define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
; GFX12-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_clause 0x5
; GFX12-NEXT:    buffer_store_b32 v0, off, s[0:3], null offset:4
; GFX12-NEXT:    buffer_store_b32 v1, off, s[0:3], null offset:8
; GFX12-NEXT:    buffer_store_b32 v2, off, s[0:3], null offset:12
; GFX12-NEXT:    buffer_store_b32 v3, off, s[0:3], null offset:16
; GFX12-NEXT:    buffer_store_b32 v4, off, s[0:3], null offset:28
; GFX12-NEXT:    buffer_store_b32 v5, off, s[0:3], null offset:32
; GFX12-NEXT:    s_endpgm
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 64)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 64)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 64)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 64)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 64)
  call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 64)
  ret void
}

; Declarations of every intrinsic overload called above. The stores use
; attribute group #0 and the single load uses #1.
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32) #0
declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #1
declare void @llvm.amdgcn.raw.buffer.store.i8(i8, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.f16(half, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.i16(i16, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16>, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16>, <4 x i32>, i32, i32, i32) #0

attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }