; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=PREGFX10
;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefixes=PREGFX10
;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX10
;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11

; Codegen checks for the llvm.amdgcn.raw.ptr.buffer.load.* intrinsics on
; verde, tonga, gfx1010 and gfx1100. Every call below passes the buffer
; resource followed by three i32 operands. Judging by the assertions, the
; first i32 ends up as the VGPR/immediate offset, the second as the scalar
; offset, and the third selects the cache-policy modifiers (glc/slc/dlc).

; Last operand 0, 1, 2: plain, glc and slc loads respectively.
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: buffer_load:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; PREGFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
; PREGFX10-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: buffer_load:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
; GFX10-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 glc
; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], 0 slc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0)
  %data_glc = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 1)
  %data_slc = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 2)
  %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
  %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
  %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
  ret {<4 x float>, <4 x float>, <4 x float>} %r2
}

; Last operand 4, 5, 6: the dlc modifier is emitted on gfx1010/gfx1100 only;
; the verde/tonga output carries no dlc.
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_dlc(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: buffer_load_dlc:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; PREGFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
; PREGFX10-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_dlc:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 dlc
; GFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc
; GFX10-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_dlc:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0 dlc
; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 glc dlc
; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], 0 slc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 4)
  %data_glc = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 5)
  %data_slc = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 6)
  %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
  %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
  %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
  ret {<4 x float>, <4 x float>, <4 x float>} %r2
}

; Last operand has bit 31 set (-2147483648 and up): every load is emitted
; with glc, plus dlc on gfx1010/gfx1100.
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_volatile(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: buffer_load_volatile:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc
; PREGFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
; PREGFX10-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_volatile:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 glc dlc
; GFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc
; GFX10-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 glc slc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_volatile:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc
; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 glc dlc
; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], 0 glc slc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483648)
  %data_glc = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483647)
  %data_slc = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 -2147483646)
  %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
  %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
  %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
  ret {<4 x float>, <4 x float>, <4 x float>} %r2
}

; Constant offset 40 is folded into the instruction's immediate offset field.
define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: buffer_load_immoffs:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_immoffs:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_immoffs:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0 offset:40
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 40, i32 0, i32 0)
  ret <4 x float> %data
}

; Second i32 operand 8188 (0x1ffc) is materialised in an SGPR while the
; first operand 4 stays in the immediate offset field.
define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: buffer_load_immoffs_large:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    s_movk_i32 s4, 0x1ffc
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], s4 offset:4
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_immoffs_large:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_movk_i32 s4, 0x1ffc
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], s4 offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_immoffs_large:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_movk_i32 s4, 0x1ffc
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], s4 offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 4, i32 8188, i32 0)
  ret <4 x float> %data
}

; Variable offset taken from the second function argument: offen addressing.
define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) {
; PREGFX10-LABEL: buffer_load_ofs:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_ofs:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_ofs:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], 0 offen
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i32 0)
  ret <4 x float> %data
}

; Variable offset plus constant 60: the add is folded into offen offset:60.
define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) {
; PREGFX10-LABEL: buffer_load_ofs_imm:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_ofs_imm:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_ofs_imm:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:60
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %ofs = add i32 %1, 60
  %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %ofs, i32 0, i32 0)
  ret <4 x float> %data
}

; Constant offset 4092 still fits entirely in the immediate offset field.
define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: buffer_load_voffset_large_12bit:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4092
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_voffset_large_12bit:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4092
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_voffset_large_12bit:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0 offset:4092
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 4092, i32 0, i32 0)
  ret <4 x float> %data
}

; Constant offset 8188 is split into a VGPR part 0x1000 and offset:4092.
define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: buffer_load_voffset_large_13bit:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    v_mov_b32_e32 v0, 0x1000
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4092
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_voffset_large_13bit:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x1000
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4092
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_voffset_large_13bit:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x1000
; GFX11-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4092
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 8188, i32 0, i32 0)
  ret <4 x float> %data
}

; Constant offset 65532 is split into a VGPR part 0xf000 and offset:4092.
define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: buffer_load_voffset_large_16bit:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    v_mov_b32_e32 v0, 0xf000
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4092
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_voffset_large_16bit:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    v_mov_b32_e32 v0, 0xf000
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4092
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_voffset_large_16bit:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    v_mov_b32_e32 v0, 0xf000
; GFX11-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4092
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 65532, i32 0, i32 0)
  ret <4 x float> %data
}

; Constant offset 8388604 is split into a VGPR part 0x7ff000 and offset:4092.
define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: buffer_load_voffset_large_23bit:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    v_mov_b32_e32 v0, 0x7ff000
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4092
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_voffset_large_23bit:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7ff000
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4092
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_voffset_large_23bit:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x7ff000
; GFX11-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4092
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 8388604, i32 0, i32 0)
  ret <4 x float> %data
}

; Constant offset 16777212 is split into a VGPR part 0xfff000 and offset:4092.
define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: buffer_load_voffset_large_24bit:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    v_mov_b32_e32 v0, 0xfff000
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4092
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_voffset_large_24bit:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    v_mov_b32_e32 v0, 0xfff000
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4092
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_voffset_large_24bit:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    v_mov_b32_e32 v0, 0xfff000
; GFX11-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4092
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 16777212, i32 0, i32 0)
  ret <4 x float> %data
}


; Scalar f32 load with a variable offset: a single-dword load.
define amdgpu_ps float @buffer_load_x1(ptr addrspace(8) inreg %rsrc, i32 %ofs) {
; PREGFX10-LABEL: buffer_load_x1:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_x1:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_x1:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %data = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0, i32 0)
  ret float %data
}

; <2 x float> load with a variable offset: a two-dword load.
define amdgpu_ps <2 x float> @buffer_load_x2(ptr addrspace(8) inreg %rsrc, i32 %ofs) {
; PREGFX10-LABEL: buffer_load_x2:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[0:3], 0 offen
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_x2:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    buffer_load_dwordx2 v[0:1], v0, s[0:3], 0 offen
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_x2:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_load_b64 v[0:1], v0, s[0:3], 0 offen
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %data = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %ofs, i32 0, i32 0)
  ret <2 x float> %data
}

; Negative constant (-16) is not folded into the immediate offset; a separate
; VALU add is emitted instead.
; NOTE(review): no verde/tonga assertions exist for this function; presumably
; the two targets produce different output so the script dropped the shared
; prefix - confirm.
define amdgpu_ps <4 x float> @buffer_load_negative_offset(ptr addrspace(8) inreg, i32 %ofs) {
; GFX10-LABEL: buffer_load_negative_offset:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    v_add_nc_u32_e32 v0, -16, v0
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_negative_offset:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    v_add_nc_u32_e32 v0, -16, v0
; GFX11-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], 0 offen
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %ofs.1 = add i32 %ofs, -16
  %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %ofs.1, i32 0, i32 0)
  ret <4 x float> %data
}

; Two LDS stores on either side of a buffer load: the stores are combined
; into one two-address DS write even though the load sits between them.
; NOTE(review): no verde/tonga assertions exist for this function - confirm
; that is intended.
define amdgpu_ps float @buffer_load_mmo(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %lds) {
; GFX10-LABEL: buffer_load_mmo:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    ds_write2_b32 v0, v2, v2 offset1:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_mmo:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    buffer_load_b32 v1, off, s[0:3], 0
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    ds_store_2addr_b32 v0, v2, v2 offset1:4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
entry:
  store float 0.0, ptr addrspace(3) %lds
  %val = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
  %tmp2 = getelementptr float, ptr addrspace(3) %lds, i32 4
  store float 0.0, ptr addrspace(3) %tmp2
  ret float %val
}

; Six f32 loads at %a+4..16 and %a+28..32 (offsets built with add) are merged
; into one four-dword and one two-dword load.
define amdgpu_ps void @buffer_load_x1_offen_merged_and(ptr addrspace(8) inreg %rsrc, i32 %a) {
; PREGFX10-LABEL: buffer_load_x1_offen_merged_and:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
; PREGFX10-NEXT:    buffer_load_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28
; PREGFX10-NEXT:    s_waitcnt vmcnt(1)
; PREGFX10-NEXT:    exp mrt0 v1, v2, v3, v4 done vm
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    exp mrt0 v5, v6, v0, v0 done vm
; PREGFX10-NEXT:    s_endpgm
;
; GFX10-LABEL: buffer_load_x1_offen_merged_and:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    buffer_load_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
; GFX10-NEXT:    buffer_load_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    exp mrt0 v1, v2, v3, v4 done vm
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    exp mrt0 v5, v6, v0, v0 done vm
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_load_x1_offen_merged_and:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    buffer_load_b128 v[1:4], v0, s[0:3], 0 offen offset:4
; GFX11-NEXT:    buffer_load_b64 v[5:6], v0, s[0:3], 0 offen offset:28
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    exp mrt0 v1, v2, v3, v4 done
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    exp mrt0 v5, v6, v0, v0 done
; GFX11-NEXT:    s_endpgm
main_body:
  %a1 = add i32 %a, 4
  %a2 = add i32 %a, 8
  %a3 = add i32 %a, 12
  %a4 = add i32 %a, 16
  %a5 = add i32 %a, 28
  %a6 = add i32 %a, 32
  %r1 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0)
  %r2 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a2, i32 0, i32 0)
  %r3 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a3, i32 0, i32 0)
  %r4 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a4, i32 0, i32 0)
  %r5 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a5, i32 0, i32 0)
  %r6 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a6, i32 0, i32 0)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
  ret void
}

; Same merge as above, but the offsets are OR-ed onto a base shifted left by
; 6, so the low bits of the base are known to be zero.
define amdgpu_ps void @buffer_load_x1_offen_merged_or(ptr addrspace(8) inreg %rsrc, i32 %inp) {
; PREGFX10-LABEL: buffer_load_x1_offen_merged_or:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    v_lshlrev_b32_e32 v4, 6, v0
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen offset:4
; PREGFX10-NEXT:    buffer_load_dwordx2 v[4:5], v4, s[0:3], 0 offen offset:28
; PREGFX10-NEXT:    s_waitcnt vmcnt(1)
; PREGFX10-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    exp mrt0 v4, v5, v0, v0 done vm
; PREGFX10-NEXT:    s_endpgm
;
; GFX10-LABEL: buffer_load_x1_offen_merged_or:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 6, v0
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], v6, s[0:3], 0 offen offset:4
; GFX10-NEXT:    buffer_load_dwordx2 v[4:5], v6, s[0:3], 0 offen offset:28
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    exp mrt0 v4, v5, v0, v0 done vm
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_load_x1_offen_merged_or:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 6, v0
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    buffer_load_b128 v[0:3], v4, s[0:3], 0 offen offset:4
; GFX11-NEXT:    buffer_load_b64 v[4:5], v4, s[0:3], 0 offen offset:28
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    exp mrt0 v0, v1, v2, v3 done
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    exp mrt0 v4, v5, v0, v0 done
; GFX11-NEXT:    s_endpgm
main_body:
  %a = shl i32 %inp, 6
  %a1 = or i32 %a, 4
  %a2 = or i32 %a, 8
  %a3 = or i32 %a, 12
  %a4 = or i32 %a, 16
  %a5 = or i32 %a, 28
  %a6 = or i32 %a, 32
  %r1 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0)
  %r2 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a2, i32 0, i32 0)
  %r3 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a3, i32 0, i32 0)
  %r4 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a4, i32 0, i32 0)
  %r5 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a5, i32 0, i32 0)
  %r6 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a6, i32 0, i32 0)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
  ret void
}

; Loads are merged only with neighbours that share the same last operand
; (0, 1 and 3 here), giving three two-dword loads.
define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(ptr addrspace(8) inreg %rsrc, i32 %a) {
; PREGFX10-LABEL: buffer_load_x1_offen_merged_glc_slc:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4
; PREGFX10-NEXT:    buffer_load_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc
; PREGFX10-NEXT:    buffer_load_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc
; PREGFX10-NEXT:    s_waitcnt vmcnt(1)
; PREGFX10-NEXT:    exp mrt0 v1, v2, v3, v4 done vm
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    exp mrt0 v5, v6, v0, v0 done vm
; PREGFX10-NEXT:    s_endpgm
;
; GFX10-LABEL: buffer_load_x1_offen_merged_glc_slc:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    buffer_load_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4
; GFX10-NEXT:    buffer_load_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc
; GFX10-NEXT:    buffer_load_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    exp mrt0 v1, v2, v3, v4 done vm
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    exp mrt0 v5, v6, v0, v0 done vm
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_load_x1_offen_merged_glc_slc:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    buffer_load_b64 v[1:2], v0, s[0:3], 0 offen offset:4
; GFX11-NEXT:    buffer_load_b64 v[3:4], v0, s[0:3], 0 offen offset:12 glc
; GFX11-NEXT:    buffer_load_b64 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    exp mrt0 v1, v2, v3, v4 done
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    exp mrt0 v5, v6, v0, v0 done
; GFX11-NEXT:    s_endpgm
main_body:
  %a1 = add i32 %a, 4
  %a2 = add i32 %a, 8
  %a3 = add i32 %a, 12
  %a4 = add i32 %a, 16
  %a5 = add i32 %a, 28
  %a6 = add i32 %a, 32
  %r1 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0)
  %r2 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a2, i32 0, i32 0)
  %r3 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a3, i32 0, i32 1)
  %r4 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a4, i32 0, i32 1)
  %r5 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a5, i32 0, i32 3)
  %r6 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %a6, i32 0, i32 3)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
  ret void
}

; Two adjacent <2 x float> loads at %a+4 and %a+12 are merged into one
; four-dword load.
define amdgpu_ps void @buffer_load_x2_offen_merged_and(ptr addrspace(8) inreg %rsrc, i32 %a) {
; PREGFX10-LABEL: buffer_load_x2_offen_merged_and:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
; PREGFX10-NEXT:    s_endpgm
;
; GFX10-LABEL: buffer_load_x2_offen_merged_and:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_load_x2_offen_merged_and:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    exp mrt0 v0, v1, v2, v3 done
; GFX11-NEXT:    s_endpgm
main_body:
  %a1 = add i32 %a, 4
  %a2 = add i32 %a, 12
  %vr1 = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0)
  %vr2 = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %a2, i32 0, i32 0)
  %r1 = extractelement <2 x float> %vr1, i32 0
  %r2 = extractelement <2 x float> %vr1, i32 1
  %r3 = extractelement <2 x float> %vr2, i32 0
  %r4 = extractelement <2 x float> %vr2, i32 1
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
  ret void
}

; Same as above with a base shifted left by 4.
; NOTE(review): despite the _or suffix the offsets here are formed with add,
; unlike buffer_load_x1_offen_merged_or which uses or - confirm whether or
; was intended before changing the IR (the assertions would need regenerating).
define amdgpu_ps void @buffer_load_x2_offen_merged_or(ptr addrspace(8) inreg %rsrc, i32 %inp) {
; PREGFX10-LABEL: buffer_load_x2_offen_merged_or:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
; PREGFX10-NEXT:    s_endpgm
;
; GFX10-LABEL: buffer_load_x2_offen_merged_or:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_load_x2_offen_merged_or:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX11-NEXT:    buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    exp mrt0 v0, v1, v2, v3 done
; GFX11-NEXT:    s_endpgm
main_body:
  %a = shl i32 %inp, 4
  %a1 = add i32 %a, 4
  %a2 = add i32 %a, 12
  %vr1 = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0)
  %vr2 = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %a2, i32 0, i32 0)
  %r1 = extractelement <2 x float> %vr1, i32 0
  %r2 = extractelement <2 x float> %vr1, i32 1
  %r3 = extractelement <2 x float> %vr2, i32 0
  %r4 = extractelement <2 x float> %vr2, i32 1
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
  ret void
}

; Six f32 loads at constant offsets 4..16 and 28..32 are merged into one
; four-dword and one two-dword load without a VGPR offset.
define amdgpu_ps void @buffer_load_x1_offset_merged(ptr addrspace(8) inreg %rsrc) {
; PREGFX10-LABEL: buffer_load_x1_offset_merged:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4
; PREGFX10-NEXT:    buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:28
; PREGFX10-NEXT:    s_waitcnt vmcnt(1)
; PREGFX10-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    exp mrt0 v4, v5, v0, v0 done vm
; PREGFX10-NEXT:    s_endpgm
;
; GFX10-LABEL: buffer_load_x1_offset_merged:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4
; GFX10-NEXT:    buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:28
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    exp mrt0 v4, v5, v0, v0 done vm
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_load_x1_offset_merged:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0 offset:4
; GFX11-NEXT:    buffer_load_b64 v[4:5], off, s[0:3], 0 offset:28
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    exp mrt0 v0, v1, v2, v3 done
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    exp mrt0 v4, v5, v0, v0 done
; GFX11-NEXT:    s_endpgm
main_body:
  %r1 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0)
  %r2 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 8, i32 0, i32 0)
  %r3 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0)
  %r4 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 16, i32 0, i32 0)
  %r5 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 28, i32 0, i32 0)
  %r6 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 32, i32 0, i32 0)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
  ret void
}

; Two <2 x float> loads at constant offsets 4 and 12 are merged into one
; four-dword load.
define amdgpu_ps void @buffer_load_x2_offset_merged(ptr addrspace(8) inreg %rsrc) {
; PREGFX10-LABEL: buffer_load_x2_offset_merged:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
; PREGFX10-NEXT:    s_endpgm
;
; GFX10-LABEL: buffer_load_x2_offset_merged:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: buffer_load_x2_offset_merged:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0 offset:4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    exp mrt0 v0, v1, v2, v3 done
; GFX11-NEXT:    s_endpgm
main_body:
  %vr1 = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0)
  %vr2 = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0)
  %r1 = extractelement <2 x float> %vr1, i32 0
  %r2 = extractelement <2 x float> %vr1, i32 1
  %r3 = extractelement <2 x float> %vr2, i32 0
  %r4 = extractelement <2 x float> %vr2, i32 1
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
  ret void
}

; Integer-typed variants (v4i32, v2i32, i32) select the same load
; instructions as the float ones; results are bitcast to float for return.
define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(ptr addrspace(8) inreg) {
; PREGFX10-LABEL: buffer_load_int:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; PREGFX10-NEXT:    buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc
; PREGFX10-NEXT:    buffer_load_dword v6, off, s[0:3], 0 slc
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: buffer_load_int:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX10-NEXT:    buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc
; GFX10-NEXT:    buffer_load_dword v6, off, s[0:3], 0 slc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: buffer_load_int:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT:    buffer_load_b64 v[4:5], off, s[0:3], 0 glc
; GFX11-NEXT:    buffer_load_b32 v6, off, s[0:3], 0 slc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %data = call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0)
  %data_glc = call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) %0, i32 0, i32 0, i32 1)
  %data_slc = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) %0, i32 0, i32 0, i32 2)
  %fdata = bitcast <4 x i32> %data to <4 x float>
  %fdata_glc = bitcast <2 x i32> %data_glc to <2 x float>
  %fdata_slc = bitcast i32 %data_slc to float
  %r0 = insertvalue {<4 x float>, <2 x float>, float} undef, <4 x float> %fdata, 0
  %r1 = insertvalue {<4 x float>, <2 x float>, float} %r0, <2 x float> %fdata_glc, 1
  %r2 = insertvalue {<4 x float>, <2 x float>, float} %r1, float %fdata_slc, 2
  ret {<4 x float>, <2 x float>, float} %r2
}

; i8 load followed by zext: an unsigned byte load, converted with
; v_cvt_f32_ubyte0.
define amdgpu_ps float @raw_ptr_buffer_load_ubyte(ptr addrspace(8) inreg %rsrc) {
; PREGFX10-LABEL: raw_ptr_buffer_load_ubyte:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: raw_ptr_buffer_load_ubyte:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: raw_ptr_buffer_load_ubyte:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %tmp = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
  %tmp2 = zext i8 %tmp to i32
  %val = uitofp i32 %tmp2 to float
  ret float %val
}

; i16 load followed by zext: an unsigned short load.
define amdgpu_ps float @raw_ptr_buffer_load_i16(ptr addrspace(8) inreg %rsrc) {
; PREGFX10-LABEL: raw_ptr_buffer_load_i16:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: raw_ptr_buffer_load_i16:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: raw_ptr_buffer_load_i16:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %tmp = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
  %tmp2 = zext i16 %tmp to i32
  %val = uitofp i32 %tmp2 to float
  ret float %val
}

; i8 load followed by sext: a signed byte load.
define amdgpu_ps float @raw_ptr_buffer_load_sbyte(ptr addrspace(8) inreg %rsrc) {
; PREGFX10-LABEL: raw_ptr_buffer_load_sbyte:
; PREGFX10:       ; %bb.0: ; %main_body
; PREGFX10-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
; PREGFX10-NEXT:    s_waitcnt vmcnt(0)
; PREGFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
; PREGFX10-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: raw_ptr_buffer_load_sbyte:
; GFX10:       ; %bb.0: ; %main_body
; GFX10-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT:    ; return to shader part epilog
;
; GFX11-LABEL: raw_ptr_buffer_load_sbyte:
; GFX11:       ; %bb.0: ; %main_body
; GFX11-NEXT:    buffer_load_i8 v0, off, s[0:3], 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
; GFX11-NEXT:    ; return to shader part epilog
main_body:
  %tmp = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
  %tmp2 = sext i8 %tmp to i32
  %val = sitofp i32 %tmp2 to float
  ret float %val
}

define amdgpu_ps float
@raw_ptr_buffer_load_sshort(ptr addrspace(8) inreg %rsrc) { 862; PREGFX10-LABEL: raw_ptr_buffer_load_sshort: 863; PREGFX10: ; %bb.0: ; %main_body 864; PREGFX10-NEXT: buffer_load_sshort v0, off, s[0:3], 0 865; PREGFX10-NEXT: s_waitcnt vmcnt(0) 866; PREGFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 867; PREGFX10-NEXT: ; return to shader part epilog 868; 869; GFX10-LABEL: raw_ptr_buffer_load_sshort: 870; GFX10: ; %bb.0: ; %main_body 871; GFX10-NEXT: buffer_load_sshort v0, off, s[0:3], 0 872; GFX10-NEXT: s_waitcnt vmcnt(0) 873; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 874; GFX10-NEXT: ; return to shader part epilog 875; 876; GFX11-LABEL: raw_ptr_buffer_load_sshort: 877; GFX11: ; %bb.0: ; %main_body 878; GFX11-NEXT: buffer_load_i16 v0, off, s[0:3], 0 879; GFX11-NEXT: s_waitcnt vmcnt(0) 880; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 881; GFX11-NEXT: ; return to shader part epilog 882main_body: 883 %tmp = call i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) 884 %tmp2 = sext i16 %tmp to i32 885 %val = sitofp i32 %tmp2 to float 886 ret float %val 887} 888 889define amdgpu_ps void @raw_ptr_buffer_load_f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { 890; PREGFX10-LABEL: raw_ptr_buffer_load_f16: 891; PREGFX10: ; %bb.0: ; %main_body 892; PREGFX10-NEXT: buffer_load_ushort v1, off, s[0:3], 0 893; PREGFX10-NEXT: s_mov_b32 m0, -1 894; PREGFX10-NEXT: s_waitcnt vmcnt(0) 895; PREGFX10-NEXT: ds_write_b16 v0, v1 896; PREGFX10-NEXT: s_endpgm 897; 898; GFX10-LABEL: raw_ptr_buffer_load_f16: 899; GFX10: ; %bb.0: ; %main_body 900; GFX10-NEXT: buffer_load_ushort v1, off, s[0:3], 0 901; GFX10-NEXT: s_waitcnt vmcnt(0) 902; GFX10-NEXT: ds_write_b16 v0, v1 903; GFX10-NEXT: s_endpgm 904; 905; GFX11-LABEL: raw_ptr_buffer_load_f16: 906; GFX11: ; %bb.0: ; %main_body 907; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 908; GFX11-NEXT: s_waitcnt vmcnt(0) 909; GFX11-NEXT: ds_store_b16 v0, v1 910; GFX11-NEXT: s_endpgm 911main_body: 912 %val = call half 
@llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) 913 store half %val, ptr addrspace(3) %ptr 914 ret void 915} 916 917define amdgpu_ps void @raw_ptr_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { 918; PREGFX10-LABEL: raw_ptr_buffer_load_v2f16: 919; PREGFX10: ; %bb.0: ; %main_body 920; PREGFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 921; PREGFX10-NEXT: s_mov_b32 m0, -1 922; PREGFX10-NEXT: s_waitcnt vmcnt(0) 923; PREGFX10-NEXT: ds_write_b32 v0, v1 924; PREGFX10-NEXT: s_endpgm 925; 926; GFX10-LABEL: raw_ptr_buffer_load_v2f16: 927; GFX10: ; %bb.0: ; %main_body 928; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 929; GFX10-NEXT: s_waitcnt vmcnt(0) 930; GFX10-NEXT: ds_write_b32 v0, v1 931; GFX10-NEXT: s_endpgm 932; 933; GFX11-LABEL: raw_ptr_buffer_load_v2f16: 934; GFX11: ; %bb.0: ; %main_body 935; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 936; GFX11-NEXT: s_waitcnt vmcnt(0) 937; GFX11-NEXT: ds_store_b32 v0, v1 938; GFX11-NEXT: s_endpgm 939main_body: 940 %val = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) 941 store <2 x half> %val, ptr addrspace(3) %ptr 942 ret void 943} 944 945define amdgpu_ps void @raw_ptr_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { 946; PREGFX10-LABEL: raw_ptr_buffer_load_v4f16: 947; PREGFX10: ; %bb.0: ; %main_body 948; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 949; PREGFX10-NEXT: s_mov_b32 m0, -1 950; PREGFX10-NEXT: s_waitcnt vmcnt(0) 951; PREGFX10-NEXT: ds_write_b64 v0, v[1:2] 952; PREGFX10-NEXT: s_endpgm 953; 954; GFX10-LABEL: raw_ptr_buffer_load_v4f16: 955; GFX10: ; %bb.0: ; %main_body 956; GFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 957; GFX10-NEXT: s_waitcnt vmcnt(0) 958; GFX10-NEXT: ds_write_b64 v0, v[1:2] 959; GFX10-NEXT: s_endpgm 960; 961; GFX11-LABEL: raw_ptr_buffer_load_v4f16: 962; GFX11: ; %bb.0: ; %main_body 963; GFX11-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 
964; GFX11-NEXT: s_waitcnt vmcnt(0) 965; GFX11-NEXT: ds_store_b64 v0, v[1:2] 966; GFX11-NEXT: s_endpgm 967main_body: 968 %val = call <4 x half> @llvm.amdgcn.raw.ptr.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) 969 store <4 x half> %val, ptr addrspace(3) %ptr 970 ret void 971} 972 973define amdgpu_ps void @raw_ptr_buffer_load_v2i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { 974; PREGFX10-LABEL: raw_ptr_buffer_load_v2i16: 975; PREGFX10: ; %bb.0: ; %main_body 976; PREGFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 977; PREGFX10-NEXT: s_mov_b32 m0, -1 978; PREGFX10-NEXT: s_waitcnt vmcnt(0) 979; PREGFX10-NEXT: ds_write_b32 v0, v1 980; PREGFX10-NEXT: s_endpgm 981; 982; GFX10-LABEL: raw_ptr_buffer_load_v2i16: 983; GFX10: ; %bb.0: ; %main_body 984; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 985; GFX10-NEXT: s_waitcnt vmcnt(0) 986; GFX10-NEXT: ds_write_b32 v0, v1 987; GFX10-NEXT: s_endpgm 988; 989; GFX11-LABEL: raw_ptr_buffer_load_v2i16: 990; GFX11: ; %bb.0: ; %main_body 991; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 992; GFX11-NEXT: s_waitcnt vmcnt(0) 993; GFX11-NEXT: ds_store_b32 v0, v1 994; GFX11-NEXT: s_endpgm 995main_body: 996 %val = call <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) 997 store <2 x i16> %val, ptr addrspace(3) %ptr 998 ret void 999} 1000 1001define amdgpu_ps void @raw_ptr_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr) { 1002; PREGFX10-LABEL: raw_ptr_buffer_load_v4i16: 1003; PREGFX10: ; %bb.0: ; %main_body 1004; PREGFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 1005; PREGFX10-NEXT: s_mov_b32 m0, -1 1006; PREGFX10-NEXT: s_waitcnt vmcnt(0) 1007; PREGFX10-NEXT: ds_write_b64 v0, v[1:2] 1008; PREGFX10-NEXT: s_endpgm 1009; 1010; GFX10-LABEL: raw_ptr_buffer_load_v4i16: 1011; GFX10: ; %bb.0: ; %main_body 1012; GFX10-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 1013; GFX10-NEXT: s_waitcnt vmcnt(0) 1014; GFX10-NEXT: ds_write_b64 v0, v[1:2] 
1015; GFX10-NEXT: s_endpgm 1016; 1017; GFX11-LABEL: raw_ptr_buffer_load_v4i16: 1018; GFX11: ; %bb.0: ; %main_body 1019; GFX11-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 1020; GFX11-NEXT: s_waitcnt vmcnt(0) 1021; GFX11-NEXT: ds_store_b64 v0, v[1:2] 1022; GFX11-NEXT: s_endpgm 1023main_body: 1024 %val = call <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) 1025 store <4 x i16> %val, ptr addrspace(3) %ptr 1026 ret void 1027} 1028 1029define amdgpu_ps void @raw_ptr_buffer_load_x1_offset_merged(ptr addrspace(8) inreg %rsrc) { 1030; PREGFX10-LABEL: raw_ptr_buffer_load_x1_offset_merged: 1031; PREGFX10: ; %bb.0: ; %main_body 1032; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4 1033; PREGFX10-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:28 1034; PREGFX10-NEXT: s_waitcnt vmcnt(1) 1035; PREGFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm 1036; PREGFX10-NEXT: s_waitcnt vmcnt(0) 1037; PREGFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm 1038; PREGFX10-NEXT: s_endpgm 1039; 1040; GFX10-LABEL: raw_ptr_buffer_load_x1_offset_merged: 1041; GFX10: ; %bb.0: ; %main_body 1042; GFX10-NEXT: s_clause 0x1 1043; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4 1044; GFX10-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:28 1045; GFX10-NEXT: s_waitcnt vmcnt(1) 1046; GFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm 1047; GFX10-NEXT: s_waitcnt vmcnt(0) 1048; GFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm 1049; GFX10-NEXT: s_endpgm 1050; 1051; GFX11-LABEL: raw_ptr_buffer_load_x1_offset_merged: 1052; GFX11: ; %bb.0: ; %main_body 1053; GFX11-NEXT: s_clause 0x1 1054; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 offset:4 1055; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[0:3], 0 offset:28 1056; GFX11-NEXT: s_waitcnt vmcnt(1) 1057; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done 1058; GFX11-NEXT: s_waitcnt vmcnt(0) 1059; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done 1060; GFX11-NEXT: s_endpgm 1061main_body: 1062 %r1 = call float 
@llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) 1063 %r2 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 8, i32 0, i32 0) 1064 %r3 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0) 1065 %r4 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 16, i32 0, i32 0) 1066 %r5 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 28, i32 0, i32 0) 1067 %r6 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 32, i32 0, i32 0) 1068 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) 1069 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) 1070 ret void 1071} 1072 1073define amdgpu_ps void @raw_ptr_buffer_load_x1_offset_swizzled_not_merged(ptr addrspace(8) inreg %rsrc) { 1074; PREGFX10-LABEL: raw_ptr_buffer_load_x1_offset_swizzled_not_merged: 1075; PREGFX10: ; %bb.0: ; %main_body 1076; PREGFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 1077; PREGFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 1078; PREGFX10-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:12 1079; PREGFX10-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:16 1080; PREGFX10-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:28 1081; PREGFX10-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:32 1082; PREGFX10-NEXT: s_waitcnt vmcnt(2) 1083; PREGFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm 1084; PREGFX10-NEXT: s_waitcnt vmcnt(0) 1085; PREGFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm 1086; PREGFX10-NEXT: s_endpgm 1087; 1088; GFX10-LABEL: raw_ptr_buffer_load_x1_offset_swizzled_not_merged: 1089; GFX10: ; %bb.0: ; %main_body 1090; GFX10-NEXT: s_clause 0x5 1091; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 1092; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 1093; GFX10-NEXT: 
buffer_load_dword v2, off, s[0:3], 0 offset:12 1094; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:16 1095; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:28 1096; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:32 1097; GFX10-NEXT: s_waitcnt vmcnt(2) 1098; GFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm 1099; GFX10-NEXT: s_waitcnt vmcnt(0) 1100; GFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm 1101; GFX10-NEXT: s_endpgm 1102; 1103; GFX11-LABEL: raw_ptr_buffer_load_x1_offset_swizzled_not_merged: 1104; GFX11: ; %bb.0: ; %main_body 1105; GFX11-NEXT: s_clause 0x5 1106; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 offset:4 1107; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:8 1108; GFX11-NEXT: buffer_load_b32 v2, off, s[0:3], 0 offset:12 1109; GFX11-NEXT: buffer_load_b32 v3, off, s[0:3], 0 offset:16 1110; GFX11-NEXT: buffer_load_b32 v4, off, s[0:3], 0 offset:28 1111; GFX11-NEXT: buffer_load_b32 v5, off, s[0:3], 0 offset:32 1112; GFX11-NEXT: s_waitcnt vmcnt(2) 1113; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done 1114; GFX11-NEXT: s_waitcnt vmcnt(0) 1115; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done 1116; GFX11-NEXT: s_endpgm 1117main_body: 1118 %r1 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 8) 1119 %r2 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 8, i32 0, i32 8) 1120 %r3 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 12, i32 0, i32 8) 1121 %r4 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 16, i32 0, i32 8) 1122 %r5 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 28, i32 0, i32 8) 1123 %r6 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 32, i32 0, i32 8) 1124 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) 1125 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, 
float undef, i1 true, i1 true) 1126 ret void 1127} 1128 1129define double @buffer_load_f64__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) { 1130; PREGFX10-LABEL: buffer_load_f64__voffset_add: 1131; PREGFX10: ; %bb.0: 1132; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1133; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 1134; PREGFX10-NEXT: s_waitcnt vmcnt(0) 1135; PREGFX10-NEXT: s_setpc_b64 s[30:31] 1136; 1137; GFX10-LABEL: buffer_load_f64__voffset_add: 1138; GFX10: ; %bb.0: 1139; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1140; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 1141; GFX10-NEXT: s_waitcnt vmcnt(0) 1142; GFX10-NEXT: s_setpc_b64 s[30:31] 1143; 1144; GFX11-LABEL: buffer_load_f64__voffset_add: 1145; GFX11: ; %bb.0: 1146; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1147; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:60 1148; GFX11-NEXT: s_waitcnt vmcnt(0) 1149; GFX11-NEXT: s_setpc_b64 s[30:31] 1150 %voffset.add = add i32 %voffset, 60 1151 %data = call double @llvm.amdgcn.raw.ptr.buffer.load.f64(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0) 1152 ret double %data 1153} 1154 1155define <2 x double> @buffer_load_v2f64__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) { 1156; PREGFX10-LABEL: buffer_load_v2f64__voffset_add: 1157; PREGFX10: ; %bb.0: 1158; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1159; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 1160; PREGFX10-NEXT: s_waitcnt vmcnt(0) 1161; PREGFX10-NEXT: s_setpc_b64 s[30:31] 1162; 1163; GFX10-LABEL: buffer_load_v2f64__voffset_add: 1164; GFX10: ; %bb.0: 1165; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1166; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 1167; GFX10-NEXT: s_waitcnt vmcnt(0) 1168; GFX10-NEXT: s_setpc_b64 s[30:31] 1169; 1170; GFX11-LABEL: buffer_load_v2f64__voffset_add: 1171; GFX11: ; %bb.0: 1172; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1173; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:60 1174; GFX11-NEXT: s_waitcnt vmcnt(0) 1175; GFX11-NEXT: s_setpc_b64 s[30:31] 1176 %voffset.add = add i32 %voffset, 60 1177 %data = call <2 x double> @llvm.amdgcn.raw.ptr.buffer.load.v2f64(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0) 1178 ret <2 x double> %data 1179} 1180 1181define i64 @buffer_load_i64__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) { 1182; PREGFX10-LABEL: buffer_load_i64__voffset_add: 1183; PREGFX10: ; %bb.0: 1184; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1185; PREGFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 1186; PREGFX10-NEXT: s_waitcnt vmcnt(0) 1187; PREGFX10-NEXT: s_setpc_b64 s[30:31] 1188; 1189; GFX10-LABEL: buffer_load_i64__voffset_add: 1190; GFX10: ; %bb.0: 1191; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1192; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:60 1193; GFX10-NEXT: s_waitcnt vmcnt(0) 1194; GFX10-NEXT: s_setpc_b64 s[30:31] 1195; 1196; GFX11-LABEL: buffer_load_i64__voffset_add: 1197; GFX11: ; %bb.0: 1198; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1199; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:60 1200; GFX11-NEXT: s_waitcnt vmcnt(0) 1201; GFX11-NEXT: s_setpc_b64 s[30:31] 1202 %voffset.add = add i32 %voffset, 60 1203 %data = call i64 @llvm.amdgcn.raw.ptr.buffer.load.i64(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0) 1204 ret i64 %data 1205} 1206 1207define <2 x i64> @buffer_load_v2i64__voffset_add(ptr addrspace(8) inreg %rsrc, i32 %voffset) { 1208; PREGFX10-LABEL: buffer_load_v2i64__voffset_add: 1209; PREGFX10: ; %bb.0: 1210; PREGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1211; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 1212; PREGFX10-NEXT: s_waitcnt vmcnt(0) 1213; PREGFX10-NEXT: s_setpc_b64 s[30:31] 1214; 1215; GFX10-LABEL: buffer_load_v2i64__voffset_add: 
1216; GFX10: ; %bb.0: 1217; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1218; GFX10-NEXT: buffer_load_dwordx4 v[0:3], v0, s[4:7], 0 offen offset:60 1219; GFX10-NEXT: s_waitcnt vmcnt(0) 1220; GFX10-NEXT: s_setpc_b64 s[30:31] 1221; 1222; GFX11-LABEL: buffer_load_v2i64__voffset_add: 1223; GFX11: ; %bb.0: 1224; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1225; GFX11-NEXT: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:60 1226; GFX11-NEXT: s_waitcnt vmcnt(0) 1227; GFX11-NEXT: s_setpc_b64 s[30:31] 1228 %voffset.add = add i32 %voffset, 60 1229 %data = call <2 x i64> @llvm.amdgcn.raw.ptr.buffer.load.v2i64(ptr addrspace(8) %rsrc, i32 %voffset.add, i32 0, i32 0) 1230 ret <2 x i64> %data 1231} 1232 1233declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #0 1234declare <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8), i32, i32, i32) #0 1235declare <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8), i32, i32, i32) #0 1236declare i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8), i32, i32, i32) #0 1237declare <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8), i32, i32, i32) #0 1238declare <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8), i32, i32, i32) #0 1239declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 1240declare i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8), i32, i32, i32) #0 1241declare i16 @llvm.amdgcn.raw.ptr.buffer.load.i16(ptr addrspace(8), i32, i32, i32) #0 1242declare <2 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v2i16(ptr addrspace(8), i32, i32, i32) #0 1243declare <4 x i16> @llvm.amdgcn.raw.ptr.buffer.load.v4i16(ptr addrspace(8), i32, i32, i32) #0 1244declare half @llvm.amdgcn.raw.ptr.buffer.load.f16(ptr addrspace(8), i32, i32, i32) #0 1245declare <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.v2f16(ptr addrspace(8), i32, i32, i32) #0 1246declare <4 x half> 
@llvm.amdgcn.raw.ptr.buffer.load.v4f16(ptr addrspace(8), i32, i32, i32) #0 1247attributes #0 = { nounwind readonly } 1248