1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 2; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940 %s 3; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90a %s 4 5define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0 { 6; GFX940-LABEL: preload_block_count_x: 7; GFX940: ; %bb.1: 8; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 9; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 10; GFX940-NEXT: s_waitcnt lgkmcnt(0) 11; GFX940-NEXT: s_branch .LBB0_0 12; GFX940-NEXT: .p2align 8 13; GFX940-NEXT: ; %bb.2: 14; GFX940-NEXT: .LBB0_0: 15; GFX940-NEXT: v_mov_b32_e32 v0, 0 16; GFX940-NEXT: v_mov_b32_e32 v1, s4 17; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 18; GFX940-NEXT: s_endpgm 19; 20; GFX90a-LABEL: preload_block_count_x: 21; GFX90a: ; %bb.1: 22; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 23; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 24; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 25; GFX90a-NEXT: s_branch .LBB0_0 26; GFX90a-NEXT: .p2align 8 27; GFX90a-NEXT: ; %bb.2: 28; GFX90a-NEXT: .LBB0_0: 29; GFX90a-NEXT: v_mov_b32_e32 v0, 0 30; GFX90a-NEXT: v_mov_b32_e32 v1, s8 31; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 32; GFX90a-NEXT: s_endpgm 33 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 34 %load = load i32, ptr addrspace(4) %imp_arg_ptr 35 store i32 %load, ptr addrspace(1) %out 36 ret void 37} 38 39define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inreg %out, i32 inreg) #0 { 40; GFX940-LABEL: preload_unused_arg_block_count_x: 41; GFX940: ; %bb.1: 42; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 43; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 44; GFX940-NEXT: s_load_dword s6, s[0:1], 0x10 45; GFX940-NEXT: s_waitcnt lgkmcnt(0) 46; GFX940-NEXT: s_branch .LBB1_0 47; GFX940-NEXT: .p2align 8 48; GFX940-NEXT: ; %bb.2: 49; GFX940-NEXT: .LBB1_0: 50; GFX940-NEXT: v_mov_b32_e32 v0, 0 51; GFX940-NEXT: v_mov_b32_e32 v1, s6 52; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 53; GFX940-NEXT: s_endpgm 54; 55; GFX90a-LABEL: preload_unused_arg_block_count_x: 56; GFX90a: ; %bb.1: 57; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 58; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 59; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 60; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 61; GFX90a-NEXT: s_branch .LBB1_0 62; GFX90a-NEXT: .p2align 8 63; GFX90a-NEXT: ; %bb.2: 64; GFX90a-NEXT: .LBB1_0: 65; GFX90a-NEXT: v_mov_b32_e32 v0, 0 66; GFX90a-NEXT: v_mov_b32_e32 v1, s10 67; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 68; GFX90a-NEXT: s_endpgm 69 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 70 %load = load i32, ptr addrspace(4) %imp_arg_ptr 71 store i32 %load, ptr addrspace(1) %out 72 ret void 73} 74 75define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %out, i256 inreg) { 76; GFX940-LABEL: no_free_sgprs_block_count_x: 77; GFX940: ; %bb.1: 78; GFX940-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 79; GFX940-NEXT: s_waitcnt lgkmcnt(0) 80; GFX940-NEXT: s_branch .LBB2_0 81; GFX940-NEXT: .p2align 8 82; GFX940-NEXT: ; %bb.2: 83; GFX940-NEXT: .LBB2_0: 84; GFX940-NEXT: s_load_dword s0, s[4:5], 0x28 85; GFX940-NEXT: v_mov_b32_e32 v0, 0 86; GFX940-NEXT: s_waitcnt lgkmcnt(0) 87; GFX940-NEXT: v_mov_b32_e32 v1, s0 88; GFX940-NEXT: global_store_dword v0, v1, s[8:9] sc0 sc1 89; GFX940-NEXT: s_endpgm 90; 91; GFX90a-LABEL: no_free_sgprs_block_count_x: 92; GFX90a: ; %bb.1: 93; GFX90a-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0 94; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 95; GFX90a-NEXT: s_branch .LBB2_0 96; GFX90a-NEXT: .p2align 8 97; GFX90a-NEXT: ; %bb.2: 98; GFX90a-NEXT: .LBB2_0: 99; GFX90a-NEXT: s_load_dword s0, s[8:9], 0x28 100; GFX90a-NEXT: v_mov_b32_e32 v0, 0 101; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 102; GFX90a-NEXT: v_mov_b32_e32 v1, s0 103; GFX90a-NEXT: global_store_dword v0, v1, s[12:13] 104; GFX90a-NEXT: s_endpgm 105 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 106 %load = load i32, ptr addrspace(4) %imp_arg_ptr 107 store i32 %load, ptr addrspace(1) %out 108 ret void 109} 110 111define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 { 112; GFX940-LABEL: no_inreg_block_count_x: 113; GFX940: ; %bb.0: 114; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 115; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 116; GFX940-NEXT: v_mov_b32_e32 v0, 0 117; GFX940-NEXT: s_waitcnt lgkmcnt(0) 118; GFX940-NEXT: v_mov_b32_e32 v1, s4 119; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 120; GFX940-NEXT: s_endpgm 121; 122; GFX90a-LABEL: no_inreg_block_count_x: 123; GFX90a: ; %bb.0: 124; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x8 125; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 126; GFX90a-NEXT: v_mov_b32_e32 v0, 0 127; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 128; GFX90a-NEXT: v_mov_b32_e32 v1, s2 129; GFX90a-NEXT: global_store_dword v0, v1, s[0:1] 130; GFX90a-NEXT: s_endpgm 131 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 132 %load = load i32, ptr addrspace(4) %imp_arg_ptr 133 store i32 %load, ptr addrspace(1) %out 134 ret void 135} 136 137; Implicit arg preloading is currently restricted to cases where all explicit 138; args are inreg (preloaded). 139 140define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 inreg) #0 { 141; GFX940-LABEL: mixed_inreg_block_count_x: 142; GFX940: ; %bb.0: 143; GFX940-NEXT: s_load_dword s4, s[0:1], 0x10 144; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 145; GFX940-NEXT: v_mov_b32_e32 v0, 0 146; GFX940-NEXT: s_waitcnt lgkmcnt(0) 147; GFX940-NEXT: v_mov_b32_e32 v1, s4 148; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 149; GFX940-NEXT: s_endpgm 150; 151; GFX90a-LABEL: mixed_inreg_block_count_x: 152; GFX90a: ; %bb.0: 153; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x10 154; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 155; GFX90a-NEXT: v_mov_b32_e32 v0, 0 156; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 157; GFX90a-NEXT: v_mov_b32_e32 v1, s2 158; GFX90a-NEXT: global_store_dword v0, v1, s[0:1] 159; GFX90a-NEXT: s_endpgm 160 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 161 %load = load i32, ptr addrspace(4) %imp_arg_ptr 162 store i32 %load, ptr addrspace(1) %out 163 ret void 164} 165 166define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inreg %out) #0 { 167; GFX940-LABEL: incorrect_type_i64_block_count_x: 168; GFX940: ; %bb.1: 169; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 170; GFX940-NEXT: s_waitcnt lgkmcnt(0) 171; GFX940-NEXT: s_branch .LBB5_0 172; GFX940-NEXT: .p2align 8 173; GFX940-NEXT: ; %bb.2: 174; GFX940-NEXT: .LBB5_0: 175; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 176; GFX940-NEXT: v_mov_b32_e32 v2, 0 177; GFX940-NEXT: s_waitcnt lgkmcnt(0) 178; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 179; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 180; GFX940-NEXT: s_endpgm 181; 182; GFX90a-LABEL: incorrect_type_i64_block_count_x: 183; GFX90a: ; %bb.1: 184; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 185; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 186; GFX90a-NEXT: s_branch .LBB5_0 187; GFX90a-NEXT: .p2align 8 188; GFX90a-NEXT: ; %bb.2: 189; GFX90a-NEXT: .LBB5_0: 190; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 191; GFX90a-NEXT: v_mov_b32_e32 v2, 0 192; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 193; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 194; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] 195; GFX90a-NEXT: s_endpgm 196 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 197 %load = load i64, ptr addrspace(4) %imp_arg_ptr 198 store i64 %load, ptr addrspace(1) %out 199 ret void 200} 201 202define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inreg %out) #0 { 203; GFX940-LABEL: incorrect_type_i16_block_count_x: 204; GFX940: ; %bb.1: 205; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 206; GFX940-NEXT: s_waitcnt lgkmcnt(0) 207; GFX940-NEXT: s_branch .LBB6_0 208; GFX940-NEXT: .p2align 8 209; GFX940-NEXT: ; %bb.2: 210; GFX940-NEXT: .LBB6_0: 211; GFX940-NEXT: s_load_dword s0, s[0:1], 0x8 212; GFX940-NEXT: v_mov_b32_e32 v0, 0 213; GFX940-NEXT: s_waitcnt lgkmcnt(0) 214; GFX940-NEXT: v_mov_b32_e32 v1, s0 215; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 216; GFX940-NEXT: s_endpgm 217; 218; GFX90a-LABEL: incorrect_type_i16_block_count_x: 219; GFX90a: ; %bb.1: 220; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 221; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 222; GFX90a-NEXT: s_branch .LBB6_0 223; GFX90a-NEXT: .p2align 8 224; GFX90a-NEXT: ; %bb.2: 225; GFX90a-NEXT: .LBB6_0: 226; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x8 227; GFX90a-NEXT: v_mov_b32_e32 v0, 0 228; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 229; GFX90a-NEXT: v_mov_b32_e32 v1, s0 230; GFX90a-NEXT: global_store_short v0, v1, s[6:7] 231; GFX90a-NEXT: s_endpgm 232 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 233 %load = load i16, ptr addrspace(4) %imp_arg_ptr 234 store i16 %load, ptr addrspace(1) %out 235 ret void 236} 237 238define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0 { 239; GFX940-LABEL: preload_block_count_y: 240; GFX940: ; %bb.1: 241; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 242; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 243; GFX940-NEXT: s_waitcnt lgkmcnt(0) 244; GFX940-NEXT: s_branch .LBB7_0 245; GFX940-NEXT: .p2align 8 246; GFX940-NEXT: ; %bb.2: 247; GFX940-NEXT: .LBB7_0: 248; GFX940-NEXT: v_mov_b32_e32 v0, 0 249; GFX940-NEXT: v_mov_b32_e32 v1, s5 250; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 251; GFX940-NEXT: s_endpgm 252; 253; GFX90a-LABEL: preload_block_count_y: 254; GFX90a: ; %bb.1: 255; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 256; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 257; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 258; GFX90a-NEXT: s_branch .LBB7_0 259; GFX90a-NEXT: .p2align 8 260; GFX90a-NEXT: ; %bb.2: 261; GFX90a-NEXT: .LBB7_0: 262; GFX90a-NEXT: v_mov_b32_e32 v0, 0 263; GFX90a-NEXT: v_mov_b32_e32 v1, s9 264; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 265; GFX90a-NEXT: s_endpgm 266 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 267 %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 268 %load = load i32, ptr addrspace(4) %gep 269 store i32 %load, ptr addrspace(1) %out 270 ret void 271} 272 273define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) #0 { 274; GFX940-LABEL: random_incorrect_offset: 275; GFX940: ; %bb.1: 276; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 277; GFX940-NEXT: s_waitcnt lgkmcnt(0) 278; GFX940-NEXT: s_branch .LBB8_0 279; GFX940-NEXT: .p2align 8 280; GFX940-NEXT: ; %bb.2: 281; GFX940-NEXT: .LBB8_0: 282; GFX940-NEXT: s_mov_b32 s4, 8 283; GFX940-NEXT: s_load_dword s0, s[0:1], s4 offset:0x2 284; GFX940-NEXT: v_mov_b32_e32 v0, 0 285; GFX940-NEXT: s_waitcnt lgkmcnt(0) 286; GFX940-NEXT: v_mov_b32_e32 v1, s0 287; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 288; GFX940-NEXT: s_endpgm 289; 290; GFX90a-LABEL: random_incorrect_offset: 291; GFX90a: ; %bb.1: 292; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 293; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 294; GFX90a-NEXT: s_branch .LBB8_0 295; GFX90a-NEXT: .p2align 8 296; GFX90a-NEXT: ; %bb.2: 297; GFX90a-NEXT: .LBB8_0: 298; GFX90a-NEXT: s_mov_b32 s0, 8 299; GFX90a-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2 300; GFX90a-NEXT: v_mov_b32_e32 v0, 0 301; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 302; GFX90a-NEXT: v_mov_b32_e32 v1, s0 303; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 304; GFX90a-NEXT: s_endpgm 305 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 306 %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2 307 %load = load i32, ptr addrspace(4) %gep 308 store i32 %load, ptr addrspace(1) %out 309 ret void 310} 311 312define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0 { 313; GFX940-LABEL: preload_block_count_z: 314; GFX940: ; %bb.1: 315; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 316; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 317; GFX940-NEXT: s_load_dword s6, s[0:1], 0x10 318; GFX940-NEXT: s_waitcnt lgkmcnt(0) 319; GFX940-NEXT: s_branch .LBB9_0 320; GFX940-NEXT: .p2align 8 321; GFX940-NEXT: ; %bb.2: 322; GFX940-NEXT: .LBB9_0: 323; GFX940-NEXT: v_mov_b32_e32 v0, 0 324; GFX940-NEXT: v_mov_b32_e32 v1, s6 325; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 326; GFX940-NEXT: s_endpgm 327; 328; GFX90a-LABEL: preload_block_count_z: 329; GFX90a: ; %bb.1: 330; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 331; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 332; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 333; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 334; GFX90a-NEXT: s_branch .LBB9_0 335; GFX90a-NEXT: .p2align 8 336; GFX90a-NEXT: ; %bb.2: 337; GFX90a-NEXT: .LBB9_0: 338; GFX90a-NEXT: v_mov_b32_e32 v0, 0 339; GFX90a-NEXT: v_mov_b32_e32 v1, s10 340; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 341; GFX90a-NEXT: s_endpgm 342 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 343 %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 344 %load = load i32, ptr addrspace(4) %gep 345 store i32 %load, ptr addrspace(1) %out 346 ret void 347} 348 349define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspace(1) inreg %out, i8 inreg %val) #0 { 350; GFX940-LABEL: preload_block_count_x_imparg_align_ptr_i8: 351; GFX940: ; %bb.1: 352; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 353; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 354; GFX940-NEXT: s_load_dword s6, s[0:1], 0x10 355; GFX940-NEXT: s_waitcnt lgkmcnt(0) 356; GFX940-NEXT: s_branch .LBB10_0 357; GFX940-NEXT: .p2align 8 358; GFX940-NEXT: ; %bb.2: 359; GFX940-NEXT: .LBB10_0: 360; GFX940-NEXT: s_and_b32 s0, s4, 0xff 361; GFX940-NEXT: s_add_i32 s0, s6, s0 362; GFX940-NEXT: v_mov_b32_e32 v0, 0 363; GFX940-NEXT: v_mov_b32_e32 v1, s0 364; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 365; GFX940-NEXT: s_endpgm 366; 367; GFX90a-LABEL: preload_block_count_x_imparg_align_ptr_i8: 368; GFX90a: ; %bb.1: 369; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 370; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 371; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 372; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 373; GFX90a-NEXT: s_branch .LBB10_0 374; GFX90a-NEXT: .p2align 8 375; GFX90a-NEXT: ; %bb.2: 376; GFX90a-NEXT: .LBB10_0: 377; GFX90a-NEXT: s_and_b32 s0, s8, 0xff 378; GFX90a-NEXT: s_add_i32 s0, s10, s0 379; GFX90a-NEXT: v_mov_b32_e32 v0, 0 380; GFX90a-NEXT: v_mov_b32_e32 v1, s0 381; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 382; GFX90a-NEXT: s_endpgm 383 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 384 %load = load i32, ptr addrspace(4) %imp_arg_ptr 385 %ext = zext i8 %val to i32 386 %add = add i32 %load, %ext 387 store i32 %add, ptr addrspace(1) %out 388 ret void 389} 390 391define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out) #0 { 392; GFX940-LABEL: preload_block_count_xyz: 393; GFX940: ; %bb.1: 394; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 395; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 396; GFX940-NEXT: s_load_dword s6, s[0:1], 0x10 397; GFX940-NEXT: s_waitcnt lgkmcnt(0) 398; GFX940-NEXT: s_branch .LBB11_0 399; GFX940-NEXT: .p2align 8 400; GFX940-NEXT: ; %bb.2: 401; GFX940-NEXT: .LBB11_0: 402; GFX940-NEXT: v_mov_b32_e32 v3, 0 403; GFX940-NEXT: v_mov_b32_e32 v0, s4 404; GFX940-NEXT: v_mov_b32_e32 v1, s5 405; GFX940-NEXT: v_mov_b32_e32 v2, s6 406; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 407; GFX940-NEXT: s_endpgm 408; 409; GFX90a-LABEL: preload_block_count_xyz: 410; GFX90a: ; %bb.1: 411; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 412; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 413; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 414; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 415; GFX90a-NEXT: s_branch .LBB11_0 416; GFX90a-NEXT: .p2align 8 417; GFX90a-NEXT: ; %bb.2: 418; GFX90a-NEXT: .LBB11_0: 419; GFX90a-NEXT: v_mov_b32_e32 v3, 0 420; GFX90a-NEXT: v_mov_b32_e32 v0, s8 421; GFX90a-NEXT: v_mov_b32_e32 v1, s9 422; GFX90a-NEXT: v_mov_b32_e32 v2, s10 423; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] 424; GFX90a-NEXT: s_endpgm 425 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 426 %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0 427 %load_x = load i32, ptr addrspace(4) %gep_x 428 %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4 429 %load_y = load i32, ptr addrspace(4) %gep_y 430 %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 431 %load_z = load i32, ptr addrspace(4) %gep_z 432 %ins.0 = insertelement <3 x i32> poison, i32 %load_x, i32 0 433 %ins.1 = insertelement <3 x i32> %ins.0, i32 %load_y, i32 1 434 %ins.2 = insertelement <3 x i32> %ins.1, i32 %load_z, i32 2 435 store <3 x i32> %ins.2, ptr addrspace(1) %out 436 ret void 437} 438 439define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out) #0 { 440; GFX940-LABEL: preload_workgroup_size_x: 441; GFX940: ; %bb.1: 442; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 443; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 444; GFX940-NEXT: s_waitcnt lgkmcnt(0) 445; GFX940-NEXT: s_branch .LBB12_0 446; GFX940-NEXT: .p2align 8 447; GFX940-NEXT: ; %bb.2: 448; GFX940-NEXT: .LBB12_0: 449; GFX940-NEXT: s_and_b32 s0, s7, 0xffff 450; GFX940-NEXT: v_mov_b32_e32 v0, 0 451; GFX940-NEXT: v_mov_b32_e32 v1, s0 452; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 453; GFX940-NEXT: s_endpgm 454; 455; GFX90a-LABEL: preload_workgroup_size_x: 456; GFX90a: ; %bb.1: 457; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 458; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 459; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 460; GFX90a-NEXT: s_branch .LBB12_0 461; GFX90a-NEXT: .p2align 8 462; GFX90a-NEXT: ; %bb.2: 463; GFX90a-NEXT: .LBB12_0: 464; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff 465; GFX90a-NEXT: v_mov_b32_e32 v0, 0 466; GFX90a-NEXT: v_mov_b32_e32 v1, s0 467; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 468; GFX90a-NEXT: s_endpgm 469 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 470 %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 471 %load = load i16, ptr addrspace(4) %gep 472 %conv = zext i16 %load to i32 473 store i32 %conv, ptr addrspace(1) %out 474 ret void 475} 476 477define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out) #0 { 478; GFX940-LABEL: preload_workgroup_size_y: 479; GFX940: ; %bb.1: 480; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 481; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 482; GFX940-NEXT: s_waitcnt lgkmcnt(0) 483; GFX940-NEXT: s_branch .LBB13_0 484; GFX940-NEXT: .p2align 8 485; GFX940-NEXT: ; %bb.2: 486; GFX940-NEXT: .LBB13_0: 487; GFX940-NEXT: s_lshr_b32 s0, s7, 16 488; GFX940-NEXT: v_mov_b32_e32 v0, 0 489; GFX940-NEXT: v_mov_b32_e32 v1, s0 490; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 491; GFX940-NEXT: s_endpgm 492; 493; GFX90a-LABEL: preload_workgroup_size_y: 494; GFX90a: ; %bb.1: 495; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 496; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 497; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 498; GFX90a-NEXT: s_branch .LBB13_0 499; GFX90a-NEXT: .p2align 8 500; GFX90a-NEXT: ; %bb.2: 501; GFX90a-NEXT: .LBB13_0: 502; GFX90a-NEXT: s_lshr_b32 s0, s11, 16 503; GFX90a-NEXT: v_mov_b32_e32 v0, 0 504; GFX90a-NEXT: v_mov_b32_e32 v1, s0 505; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 506; GFX90a-NEXT: s_endpgm 507 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 508 %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 509 %load = load i16, ptr addrspace(4) %gep 510 %conv = zext i16 %load to i32 511 store i32 %conv, ptr addrspace(1) %out 512 ret void 513} 514 515define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out) #0 { 516; GFX940-LABEL: preload_workgroup_size_z: 517; GFX940: ; %bb.1: 518; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 519; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 520; GFX940-NEXT: s_load_dword s8, s[0:1], 0x18 521; GFX940-NEXT: s_waitcnt lgkmcnt(0) 522; GFX940-NEXT: s_branch .LBB14_0 523; GFX940-NEXT: .p2align 8 524; GFX940-NEXT: ; %bb.2: 525; GFX940-NEXT: .LBB14_0: 526; GFX940-NEXT: s_and_b32 s0, s8, 0xffff 527; GFX940-NEXT: v_mov_b32_e32 v0, 0 528; GFX940-NEXT: v_mov_b32_e32 v1, s0 529; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 530; GFX940-NEXT: s_endpgm 531; 532; GFX90a-LABEL: preload_workgroup_size_z: 533; GFX90a: ; %bb.1: 534; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 535; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 536; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 537; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 538; GFX90a-NEXT: s_branch .LBB14_0 539; GFX90a-NEXT: .p2align 8 540; GFX90a-NEXT: ; %bb.2: 541; GFX90a-NEXT: .LBB14_0: 542; GFX90a-NEXT: s_and_b32 s0, s12, 0xffff 543; GFX90a-NEXT: v_mov_b32_e32 v0, 0 544; GFX90a-NEXT: v_mov_b32_e32 v1, s0 545; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 546; GFX90a-NEXT: s_endpgm 547 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 548 %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 549 %load = load i16, ptr addrspace(4) %gep 550 %conv = zext i16 %load to i32 551 store i32 %conv, ptr addrspace(1) %out 552 ret void 553} 554 555define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %out) #0 { 556; GFX940-LABEL: preload_workgroup_size_xyz: 557; GFX940: ; %bb.1: 558; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 559; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 560; GFX940-NEXT: s_load_dword s8, s[0:1], 0x18 561; GFX940-NEXT: s_waitcnt lgkmcnt(0) 562; GFX940-NEXT: s_branch .LBB15_0 563; GFX940-NEXT: .p2align 8 564; GFX940-NEXT: ; %bb.2: 565; GFX940-NEXT: .LBB15_0: 566; GFX940-NEXT: s_lshr_b32 s0, s7, 16 567; GFX940-NEXT: s_and_b32 s1, s7, 0xffff 568; GFX940-NEXT: s_and_b32 s4, s8, 0xffff 569; GFX940-NEXT: v_mov_b32_e32 v3, 0 570; GFX940-NEXT: v_mov_b32_e32 v0, s1 571; GFX940-NEXT: v_mov_b32_e32 v1, s0 572; GFX940-NEXT: v_mov_b32_e32 v2, s4 573; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 574; GFX940-NEXT: s_endpgm 575; 576; GFX90a-LABEL: preload_workgroup_size_xyz: 577; GFX90a: ; %bb.1: 578; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 579; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 580; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 581; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 582; GFX90a-NEXT: s_branch .LBB15_0 583; GFX90a-NEXT: .p2align 8 584; GFX90a-NEXT: ; %bb.2: 585; GFX90a-NEXT: .LBB15_0: 586; GFX90a-NEXT: s_lshr_b32 s0, s11, 16 587; GFX90a-NEXT: s_and_b32 s1, s11, 0xffff 588; GFX90a-NEXT: s_and_b32 s2, s12, 0xffff 589; GFX90a-NEXT: v_mov_b32_e32 v3, 0 590; GFX90a-NEXT: v_mov_b32_e32 v0, s1 591; GFX90a-NEXT: v_mov_b32_e32 v1, s0 592; GFX90a-NEXT: v_mov_b32_e32 v2, s2 593; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] 594; GFX90a-NEXT: s_endpgm 595 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 596 %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 597 %load_x = load i16, ptr addrspace(4) %gep_x 598 %conv_x = zext i16 %load_x to i32 599 %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 600 %load_y = load i16, ptr addrspace(4) %gep_y 601 %conv_y = zext i16 %load_y to i32 602 %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 603 %load_z = load i16, ptr addrspace(4) %gep_z 604 %conv_z = zext i16 %load_z to i32 605 %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0 606 %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1 607 %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 608 store <3 x i32> %ins.2, ptr addrspace(1) %out 609 ret void 610} 611 612define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 { 613; GFX940-LABEL: preload_remainder_x: 614; GFX940: ; %bb.1: 615; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 616; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 617; GFX940-NEXT: s_load_dword s8, s[0:1], 0x18 618; GFX940-NEXT: s_waitcnt lgkmcnt(0) 619; GFX940-NEXT: s_branch .LBB16_0 620; GFX940-NEXT: .p2align 8 621; GFX940-NEXT: ; %bb.2: 622; GFX940-NEXT: .LBB16_0: 623; GFX940-NEXT: s_lshr_b32 s0, s8, 16 624; GFX940-NEXT: v_mov_b32_e32 v0, 0 625; GFX940-NEXT: v_mov_b32_e32 v1, s0 626; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 627; GFX940-NEXT: s_endpgm 628; 629; GFX90a-LABEL: preload_remainder_x: 630; GFX90a: ; %bb.1: 631; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 632; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 633; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18 634; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 635; GFX90a-NEXT: s_branch .LBB16_0 636; GFX90a-NEXT: .p2align 8 637; GFX90a-NEXT: ; %bb.2: 638; GFX90a-NEXT: .LBB16_0: 639; GFX90a-NEXT: s_lshr_b32 s0, s12, 16 640; GFX90a-NEXT: v_mov_b32_e32 v0, 0 641; GFX90a-NEXT: v_mov_b32_e32 v1, s0 642; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 643; GFX90a-NEXT: s_endpgm 644 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 645 %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 646 %load = load i16, ptr addrspace(4) %gep 647 %conv = zext i16 %load to i32 648 store i32 %conv, ptr addrspace(1) %out 649 ret void 650} 651 652define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 { 653; GFX940-LABEL: preloadremainder_y: 654; GFX940: ; %bb.1: 655; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 656; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 657; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18 658; GFX940-NEXT: s_waitcnt lgkmcnt(0) 659; GFX940-NEXT: s_branch .LBB17_0 660; GFX940-NEXT: .p2align 8 661; GFX940-NEXT: ; %bb.2: 662; GFX940-NEXT: .LBB17_0: 663; GFX940-NEXT: s_and_b32 s0, s9, 0xffff 664; GFX940-NEXT: v_mov_b32_e32 v0, 0 665; GFX940-NEXT: v_mov_b32_e32 v1, s0 666; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 667; GFX940-NEXT: s_endpgm 668; 669; GFX90a-LABEL: preloadremainder_y: 670; GFX90a: ; %bb.1: 671; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 672; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 673; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 674; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 675; GFX90a-NEXT: s_branch .LBB17_0 676; GFX90a-NEXT: .p2align 8 677; GFX90a-NEXT: ; %bb.2: 678; GFX90a-NEXT: .LBB17_0: 679; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff 680; GFX90a-NEXT: v_mov_b32_e32 v0, 0 681; GFX90a-NEXT: v_mov_b32_e32 v1, s0 682; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 683; GFX90a-NEXT: s_endpgm 684 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 685 %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20 686 %load = load i16, ptr addrspace(4) %gep 687 %conv = zext i16 %load to i32 688 store i32 %conv, ptr addrspace(1) %out 689 ret void 690} 691 692define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 { 693; GFX940-LABEL: preloadremainder_z: 694; GFX940: ; %bb.1: 695; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 696; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 697; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18 698; GFX940-NEXT: s_waitcnt lgkmcnt(0) 699; GFX940-NEXT: s_branch .LBB18_0 700; GFX940-NEXT: .p2align 8 701; GFX940-NEXT: ; %bb.2: 702; GFX940-NEXT: .LBB18_0: 703; GFX940-NEXT: s_lshr_b32 s0, s9, 16 704; GFX940-NEXT: v_mov_b32_e32 v0, 0 705; GFX940-NEXT: v_mov_b32_e32 v1, s0 706; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 707; GFX940-NEXT: s_endpgm 708; 709; GFX90a-LABEL: preloadremainder_z: 710; GFX90a: ; %bb.1: 711; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 712; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 713; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 714; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 715; GFX90a-NEXT: s_branch .LBB18_0 716; GFX90a-NEXT: .p2align 8 717; GFX90a-NEXT: ; %bb.2: 718; GFX90a-NEXT: .LBB18_0: 719; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 720; GFX90a-NEXT: v_mov_b32_e32 v0, 0 721; GFX90a-NEXT: v_mov_b32_e32 v1, s0 722; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 723; GFX90a-NEXT: s_endpgm 724 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 725 %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 726 %load = load i16, ptr addrspace(4) %gep 727 %conv = zext i16 %load to i32 728 store i32 %conv, ptr addrspace(1) %out 729 ret void 730} 731 732define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0 { 733; GFX940-LABEL: preloadremainder_xyz: 734; GFX940: ; %bb.1: 735; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 736; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 737; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18 738; GFX940-NEXT: s_waitcnt lgkmcnt(0) 739; GFX940-NEXT: s_branch .LBB19_0 740; GFX940-NEXT: .p2align 8 741; GFX940-NEXT: ; %bb.2: 742; GFX940-NEXT: .LBB19_0: 743; GFX940-NEXT: s_lshr_b32 s0, s9, 16 744; GFX940-NEXT: s_lshr_b32 s1, s8, 16 745; GFX940-NEXT: s_and_b32 s4, s9, 0xffff 746; GFX940-NEXT: v_mov_b32_e32 v3, 0 747; GFX940-NEXT: v_mov_b32_e32 v0, s1 748; GFX940-NEXT: v_mov_b32_e32 v1, s4 749; GFX940-NEXT: v_mov_b32_e32 v2, s0 750; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 751; GFX940-NEXT: s_endpgm 752; 753; GFX90a-LABEL: preloadremainder_xyz: 754; GFX90a: ; %bb.1: 755; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 756; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 757; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 758; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 759; GFX90a-NEXT: s_branch .LBB19_0 760; GFX90a-NEXT: .p2align 8 761; GFX90a-NEXT: ; %bb.2: 762; GFX90a-NEXT: .LBB19_0: 763; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 764; GFX90a-NEXT: s_lshr_b32 s1, s12, 16 765; GFX90a-NEXT: s_and_b32 s2, s13, 0xffff 766; GFX90a-NEXT: v_mov_b32_e32 v3, 0 767; GFX90a-NEXT: v_mov_b32_e32 v0, s1 768; GFX90a-NEXT: v_mov_b32_e32 v1, s2 769; GFX90a-NEXT: v_mov_b32_e32 v2, s0 770; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] 771; GFX90a-NEXT: s_endpgm 772 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 773 %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18 774 %load_x = load i16, ptr addrspace(4) %gep_x 775 %conv_x = zext i16 %load_x to i32 776 %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20 777 %load_y = load i16, ptr addrspace(4) %gep_y 778 %conv_y = zext i16 %load_y to i32 779 %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 780 %load_z = load i16, ptr addrspace(4) %gep_z 781 %conv_z = zext i16 %load_z to i32 782 %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0 783 %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1 784 %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 785 store <3 x i32> %ins.2, ptr addrspace(1) %out 786 ret void 787} 788 789define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inreg %out) { 790; GFX940-LABEL: no_free_sgprs_preloadremainder_z: 791; GFX940: ; %bb.1: 792; GFX940-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 793; GFX940-NEXT: s_waitcnt lgkmcnt(0) 794; GFX940-NEXT: s_branch .LBB20_0 795; GFX940-NEXT: .p2align 8 796; GFX940-NEXT: ; %bb.2: 797; GFX940-NEXT: .LBB20_0: 798; GFX940-NEXT: s_lshr_b32 s0, s15, 16 799; GFX940-NEXT: v_mov_b32_e32 v0, 0 800; GFX940-NEXT: v_mov_b32_e32 v1, s0 801; GFX940-NEXT: global_store_dword v0, v1, s[8:9] sc0 sc1 802; GFX940-NEXT: s_endpgm 803; 804; GFX90a-LABEL: no_free_sgprs_preloadremainder_z: 805; GFX90a: ; %bb.1: 806; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 807; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 808; GFX90a-NEXT: s_branch .LBB20_0 809; GFX90a-NEXT: .p2align 8 810; GFX90a-NEXT: ; %bb.2: 811; GFX90a-NEXT: .LBB20_0: 812; GFX90a-NEXT: s_load_dword s0, s[8:9], 0x1c 813; GFX90a-NEXT: v_mov_b32_e32 v0, 0 814; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 815; GFX90a-NEXT: s_lshr_b32 s0, s0, 16 816; GFX90a-NEXT: v_mov_b32_e32 v1, s0 817; GFX90a-NEXT: global_store_dword v0, v1, s[12:13] 818; GFX90a-NEXT: s_endpgm 819 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 820 %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 821 %load = load i16, ptr addrspace(4) %gep 822 %conv = zext i16 %load to i32 823 store i32 %conv, ptr addrspace(1) %out 824 ret void 825} 826 827; Check for consistency between isel and earlier passes preload SGPR accounting with max preload SGPRs. 828 829define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %out, i192 inreg %t0, i32 inreg %t1) #0 { 830; GFX940-LABEL: preload_block_max_user_sgprs: 831; GFX940: ; %bb.1: 832; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 833; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8 834; GFX940-NEXT: s_load_dword s12, s[0:1], 0x28 835; GFX940-NEXT: s_waitcnt lgkmcnt(0) 836; GFX940-NEXT: s_branch .LBB21_0 837; GFX940-NEXT: .p2align 8 838; GFX940-NEXT: ; %bb.2: 839; GFX940-NEXT: .LBB21_0: 840; GFX940-NEXT: v_mov_b32_e32 v0, 0 841; GFX940-NEXT: v_mov_b32_e32 v1, s12 842; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 843; GFX940-NEXT: s_endpgm 844; 845; GFX90a-LABEL: preload_block_max_user_sgprs: 846; GFX90a: ; %bb.1: 847; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 848; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 849; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 850; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x20 851; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 852; GFX90a-NEXT: s_branch .LBB21_0 853; GFX90a-NEXT: .p2align 8 854; GFX90a-NEXT: ; %bb.2: 855; GFX90a-NEXT: .LBB21_0: 856; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x28 857; GFX90a-NEXT: v_mov_b32_e32 v0, 0 858; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 859; GFX90a-NEXT: v_mov_b32_e32 v1, s0 860; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 861; GFX90a-NEXT: s_endpgm 862 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 863 %load = load i32, ptr addrspace(4) %imp_arg_ptr 864 store i32 %load, ptr addrspace(1) %out 865 ret void 866} 867 868define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(ptr addrspace(1) inreg %out) #0 { 869; GFX940-LABEL: preload_block_count_z_workgroup_size_z_remainder_z: 870; GFX940: ; %bb.1: 871; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 872; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 873; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18 874; GFX940-NEXT: s_waitcnt lgkmcnt(0) 875; GFX940-NEXT: s_branch .LBB22_0 876; GFX940-NEXT: .p2align 8 877; GFX940-NEXT: ; %bb.2: 878; GFX940-NEXT: .LBB22_0: 879; GFX940-NEXT: s_lshr_b32 s0, s9, 16 880; GFX940-NEXT: s_and_b32 s1, s8, 0xffff 881; GFX940-NEXT: v_mov_b32_e32 v3, 0 882; GFX940-NEXT: v_mov_b32_e32 v0, s6 883; GFX940-NEXT: v_mov_b32_e32 v1, s1 884; GFX940-NEXT: v_mov_b32_e32 v2, s0 885; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 886; GFX940-NEXT: s_endpgm 887; 888; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z: 889; GFX90a: ; %bb.1: 890; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 891; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 892; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 893; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 894; GFX90a-NEXT: s_branch .LBB22_0 895; GFX90a-NEXT: .p2align 8 896; GFX90a-NEXT: ; %bb.2: 897; GFX90a-NEXT: .LBB22_0: 898; GFX90a-NEXT: s_lshr_b32 s0, s13, 16 899; GFX90a-NEXT: s_and_b32 s1, s12, 0xffff 900; GFX90a-NEXT: v_mov_b32_e32 v3, 0 901; GFX90a-NEXT: v_mov_b32_e32 v0, s10 902; GFX90a-NEXT: v_mov_b32_e32 v1, s1 903; GFX90a-NEXT: v_mov_b32_e32 v2, s0 904; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] 905; GFX90a-NEXT: s_endpgm 906 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 907 %gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8 908 %gep1 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 909 %gep2 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 910 %load0 = load i32, ptr addrspace(4) %gep0 911 %load1 = load i16, ptr addrspace(4) %gep1 912 %load2 = load i16, ptr addrspace(4) %gep2 913 %conv1 = zext i16 %load1 to i32 914 %conv2 = zext i16 %load2 to i32 915 %ins.0 = insertelement <3 x i32> poison, i32 %load0, i32 0 916 %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv1, i32 1 917 %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv2, i32 2 918 store <3 x i32> %ins.2, ptr addrspace(1) %out 919 ret void 920} 921 922attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } 923