1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 2; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s 3 4; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90a %s 5 6define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0) #0 { 7; GFX940-LABEL: ptr1_i8: 8; GFX940: ; %bb.1: 9; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 10; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 11; GFX940-NEXT: s_waitcnt lgkmcnt(0) 12; GFX940-NEXT: s_branch .LBB0_0 13; GFX940-NEXT: .p2align 8 14; GFX940-NEXT: ; %bb.2: 15; GFX940-NEXT: .LBB0_0: 16; GFX940-NEXT: s_and_b32 s0, s4, 0xff 17; GFX940-NEXT: v_mov_b32_e32 v0, 0 18; GFX940-NEXT: v_mov_b32_e32 v1, s0 19; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 20; GFX940-NEXT: s_endpgm 21; 22; GFX90a-LABEL: ptr1_i8: 23; GFX90a: ; %bb.1: 24; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 25; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 26; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 27; GFX90a-NEXT: s_branch .LBB0_0 28; GFX90a-NEXT: .p2align 8 29; GFX90a-NEXT: ; %bb.2: 30; GFX90a-NEXT: .LBB0_0: 31; GFX90a-NEXT: s_and_b32 s0, s8, 0xff 32; GFX90a-NEXT: v_mov_b32_e32 v0, 0 33; GFX90a-NEXT: v_mov_b32_e32 v1, s0 34; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 35; GFX90a-NEXT: s_endpgm 36 %ext = zext i8 %arg0 to i32 37 store i32 %ext, ptr addrspace(1) %out 38 ret void 39} 40 41define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zeroext inreg %arg0) #0 { 42; GFX940-LABEL: ptr1_i8_zext_arg: 43; GFX940: ; %bb.1: 44; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 45; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 46; GFX940-NEXT: s_waitcnt lgkmcnt(0) 47; GFX940-NEXT: s_branch .LBB1_0 48; GFX940-NEXT: .p2align 8 49; GFX940-NEXT: ; %bb.2: 50; GFX940-NEXT: .LBB1_0: 51; GFX940-NEXT: s_and_b32 s0, s4, 0xff 52; GFX940-NEXT: v_mov_b32_e32 v0, 0 53; 
GFX940-NEXT: v_mov_b32_e32 v1, s0 54; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 55; GFX940-NEXT: s_endpgm 56; 57; GFX90a-LABEL: ptr1_i8_zext_arg: 58; GFX90a: ; %bb.1: 59; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 60; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 61; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 62; GFX90a-NEXT: s_branch .LBB1_0 63; GFX90a-NEXT: .p2align 8 64; GFX90a-NEXT: ; %bb.2: 65; GFX90a-NEXT: .LBB1_0: 66; GFX90a-NEXT: s_and_b32 s0, s8, 0xff 67; GFX90a-NEXT: v_mov_b32_e32 v0, 0 68; GFX90a-NEXT: v_mov_b32_e32 v1, s0 69; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 70; GFX90a-NEXT: s_endpgm 71 %ext = zext i8 %arg0 to i32 72 store i32 %ext, ptr addrspace(1) %out, align 4 73 ret void 74} 75 76define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %arg0) #0 { 77; GFX940-LABEL: ptr1_i16_preload_arg: 78; GFX940: ; %bb.1: 79; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 80; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 81; GFX940-NEXT: s_waitcnt lgkmcnt(0) 82; GFX940-NEXT: s_branch .LBB2_0 83; GFX940-NEXT: .p2align 8 84; GFX940-NEXT: ; %bb.2: 85; GFX940-NEXT: .LBB2_0: 86; GFX940-NEXT: s_and_b32 s0, s4, 0xffff 87; GFX940-NEXT: v_mov_b32_e32 v0, 0 88; GFX940-NEXT: v_mov_b32_e32 v1, s0 89; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 90; GFX940-NEXT: s_endpgm 91; 92; GFX90a-LABEL: ptr1_i16_preload_arg: 93; GFX90a: ; %bb.1: 94; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 95; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 96; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 97; GFX90a-NEXT: s_branch .LBB2_0 98; GFX90a-NEXT: .p2align 8 99; GFX90a-NEXT: ; %bb.2: 100; GFX90a-NEXT: .LBB2_0: 101; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff 102; GFX90a-NEXT: v_mov_b32_e32 v0, 0 103; GFX90a-NEXT: v_mov_b32_e32 v1, s0 104; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 105; GFX90a-NEXT: s_endpgm 106 %ext = zext i16 %arg0 to i32 107 store i32 %ext, ptr addrspace(1) %out, align 4 108 ret void 109} 110 111define amdgpu_kernel void 
@ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32 inreg %arg0) #0 { 112; GFX940-LABEL: ptr1_i32_preload_arg: 113; GFX940: ; %bb.1: 114; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 115; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 116; GFX940-NEXT: s_waitcnt lgkmcnt(0) 117; GFX940-NEXT: s_branch .LBB3_0 118; GFX940-NEXT: .p2align 8 119; GFX940-NEXT: ; %bb.2: 120; GFX940-NEXT: .LBB3_0: 121; GFX940-NEXT: v_mov_b32_e32 v0, 0 122; GFX940-NEXT: v_mov_b32_e32 v1, s4 123; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 124; GFX940-NEXT: s_endpgm 125; 126; GFX90a-LABEL: ptr1_i32_preload_arg: 127; GFX90a: ; %bb.1: 128; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 129; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 130; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 131; GFX90a-NEXT: s_branch .LBB3_0 132; GFX90a-NEXT: .p2align 8 133; GFX90a-NEXT: ; %bb.2: 134; GFX90a-NEXT: .LBB3_0: 135; GFX90a-NEXT: v_mov_b32_e32 v0, 0 136; GFX90a-NEXT: v_mov_b32_e32 v1, s8 137; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 138; GFX90a-NEXT: s_endpgm 139 store i32 %arg0, ptr addrspace(1) %out 140 ret void 141} 142 143 144define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspace(1) inreg %out, i32 inreg %arg1) #0 { 145; GFX940-LABEL: i32_ptr1_i32_preload_arg: 146; GFX940: ; %bb.1: 147; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 148; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 149; GFX940-NEXT: s_load_dword s6, s[0:1], 0x10 150; GFX940-NEXT: s_waitcnt lgkmcnt(0) 151; GFX940-NEXT: s_branch .LBB4_0 152; GFX940-NEXT: .p2align 8 153; GFX940-NEXT: ; %bb.2: 154; GFX940-NEXT: .LBB4_0: 155; GFX940-NEXT: s_add_i32 s0, s2, s6 156; GFX940-NEXT: v_mov_b32_e32 v0, 0 157; GFX940-NEXT: v_mov_b32_e32 v1, s0 158; GFX940-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 159; GFX940-NEXT: s_endpgm 160; 161; GFX90a-LABEL: i32_ptr1_i32_preload_arg: 162; GFX90a: ; %bb.1: 163; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 164; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 165; 
GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10 166; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 167; GFX90a-NEXT: s_branch .LBB4_0 168; GFX90a-NEXT: .p2align 8 169; GFX90a-NEXT: ; %bb.2: 170; GFX90a-NEXT: .LBB4_0: 171; GFX90a-NEXT: s_add_i32 s0, s6, s10 172; GFX90a-NEXT: v_mov_b32_e32 v0, 0 173; GFX90a-NEXT: v_mov_b32_e32 v1, s0 174; GFX90a-NEXT: global_store_dword v0, v1, s[8:9] 175; GFX90a-NEXT: s_endpgm 176 %add = add i32 %arg0, %arg1 177 store i32 %add, ptr addrspace(1) %out 178 ret void 179} 180 181define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %arg0, i16 inreg %arg1) #0 { 182; GFX940-LABEL: ptr1_i16_i16_preload_arg: 183; GFX940: ; %bb.1: 184; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 185; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 186; GFX940-NEXT: s_waitcnt lgkmcnt(0) 187; GFX940-NEXT: s_branch .LBB5_0 188; GFX940-NEXT: .p2align 8 189; GFX940-NEXT: ; %bb.2: 190; GFX940-NEXT: .LBB5_0: 191; GFX940-NEXT: s_lshr_b32 s0, s4, 16 192; GFX940-NEXT: s_and_b32 s1, s4, 0xffff 193; GFX940-NEXT: s_add_i32 s0, s1, s0 194; GFX940-NEXT: v_mov_b32_e32 v0, 0 195; GFX940-NEXT: v_mov_b32_e32 v1, s0 196; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 197; GFX940-NEXT: s_endpgm 198; 199; GFX90a-LABEL: ptr1_i16_i16_preload_arg: 200; GFX90a: ; %bb.1: 201; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 202; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 203; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 204; GFX90a-NEXT: s_branch .LBB5_0 205; GFX90a-NEXT: .p2align 8 206; GFX90a-NEXT: ; %bb.2: 207; GFX90a-NEXT: .LBB5_0: 208; GFX90a-NEXT: s_lshr_b32 s0, s8, 16 209; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff 210; GFX90a-NEXT: s_add_i32 s0, s1, s0 211; GFX90a-NEXT: v_mov_b32_e32 v0, 0 212; GFX90a-NEXT: v_mov_b32_e32 v1, s0 213; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 214; GFX90a-NEXT: s_endpgm 215 %ext = zext i16 %arg0 to i32 216 %ext1 = zext i16 %arg1 to i32 217 %add = add i32 %ext, %ext1 218 store i32 %add, ptr addrspace(1) %out, align 4 219 ret void 220} 
221 222define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2 x i8> inreg %in) #0 { 223; GFX940-LABEL: ptr1_v2i8_preload_arg: 224; GFX940: ; %bb.1: 225; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 226; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 227; GFX940-NEXT: s_waitcnt lgkmcnt(0) 228; GFX940-NEXT: s_branch .LBB6_0 229; GFX940-NEXT: .p2align 8 230; GFX940-NEXT: ; %bb.2: 231; GFX940-NEXT: .LBB6_0: 232; GFX940-NEXT: v_mov_b32_e32 v0, 0 233; GFX940-NEXT: v_mov_b32_e32 v1, s4 234; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 235; GFX940-NEXT: s_endpgm 236; 237; GFX90a-LABEL: ptr1_v2i8_preload_arg: 238; GFX90a: ; %bb.1: 239; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 240; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 241; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 242; GFX90a-NEXT: s_branch .LBB6_0 243; GFX90a-NEXT: .p2align 8 244; GFX90a-NEXT: ; %bb.2: 245; GFX90a-NEXT: .LBB6_0: 246; GFX90a-NEXT: v_mov_b32_e32 v0, 0 247; GFX90a-NEXT: v_mov_b32_e32 v1, s8 248; GFX90a-NEXT: global_store_short v0, v1, s[6:7] 249; GFX90a-NEXT: s_endpgm 250 store <2 x i8> %in, ptr addrspace(1) %out 251 ret void 252} 253 254 255define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) #0 { 256; GFX940-LABEL: byref_preload_arg: 257; GFX940: ; %bb.1: 258; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 259; GFX940-NEXT: s_waitcnt lgkmcnt(0) 260; GFX940-NEXT: s_branch .LBB7_0 261; GFX940-NEXT: .p2align 8 262; GFX940-NEXT: ; %bb.2: 263; GFX940-NEXT: .LBB7_0: 264; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x100 265; GFX940-NEXT: v_mov_b32_e32 v0, 0 266; GFX940-NEXT: s_waitcnt lgkmcnt(0) 267; GFX940-NEXT: v_mov_b32_e32 v1, s4 268; GFX940-NEXT: v_mov_b32_e32 v2, s5 269; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 270; GFX940-NEXT: s_waitcnt vmcnt(0) 271; GFX940-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 272; GFX940-NEXT: s_waitcnt vmcnt(0) 273; GFX940-NEXT: 
s_endpgm 274; 275; GFX90a-LABEL: byref_preload_arg: 276; GFX90a: ; %bb.1: 277; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 278; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 279; GFX90a-NEXT: s_branch .LBB7_0 280; GFX90a-NEXT: .p2align 8 281; GFX90a-NEXT: ; %bb.2: 282; GFX90a-NEXT: .LBB7_0: 283; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 284; GFX90a-NEXT: v_mov_b32_e32 v0, 0 285; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 286; GFX90a-NEXT: v_mov_b32_e32 v1, s0 287; GFX90a-NEXT: v_mov_b32_e32 v2, s1 288; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 289; GFX90a-NEXT: s_waitcnt vmcnt(0) 290; GFX90a-NEXT: global_store_dword v0, v2, s[6:7] 291; GFX90a-NEXT: s_waitcnt vmcnt(0) 292; GFX90a-NEXT: s_endpgm 293 %in = load i32, ptr addrspace(4) %in.byref 294 store volatile i32 %in, ptr addrspace(1) %out, align 4 295 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 296 ret void 297} 298 299; In the kernel below, the second inreg argument (%after.offset, which follows the non-inreg byref argument) is not expected to be preloaded with the current behavior; it is read together with the byref value by the s_load_dwordx2 at offset 0x100 after the prologue. 300 301define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 inreg %after.offset) #0 { 302; GFX940-LABEL: byref_staggered_preload_arg: 303; GFX940: ; %bb.1: 304; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 305; GFX940-NEXT: s_waitcnt lgkmcnt(0) 306; GFX940-NEXT: s_branch .LBB8_0 307; GFX940-NEXT: .p2align 8 308; GFX940-NEXT: ; %bb.2: 309; GFX940-NEXT: .LBB8_0: 310; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x100 311; GFX940-NEXT: v_mov_b32_e32 v0, 0 312; GFX940-NEXT: s_waitcnt lgkmcnt(0) 313; GFX940-NEXT: v_mov_b32_e32 v1, s4 314; GFX940-NEXT: v_mov_b32_e32 v2, s5 315; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 316; GFX940-NEXT: s_waitcnt vmcnt(0) 317; GFX940-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1 318; GFX940-NEXT: s_waitcnt vmcnt(0) 319; GFX940-NEXT: s_endpgm 320; 321; GFX90a-LABEL: byref_staggered_preload_arg: 322; GFX90a: ; %bb.1: 323; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 324; 
GFX90a-NEXT: s_waitcnt lgkmcnt(0) 325; GFX90a-NEXT: s_branch .LBB8_0 326; GFX90a-NEXT: .p2align 8 327; GFX90a-NEXT: ; %bb.2: 328; GFX90a-NEXT: .LBB8_0: 329; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 330; GFX90a-NEXT: v_mov_b32_e32 v0, 0 331; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 332; GFX90a-NEXT: v_mov_b32_e32 v1, s0 333; GFX90a-NEXT: v_mov_b32_e32 v2, s1 334; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 335; GFX90a-NEXT: s_waitcnt vmcnt(0) 336; GFX90a-NEXT: global_store_dword v0, v2, s[6:7] 337; GFX90a-NEXT: s_waitcnt vmcnt(0) 338; GFX90a-NEXT: s_endpgm 339 %in = load i32, ptr addrspace(4) %in.byref 340 store volatile i32 %in, ptr addrspace(1) %out, align 4 341 store volatile i32 %after.offset, ptr addrspace(1) %out, align 4 342 ret void 343} 344 345 346define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x i32> inreg %in) #0 { 347; GFX940-LABEL: v8i32_arg: 348; GFX940: ; %bb.1: 349; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 350; GFX940-NEXT: s_waitcnt lgkmcnt(0) 351; GFX940-NEXT: s_branch .LBB9_0 352; GFX940-NEXT: .p2align 8 353; GFX940-NEXT: ; %bb.2: 354; GFX940-NEXT: .LBB9_0: 355; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 356; GFX940-NEXT: v_mov_b32_e32 v4, 0 357; GFX940-NEXT: s_waitcnt lgkmcnt(0) 358; GFX940-NEXT: v_mov_b32_e32 v0, s8 359; GFX940-NEXT: v_mov_b32_e32 v1, s9 360; GFX940-NEXT: v_mov_b32_e32 v2, s10 361; GFX940-NEXT: v_mov_b32_e32 v3, s11 362; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 363; GFX940-NEXT: s_nop 1 364; GFX940-NEXT: v_mov_b32_e32 v0, s4 365; GFX940-NEXT: v_mov_b32_e32 v1, s5 366; GFX940-NEXT: v_mov_b32_e32 v2, s6 367; GFX940-NEXT: v_mov_b32_e32 v3, s7 368; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 369; GFX940-NEXT: s_endpgm 370; 371; GFX90a-LABEL: v8i32_arg: 372; GFX90a: ; %bb.1: 373; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 374; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 375; GFX90a-NEXT: s_branch .LBB9_0 376; GFX90a-NEXT: .p2align 8 377; 
GFX90a-NEXT: ; %bb.2: 378; GFX90a-NEXT: .LBB9_0: 379; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 380; GFX90a-NEXT: v_mov_b32_e32 v4, 0 381; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 382; GFX90a-NEXT: v_mov_b32_e32 v0, s12 383; GFX90a-NEXT: v_mov_b32_e32 v1, s13 384; GFX90a-NEXT: v_mov_b32_e32 v2, s14 385; GFX90a-NEXT: v_mov_b32_e32 v3, s15 386; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 387; GFX90a-NEXT: s_nop 0 388; GFX90a-NEXT: v_mov_b32_e32 v0, s8 389; GFX90a-NEXT: v_mov_b32_e32 v1, s9 390; GFX90a-NEXT: v_mov_b32_e32 v2, s10 391; GFX90a-NEXT: v_mov_b32_e32 v3, s11 392; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 393; GFX90a-NEXT: s_endpgm 394 store <8 x i32> %in, ptr addrspace(1) %out, align 4 395 ret void 396} 397 398define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x i16> inreg %in) #0 { 399; GFX940-LABEL: v3i16_preload_arg: 400; GFX940: ; %bb.1: 401; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 402; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 403; GFX940-NEXT: s_waitcnt lgkmcnt(0) 404; GFX940-NEXT: s_branch .LBB10_0 405; GFX940-NEXT: .p2align 8 406; GFX940-NEXT: ; %bb.2: 407; GFX940-NEXT: .LBB10_0: 408; GFX940-NEXT: v_mov_b32_e32 v0, 0 409; GFX940-NEXT: v_mov_b32_e32 v1, s5 410; GFX940-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1 411; GFX940-NEXT: v_mov_b32_e32 v1, s4 412; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1 413; GFX940-NEXT: s_endpgm 414; 415; GFX90a-LABEL: v3i16_preload_arg: 416; GFX90a: ; %bb.1: 417; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 418; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 419; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 420; GFX90a-NEXT: s_branch .LBB10_0 421; GFX90a-NEXT: .p2align 8 422; GFX90a-NEXT: ; %bb.2: 423; GFX90a-NEXT: .LBB10_0: 424; GFX90a-NEXT: v_mov_b32_e32 v0, 0 425; GFX90a-NEXT: v_mov_b32_e32 v1, s9 426; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4 427; GFX90a-NEXT: v_mov_b32_e32 v1, s8 428; GFX90a-NEXT: 
global_store_dword v0, v1, s[6:7] 429; GFX90a-NEXT: s_endpgm 430 store <3 x i16> %in, ptr addrspace(1) %out, align 4 431 ret void 432} 433 434define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x i32> inreg %in) #0 { 435; GFX940-LABEL: v3i32_preload_arg: 436; GFX940: ; %bb.1: 437; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 438; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 439; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18 440; GFX940-NEXT: s_waitcnt lgkmcnt(0) 441; GFX940-NEXT: s_branch .LBB11_0 442; GFX940-NEXT: .p2align 8 443; GFX940-NEXT: ; %bb.2: 444; GFX940-NEXT: .LBB11_0: 445; GFX940-NEXT: v_mov_b32_e32 v0, s6 446; GFX940-NEXT: v_mov_b32_e32 v1, s7 447; GFX940-NEXT: v_mov_b32_e32 v2, s8 448; GFX940-NEXT: v_mov_b32_e32 v3, 0 449; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 450; GFX940-NEXT: s_endpgm 451; 452; GFX90a-LABEL: v3i32_preload_arg: 453; GFX90a: ; %bb.1: 454; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 455; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 456; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 457; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 458; GFX90a-NEXT: s_branch .LBB11_0 459; GFX90a-NEXT: .p2align 8 460; GFX90a-NEXT: ; %bb.2: 461; GFX90a-NEXT: .LBB11_0: 462; GFX90a-NEXT: v_mov_b32_e32 v0, s10 463; GFX90a-NEXT: v_mov_b32_e32 v1, s11 464; GFX90a-NEXT: v_mov_b32_e32 v2, s12 465; GFX90a-NEXT: v_mov_b32_e32 v3, 0 466; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] 467; GFX90a-NEXT: s_endpgm 468 store <3 x i32> %in, ptr addrspace(1) %out, align 4 469 ret void 470} 471 472define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %out, <3 x float> inreg %in) #0 { 473; GFX940-LABEL: v3f32_preload_arg: 474; GFX940: ; %bb.1: 475; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 476; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8 477; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18 478; GFX940-NEXT: s_waitcnt lgkmcnt(0) 479; GFX940-NEXT: s_branch .LBB12_0 
480; GFX940-NEXT: .p2align 8 481; GFX940-NEXT: ; %bb.2: 482; GFX940-NEXT: .LBB12_0: 483; GFX940-NEXT: v_mov_b32_e32 v3, 0 484; GFX940-NEXT: v_mov_b32_e32 v0, s6 485; GFX940-NEXT: v_mov_b32_e32 v1, s7 486; GFX940-NEXT: v_mov_b32_e32 v2, s8 487; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1 488; GFX940-NEXT: s_endpgm 489; 490; GFX90a-LABEL: v3f32_preload_arg: 491; GFX90a: ; %bb.1: 492; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 493; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 494; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18 495; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 496; GFX90a-NEXT: s_branch .LBB12_0 497; GFX90a-NEXT: .p2align 8 498; GFX90a-NEXT: ; %bb.2: 499; GFX90a-NEXT: .LBB12_0: 500; GFX90a-NEXT: v_mov_b32_e32 v3, 0 501; GFX90a-NEXT: v_mov_b32_e32 v0, s10 502; GFX90a-NEXT: v_mov_b32_e32 v1, s11 503; GFX90a-NEXT: v_mov_b32_e32 v2, s12 504; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7] 505; GFX90a-NEXT: s_endpgm 506 store <3 x float> %in, ptr addrspace(1) %out, align 4 507 ret void 508} 509 510define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %out, <5 x i8> inreg %in) #0 { 511; GFX940-LABEL: v5i8_preload_arg: 512; GFX940: ; %bb.1: 513; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 514; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 515; GFX940-NEXT: s_waitcnt lgkmcnt(0) 516; GFX940-NEXT: s_branch .LBB13_0 517; GFX940-NEXT: .p2align 8 518; GFX940-NEXT: ; %bb.2: 519; GFX940-NEXT: .LBB13_0: 520; GFX940-NEXT: s_lshr_b32 s1, s4, 24 521; GFX940-NEXT: s_and_b32 s0, s4, 0xffff 522; GFX940-NEXT: s_lshl_b32 s1, s1, 8 523; GFX940-NEXT: s_bfe_u32 s4, s4, 0x80010 524; GFX940-NEXT: s_or_b32 s1, s4, s1 525; GFX940-NEXT: s_lshl_b32 s1, s1, 16 526; GFX940-NEXT: s_or_b32 s0, s0, s1 527; GFX940-NEXT: v_mov_b32_e32 v0, 0 528; GFX940-NEXT: v_mov_b32_e32 v1, s5 529; GFX940-NEXT: global_store_byte v0, v1, s[2:3] offset:4 sc0 sc1 530; GFX940-NEXT: v_mov_b32_e32 v1, s0 531; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 
sc1 532; GFX940-NEXT: s_endpgm 533; 534; GFX90a-LABEL: v5i8_preload_arg: 535; GFX90a: ; %bb.1: 536; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 537; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 538; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 539; GFX90a-NEXT: s_branch .LBB13_0 540; GFX90a-NEXT: .p2align 8 541; GFX90a-NEXT: ; %bb.2: 542; GFX90a-NEXT: .LBB13_0: 543; GFX90a-NEXT: s_lshr_b32 s1, s8, 24 544; GFX90a-NEXT: s_lshl_b32 s1, s1, 8 545; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010 546; GFX90a-NEXT: s_or_b32 s1, s2, s1 547; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff 548; GFX90a-NEXT: s_lshl_b32 s1, s1, 16 549; GFX90a-NEXT: s_or_b32 s0, s0, s1 550; GFX90a-NEXT: v_mov_b32_e32 v0, 0 551; GFX90a-NEXT: v_mov_b32_e32 v1, s9 552; GFX90a-NEXT: global_store_byte v0, v1, s[6:7] offset:4 553; GFX90a-NEXT: v_mov_b32_e32 v1, s0 554; GFX90a-NEXT: global_store_dword v0, v1, s[6:7] 555; GFX90a-NEXT: s_endpgm 556 store <5 x i8> %in, ptr addrspace(1) %out, align 4 557 ret void 558} 559 560define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x double> inreg %in) #0 { 561; GFX940-LABEL: v5f64_arg: 562; GFX940: ; %bb.1: 563; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 564; GFX940-NEXT: s_waitcnt lgkmcnt(0) 565; GFX940-NEXT: s_branch .LBB14_0 566; GFX940-NEXT: .p2align 8 567; GFX940-NEXT: ; %bb.2: 568; GFX940-NEXT: .LBB14_0: 569; GFX940-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 570; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 571; GFX940-NEXT: v_mov_b32_e32 v4, 0 572; GFX940-NEXT: s_waitcnt lgkmcnt(0) 573; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[12:13] 574; GFX940-NEXT: v_mov_b32_e32 v0, s8 575; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] offset:32 sc0 sc1 576; GFX940-NEXT: v_mov_b32_e32 v1, s9 577; GFX940-NEXT: v_mov_b32_e32 v2, s10 578; GFX940-NEXT: v_mov_b32_e32 v3, s11 579; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16 sc0 sc1 580; GFX940-NEXT: s_nop 1 581; GFX940-NEXT: v_mov_b32_e32 v0, s4 582; GFX940-NEXT: v_mov_b32_e32 v1, s5 
583; GFX940-NEXT: v_mov_b32_e32 v2, s6 584; GFX940-NEXT: v_mov_b32_e32 v3, s7 585; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1 586; GFX940-NEXT: s_endpgm 587; 588; GFX90a-LABEL: v5f64_arg: 589; GFX90a: ; %bb.1: 590; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 591; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 592; GFX90a-NEXT: s_branch .LBB14_0 593; GFX90a-NEXT: .p2align 8 594; GFX90a-NEXT: ; %bb.2: 595; GFX90a-NEXT: .LBB14_0: 596; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 597; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 598; GFX90a-NEXT: v_mov_b32_e32 v4, 0 599; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 600; GFX90a-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] 601; GFX90a-NEXT: v_mov_b32_e32 v0, s12 602; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32 603; GFX90a-NEXT: v_mov_b32_e32 v1, s13 604; GFX90a-NEXT: v_mov_b32_e32 v2, s14 605; GFX90a-NEXT: v_mov_b32_e32 v3, s15 606; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16 607; GFX90a-NEXT: s_nop 0 608; GFX90a-NEXT: v_mov_b32_e32 v0, s8 609; GFX90a-NEXT: v_mov_b32_e32 v1, s9 610; GFX90a-NEXT: v_mov_b32_e32 v2, s10 611; GFX90a-NEXT: v_mov_b32_e32 v3, s11 612; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 613; GFX90a-NEXT: s_endpgm 614 store <5 x double> %in, ptr addrspace(1) %out, align 8 615 ret void 616} 617 618define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8> inreg %in) #0 { 619; GFX940-LABEL: v8i8_preload_arg: 620; GFX940: ; %bb.1: 621; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 622; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 623; GFX940-NEXT: s_waitcnt lgkmcnt(0) 624; GFX940-NEXT: s_branch .LBB15_0 625; GFX940-NEXT: .p2align 8 626; GFX940-NEXT: ; %bb.2: 627; GFX940-NEXT: .LBB15_0: 628; GFX940-NEXT: s_lshr_b32 s1, s5, 24 629; GFX940-NEXT: s_and_b32 s0, s5, 0xffff 630; GFX940-NEXT: s_lshl_b32 s1, s1, 8 631; GFX940-NEXT: s_bfe_u32 s5, s5, 0x80010 632; GFX940-NEXT: s_or_b32 s1, s5, s1 633; GFX940-NEXT: s_lshl_b32 
s1, s1, 16 634; GFX940-NEXT: s_lshr_b32 s5, s4, 24 635; GFX940-NEXT: s_or_b32 s0, s0, s1 636; GFX940-NEXT: s_and_b32 s1, s4, 0xffff 637; GFX940-NEXT: s_lshl_b32 s5, s5, 8 638; GFX940-NEXT: s_bfe_u32 s4, s4, 0x80010 639; GFX940-NEXT: s_or_b32 s4, s4, s5 640; GFX940-NEXT: s_lshl_b32 s4, s4, 16 641; GFX940-NEXT: s_or_b32 s1, s1, s4 642; GFX940-NEXT: v_mov_b32_e32 v0, s1 643; GFX940-NEXT: v_mov_b32_e32 v1, s0 644; GFX940-NEXT: v_mov_b32_e32 v2, 0 645; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 646; GFX940-NEXT: s_endpgm 647; 648; GFX90a-LABEL: v8i8_preload_arg: 649; GFX90a: ; %bb.1: 650; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 651; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 652; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 653; GFX90a-NEXT: s_branch .LBB15_0 654; GFX90a-NEXT: .p2align 8 655; GFX90a-NEXT: ; %bb.2: 656; GFX90a-NEXT: .LBB15_0: 657; GFX90a-NEXT: s_lshr_b32 s1, s9, 24 658; GFX90a-NEXT: s_lshl_b32 s1, s1, 8 659; GFX90a-NEXT: s_bfe_u32 s2, s9, 0x80010 660; GFX90a-NEXT: s_or_b32 s1, s2, s1 661; GFX90a-NEXT: s_lshr_b32 s2, s8, 24 662; GFX90a-NEXT: s_lshl_b32 s2, s2, 8 663; GFX90a-NEXT: s_bfe_u32 s3, s8, 0x80010 664; GFX90a-NEXT: s_and_b32 s0, s9, 0xffff 665; GFX90a-NEXT: s_lshl_b32 s1, s1, 16 666; GFX90a-NEXT: s_or_b32 s2, s3, s2 667; GFX90a-NEXT: s_or_b32 s0, s0, s1 668; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff 669; GFX90a-NEXT: s_lshl_b32 s2, s2, 16 670; GFX90a-NEXT: s_or_b32 s1, s1, s2 671; GFX90a-NEXT: v_mov_b32_e32 v0, s1 672; GFX90a-NEXT: v_mov_b32_e32 v1, s0 673; GFX90a-NEXT: v_mov_b32_e32 v2, 0 674; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] 675; GFX90a-NEXT: s_endpgm 676 store <8 x i8> %in, ptr addrspace(1) %out 677 ret void 678} 679 680define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i64 inreg %a) #0 { 681; GFX940-LABEL: i64_kernel_preload_arg: 682; GFX940: ; %bb.1: 683; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 684; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 685; GFX940-NEXT: 
s_waitcnt lgkmcnt(0) 686; GFX940-NEXT: s_branch .LBB16_0 687; GFX940-NEXT: .p2align 8 688; GFX940-NEXT: ; %bb.2: 689; GFX940-NEXT: .LBB16_0: 690; GFX940-NEXT: v_mov_b32_e32 v2, 0 691; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] 692; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 693; GFX940-NEXT: s_endpgm 694; 695; GFX90a-LABEL: i64_kernel_preload_arg: 696; GFX90a: ; %bb.1: 697; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 698; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 699; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 700; GFX90a-NEXT: s_branch .LBB16_0 701; GFX90a-NEXT: .p2align 8 702; GFX90a-NEXT: ; %bb.2: 703; GFX90a-NEXT: .LBB16_0: 704; GFX90a-NEXT: v_mov_b32_e32 v2, 0 705; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] 706; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] 707; GFX90a-NEXT: s_endpgm 708 store i64 %a, ptr addrspace(1) %out, align 8 709 ret void 710} 711 712define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, double inreg %in) #0 { 713; GFX940-LABEL: f64_kernel_preload_arg: 714; GFX940: ; %bb.1: 715; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 716; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 717; GFX940-NEXT: s_waitcnt lgkmcnt(0) 718; GFX940-NEXT: s_branch .LBB17_0 719; GFX940-NEXT: .p2align 8 720; GFX940-NEXT: ; %bb.2: 721; GFX940-NEXT: .LBB17_0: 722; GFX940-NEXT: v_mov_b32_e32 v2, 0 723; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] 724; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1 725; GFX940-NEXT: s_endpgm 726; 727; GFX90a-LABEL: f64_kernel_preload_arg: 728; GFX90a: ; %bb.1: 729; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 730; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8 731; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 732; GFX90a-NEXT: s_branch .LBB17_0 733; GFX90a-NEXT: .p2align 8 734; GFX90a-NEXT: ; %bb.2: 735; GFX90a-NEXT: .LBB17_0: 736; GFX90a-NEXT: v_mov_b32_e32 v2, 0 737; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] 738; GFX90a-NEXT: 
global_store_dwordx2 v2, v[0:1], s[6:7] 739; GFX90a-NEXT: s_endpgm 740 store double %in, ptr addrspace(1) %out 741 ret void 742} 743 744define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out, half inreg %in) #0 { 745; GFX940-LABEL: half_kernel_preload_arg: 746; GFX940: ; %bb.1: 747; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 748; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 749; GFX940-NEXT: s_waitcnt lgkmcnt(0) 750; GFX940-NEXT: s_branch .LBB18_0 751; GFX940-NEXT: .p2align 8 752; GFX940-NEXT: ; %bb.2: 753; GFX940-NEXT: .LBB18_0: 754; GFX940-NEXT: v_mov_b32_e32 v0, 0 755; GFX940-NEXT: v_mov_b32_e32 v1, s4 756; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 757; GFX940-NEXT: s_endpgm 758; 759; GFX90a-LABEL: half_kernel_preload_arg: 760; GFX90a: ; %bb.1: 761; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 762; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8 763; GFX90a-NEXT: s_waitcnt lgkmcnt(0) 764; GFX90a-NEXT: s_branch .LBB18_0 765; GFX90a-NEXT: .p2align 8 766; GFX90a-NEXT: ; %bb.2: 767; GFX90a-NEXT: .LBB18_0: 768; GFX90a-NEXT: v_mov_b32_e32 v0, 0 769; GFX90a-NEXT: v_mov_b32_e32 v1, s8 770; GFX90a-NEXT: global_store_short v0, v1, s[6:7] 771; GFX90a-NEXT: s_endpgm 772 store half %in, ptr addrspace(1) %out 773 ret void 774} 775 776define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, bfloat inreg %in) #0 { 777; GFX940-LABEL: bfloat_kernel_preload_arg: 778; GFX940: ; %bb.1: 779; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 780; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8 781; GFX940-NEXT: s_waitcnt lgkmcnt(0) 782; GFX940-NEXT: s_branch .LBB19_0 783; GFX940-NEXT: .p2align 8 784; GFX940-NEXT: ; %bb.2: 785; GFX940-NEXT: .LBB19_0: 786; GFX940-NEXT: v_mov_b32_e32 v0, 0 787; GFX940-NEXT: v_mov_b32_e32 v1, s4 788; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1 789; GFX940-NEXT: s_endpgm 790; 791; GFX90a-LABEL: bfloat_kernel_preload_arg: 792; GFX90a: ; %bb.1: 793; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 
0x0
; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB19_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB19_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
  store bfloat %in, ptr addrspace(1) %out
  ret void
}

; Stores an inreg <2 x bfloat> kernel argument to %out unchanged.
; Both sets of expected output end in a single global_store_dword.
define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <2 x bfloat> inreg %in) #0 {
; GFX940-LABEL: v2bfloat_kernel_preload_arg:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB20_0
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB20_0:
; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: v_mov_b32_e32 v1, s4
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX90a-LABEL: v2bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB20_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB20_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
  store <2 x bfloat> %in, ptr addrspace(1) %out
  ret void
}

; Stores an inreg <3 x bfloat> kernel argument to %out.
; The expected output writes it as a short store at offset 4 followed by a
; dword store at offset 0.
define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <3 x bfloat> inreg %in) #0 {
; GFX940-LABEL: v3bfloat_kernel_preload_arg:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB21_0
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB21_0:
; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: v_mov_b32_e32 v1, s5
; GFX940-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v1, s4
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX90a-LABEL: v3bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB21_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB21_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s9
; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
; GFX90a-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
  store <3 x bfloat> %in, ptr addrspace(1) %out
  ret void
}

; Stores an inreg <6 x bfloat> kernel argument to %out.
; The expected output is a single global_store_dwordx3.
define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, <6 x bfloat> inreg %in) #0 {
; GFX940-LABEL: v6bfloat_kernel_preload_arg:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB22_0
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB22_0:
; GFX940-NEXT: v_mov_b32_e32 v0, s6
; GFX940-NEXT: v_mov_b32_e32 v1, s7
; GFX940-NEXT: v_mov_b32_e32 v2, s8
; GFX940-NEXT: v_mov_b32_e32 v3, 0
; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX90a-LABEL: v6bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB22_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB22_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
  store <6 x bfloat> %in, ptr addrspace(1) %out
  ret void
}

; Four inreg arguments: half %in is stored to %out and <7 x bfloat> %in2 is
; stored to %out2 (a short store at offset 12 plus a dwordx3 store).
; In the gfx90a expected output an extra s_load_dwordx2 from offset 0x20
; appears after the .LBB23_0 label, and its result s[0:1] is the base address
; of both %in2 stores; the gfx940 expected output has no load after the label.
define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out, half inreg %in, <7 x bfloat> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
; GFX940-LABEL: half_v7bfloat_kernel_preload_arg:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB23_0
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB23_0:
; GFX940-NEXT: v_mov_b32_e32 v3, 0
; GFX940-NEXT: v_mov_b32_e32 v0, s4
; GFX940-NEXT: global_store_short v3, v0, s[2:3] sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v0, s9
; GFX940-NEXT: global_store_short v3, v0, s[10:11] offset:12 sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v2, s8
; GFX940-NEXT: v_mov_b32_e32 v0, s6
; GFX940-NEXT: v_mov_b32_e32 v1, s7
; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX90a-LABEL: half_v7bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB23_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB23_0:
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s8
; GFX90a-NEXT: global_store_short v3, v0, s[6:7]
; GFX90a-NEXT: v_mov_b32_e32 v0, s13
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: global_store_short v3, v0, s[0:1] offset:12
; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX90a-NEXT: s_endpgm
  store half %in, ptr addrspace(1) %out
  store <7 x bfloat> %in2, ptr addrspace(1) %out2
  ret void
}

; Stores an inreg i1 kernel argument to %out.
; The expected output masks the incoming register with 1 (s_and_b32) and
; writes the result with a byte store.
define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1 inreg %in) #0 {
; GFX940-LABEL: i1_kernel_preload_arg:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB24_0
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB24_0:
; GFX940-NEXT: s_and_b32 s0, s4, 1
; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NEXT: global_store_byte v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX90a-LABEL: i1_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB24_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB24_0:
; GFX90a-NEXT: s_and_b32 s0, s8, 1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_byte v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
  store i1 %in, ptr addrspace(1) %out
  ret void
}

; Stores an inreg fp128 kernel argument to %out.
; The expected output is a single global_store_dwordx4.
define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out, fp128 inreg %in) #0 {
; GFX940-LABEL: fp128_kernel_preload_arg:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB25_0
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB25_0:
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: v_mov_b32_e32 v0, s6
; GFX940-NEXT: v_mov_b32_e32 v1, s7
; GFX940-NEXT: v_mov_b32_e32 v2, s8
; GFX940-NEXT: v_mov_b32_e32 v3, s9
; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX90a-LABEL: fp128_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB25_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB25_0:
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: v_mov_b32_e32 v3, s13
; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX90a-NEXT: s_endpgm
  store fp128 %in, ptr addrspace(1) %out
  ret void
}

; Stores an inreg <7 x i8> kernel argument to %out.
; The expected output contains a scalar lshr/lshl/bfe/and/or sequence on the
; first argument register, then a byte store (d16_hi, offset 6), a short store
; (offset 4) and a dword store (offset 0).
define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out, <7 x i8> inreg %in) #0 {
; GFX940-LABEL: v7i8_kernel_preload_arg:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB26_0
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB26_0:
; GFX940-NEXT: s_lshr_b32 s1, s4, 24
; GFX940-NEXT: s_and_b32 s0, s4, 0xffff
; GFX940-NEXT: s_lshl_b32 s1, s1, 8
; GFX940-NEXT: s_bfe_u32 s4, s4, 0x80010
; GFX940-NEXT: s_or_b32 s1, s4, s1
; GFX940-NEXT: s_lshl_b32 s1, s1, 16
; GFX940-NEXT: s_or_b32 s0, s0, s1
; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: v_mov_b32_e32 v1, s5
; GFX940-NEXT: global_store_byte_d16_hi v0, v1, s[2:3] offset:6 sc0 sc1
; GFX940-NEXT: global_store_short v0, v1, s[2:3] offset:4 sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX90a-LABEL: v7i8_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB26_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB26_0:
; GFX90a-NEXT: s_lshr_b32 s1, s8, 24
; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010
; GFX90a-NEXT: s_or_b32 s1, s2, s1
; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-NEXT: s_or_b32 s0, s0, s1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s9
; GFX90a-NEXT: global_store_byte_d16_hi v0, v1, s[6:7] offset:6
; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
  store <7 x i8> %in, ptr addrspace(1) %out
  ret void
}

; Stores an inreg <7 x half> kernel argument to %out.
; The expected output is a short store at offset 12 followed by a dwordx3
; store at offset 0.
define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out, <7 x half> inreg %in) #0 {
; GFX940-LABEL: v7half_kernel_preload_arg:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x18
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB27_0
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB27_0:
; GFX940-NEXT: v_mov_b32_e32 v3, 0
; GFX940-NEXT: v_mov_b32_e32 v0, s9
; GFX940-NEXT: global_store_short v3, v0, s[2:3] offset:12 sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v2, s8
; GFX940-NEXT: v_mov_b32_e32 v0, s6
; GFX940-NEXT: v_mov_b32_e32 v1, s7
; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX90a-LABEL: v7half_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB27_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB27_0:
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s13
; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12
; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
  store <7 x half> %in, ptr addrspace(1) %out
  ret void
}

; Four inreg arguments: i16 %in is stored to %out and i32 %in2 to %out2.
; The expected output is a short store followed by a dword store, each taking
; its data from a different SGPR.
define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, i32 inreg %in2, ptr addrspace(1) inreg %out2) #0 {
; GFX940-LABEL: i16_i32_kernel_preload_arg:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB28_0
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB28_0:
; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: v_mov_b32_e32 v1, s4
; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v1, s5
; GFX940-NEXT: global_store_dword v0, v1, s[6:7] sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX90a-LABEL: i16_i32_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB28_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB28_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
; GFX90a-NEXT: v_mov_b32_e32 v1, s9
; GFX90a-NEXT: global_store_dword v0, v1, s[10:11]
; GFX90a-NEXT: s_endpgm
  store i16 %in, ptr addrspace(1) %out
  store i32 %in2, ptr addrspace(1) %out2
  ret void
}

; Four inreg arguments: i16 %in is stored to %out and <3 x i32> %in2 to %out2.
; In the gfx90a expected output an extra s_load_dwordx2 from offset 0x20
; appears after the .LBB29_0 label and supplies the base address of the
; dwordx3 store; the gfx940 expected output has no load after the label.
define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, <3 x i32> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
; GFX940-LABEL: i16_v3i32_kernel_preload_arg:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB29_0
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB29_0:
; GFX940-NEXT: v_mov_b32_e32 v3, 0
; GFX940-NEXT: v_mov_b32_e32 v4, s4
; GFX940-NEXT: v_mov_b32_e32 v0, s6
; GFX940-NEXT: v_mov_b32_e32 v1, s7
; GFX940-NEXT: v_mov_b32_e32 v2, s8
; GFX940-NEXT: global_store_short v3, v4, s[2:3] sc0 sc1
; GFX940-NEXT: global_store_dwordx3 v3, v[0:2], s[10:11] sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX90a-LABEL: i16_v3i32_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB29_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB29_0:
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v4, s8
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-NEXT: v_mov_b32_e32 v1, s11
; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: global_store_short v3, v4, s[6:7]
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GFX90a-NEXT: s_endpgm
  store i16 %in, ptr addrspace(1) %out
  store <3 x i32> %in2, ptr addrspace(1) %out2
  ret void
}

; Four inreg arguments: i16 %in is stored to %out and i16 %in2 to %out2.
; The expected output copies one SGPR into v1 and uses it for both stores:
; global_store_short for %out and global_store_short_d16_hi for %out2.
define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, i16 inreg %in2, ptr addrspace(1) inreg %out2) #0 {
; GFX940-LABEL: i16_i16_kernel_preload_arg:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB30_0
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB30_0:
; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: v_mov_b32_e32 v1, s4
; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: global_store_short_d16_hi v0, v1, s[6:7] sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX90a-LABEL: i16_i16_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB30_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB30_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[10:11]
; GFX90a-NEXT: s_endpgm
  store i16 %in, ptr addrspace(1) %out
  store i16 %in2, ptr addrspace(1) %out2
  ret void
}

; Four inreg arguments: i16 %in is stored to %out and <2 x i8> %in2 to %out2.
; The expected output stores one SGPR directly with a short store, and feeds a
; second short store from a scalar lshr/lshl/bfe/or sequence on that same SGPR.
define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %out, i16 inreg %in, <2 x i8> inreg %in2, ptr addrspace(1) inreg %out2) #0 {
; GFX940-LABEL: i16_v2i8_kernel_preload_arg:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB31_0
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB31_0:
; GFX940-NEXT: s_lshr_b32 s0, s4, 24
; GFX940-NEXT: s_lshl_b32 s0, s0, 8
; GFX940-NEXT: s_bfe_u32 s1, s4, 0x80010
; GFX940-NEXT: s_or_b32 s0, s1, s0
; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: v_mov_b32_e32 v1, s4
; GFX940-NEXT: global_store_short v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NEXT: global_store_short v0, v1, s[6:7] sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX90a-LABEL: i16_v2i8_kernel_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB31_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB31_0:
; GFX90a-NEXT: s_lshr_b32 s0, s8, 24
; GFX90a-NEXT: s_lshl_b32 s0, s0, 8
; GFX90a-NEXT: s_bfe_u32 s1, s8, 0x80010
; GFX90a-NEXT: s_or_b32 s0, s1, s0
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s8
; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_short v0, v1, s[10:11]
; GFX90a-NEXT: s_endpgm
  store i16 %in, ptr addrspace(1) %out
  store <2 x i8> %in2, ptr addrspace(1) %out2
  ret void
}

; The second argument is not expected to be preloaded with the current behavior.

; %arg0 and %arg1 are marked inreg while %out, declared between them, is not.
; The kernel stores %arg0 + %arg1 to %out. In the expected output only the
; dword at kernarg offset 0x0 is loaded before the s_branch; the loads from
; offsets 0x10 and 0x8 are issued after the .LBB32_0 label.
define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, ptr addrspace(1) %out, i32 inreg %arg1) #0 {
; GFX940-LABEL: i32_ptr1_i32_staggered_preload_arg:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB32_0
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB32_0:
; GFX940-NEXT: s_load_dword s3, s[0:1], 0x10
; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_add_i32 s0, s2, s3
; GFX940-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX90a-LABEL: i32_ptr1_i32_staggered_preload_arg:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB32_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB32_0:
; GFX90a-NEXT: s_load_dword s2, s[4:5], 0x10
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_add_i32 s2, s6, s2
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NEXT: s_endpgm
  %add = add i32 %arg0, %arg1
  store i32 %add, ptr addrspace(1) %out
  ret void
}

; Zero-extends the inreg i8 %arg0 to i32 and stores it to %out. The trailing
; inreg i32 %unused is never referenced in the body. The expected output masks
; the argument register with 0xff and performs a single dword store.
define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out, i8 inreg %arg0, i32 inreg %unused) #0 {
; GFX940-LABEL: ptr1_i8_trailing_unused:
; GFX940: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_branch .LBB33_0
; GFX940-NEXT: .p2align 8
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: .LBB33_0:
; GFX940-NEXT: s_and_b32 s0, s4, 0xff
; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
;
; GFX90a-LABEL: ptr1_i8_trailing_unused:
; GFX90a: ; %bb.1:
; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB33_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB33_0:
; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
; GFX90a-NEXT: s_endpgm
  %ext = zext i8 %arg0 to i32
  store i32 %ext, ptr addrspace(1) %out
  ret void
}

; Attribute group referenced as #0 by the kernel definitions in this test.
attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }