; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn--amdhsa -passes=amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=OPT %s
; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=hybrid | FileCheck -check-prefix=GCN %s

; Opt checks from utils/update_test_checks.py, llc checks from utils/update_llc_test_checks.py

; Define four variables that are accessed (plus @unused, which nothing in this
; file references) and four non-kernel functions which access exactly one
; variable each
@v0 = addrspace(3) global float poison
@v1 = addrspace(3) global i16 poison, align 16
@v2 = addrspace(3) global i64 poison
@v3 = addrspace(3) global i8 poison
@unused = addrspace(3) global i16 poison

; The lookup table is expected to hold one row per kernel that needs it (k123
; and k23) and one column (a single i32 address) per row.
; OPT{LITERAL}: @llvm.amdgcn.lds.offset.table = internal addrspace(4) constant [2 x [1 x i32]] [[1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds to i32)], [1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds to i32)]]

; Doubles @v0 in place. The opt checks expect a direct access to the k01
; kernel struct.
define void @f0() {
; OPT-LABEL: @f0(
; OPT-NEXT: [[LD:%.*]] = load float, ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4
; OPT-NEXT: [[MUL:%.*]] = fmul float [[LD]], 2.000000e+00
; OPT-NEXT: store float [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds, align 4
; OPT-NEXT: ret void
;
; GCN-LABEL: f0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_read_b32 v1, v0 offset:4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, v1, v1
; GCN-NEXT: ds_write_b32 v0, v1 offset:4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
  %ld = load float, ptr addrspace(3) @v0
  %mul = fmul float %ld, 2.
  store float %mul, ptr addrspace(3) @v0
  ret void
}

; Multiplies @v1 by 3 in place. The opt checks expect a direct access to the
; module-scope struct.
define void @f1() {
; OPT-LABEL: @f1(
; OPT-NEXT: [[LD:%.*]] = load i16, ptr addrspace(3) @llvm.amdgcn.module.lds, align 16
; OPT-NEXT: [[MUL:%.*]] = mul i16 [[LD]], 3
; OPT-NEXT: store i16 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 16
; OPT-NEXT: ret void
;
; GCN-LABEL: f1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_read_u16 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mul_lo_u32 v1, v1, 3
; GCN-NEXT: ds_write_b16 v0, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
  %ld = load i16, ptr addrspace(3) @v1
  %mul = mul i16 %ld, 3
  store i16 %mul, ptr addrspace(3) @v1
  ret void
}

; Multiplies @v2 by 4 in place. The opt checks expect the address of @v2 to be
; loaded from the lookup table, indexed by the result of
; @llvm.amdgcn.lds.kernel.id(), once for the load and once for the store.
define void @f2() {
; OPT-LABEL: @f2(
; OPT-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
; OPT-NEXT: [[V22:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[V22]], align 4
; OPT-NEXT: [[V23:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) [[V23]], align 8
; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 4
; OPT-NEXT: [[V2:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
; OPT-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[V2]], align 4
; OPT-NEXT: [[V21:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) [[V21]], align 8
; OPT-NEXT: ret void
;
; GCN-LABEL: f2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s15
; GCN-NEXT: s_ashr_i32 s5, s15, 31
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN-NEXT: s_add_u32 s4, s4, s6
; GCN-NEXT: s_addc_u32 s5, s5, s7
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_read_b64 v[0:1], v2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; GCN-NEXT: ds_write_b64 v2, v[0:1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
  %ld = load i64, ptr addrspace(3) @v2
  %mul = mul i64 %ld, 4
  store i64 %mul, ptr addrspace(3) @v2
  ret void
}

; Multiplies @v3 by 5 in place. The opt checks expect a direct access to
; field 1 of the k23 kernel struct.
define void @f3() {
; OPT-LABEL: @f3(
; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K23_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8
; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 5
; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K23_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds, i32 0, i32 1), align 8
; OPT-NEXT: ret void
;
; GCN-LABEL: f3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_read_u8 v1, v0 offset:8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mul_lo_u32 v1, v1, 5
; GCN-NEXT: ds_write_b8 v0, v1 offset:8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
  %ld = load i8, ptr addrspace(3) @v3
  %mul = mul i8 %ld, 5
  store i8 %mul, ptr addrspace(3) @v3
  ret void
}

; Accesses @v2 directly and calls no functions, so this kernel won't be in the
; lookup table
define amdgpu_kernel void @kernel_no_table() {
; OPT-LABEL: @kernel_no_table(
; OPT-NEXT: [[LD:%.*]] = load i64, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8
; OPT-NEXT: [[MUL:%.*]] = mul i64 [[LD]], 8
; OPT-NEXT: store i64 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.kernel_no_table.lds, align 8
; OPT-NEXT: ret void
;
; GCN-LABEL: kernel_no_table:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_read_b64 v[0:1], v2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
; GCN-NEXT: ds_write_b64 v2, v[0:1]
; GCN-NEXT: s_endpgm
  %ld = load i64, ptr addrspace(3) @v2
  %mul = mul i64 %ld, 8
  store i64 %mul, ptr addrspace(3) @v2
  ret void
}

; Accesses two variables (@v0 via @f0 and @v1 via @f1), will allocate those two
define amdgpu_kernel void @k01() {
; OPT-LABEL: @k01(
; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k01.lds) ]
; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
; OPT-NEXT: call void @f0()
; OPT-NEXT: call void @f1()
; OPT-NEXT: ret void
;
; GCN-LABEL: k01:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: s_getpc_b64 s[14:15]
; GCN-NEXT: s_add_u32 s14, s14, f0@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s15, s15, f0@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: v_or_b32_e32 v31, v0, v2
; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: s_getpc_b64 s[14:15]
; GCN-NEXT: s_add_u32 s14, s14, f1@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s15, s15, f1@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: s_endpgm

  call void @f0()
  call void @f1()
  ret void
}

; Accesses two variables (@v2 via @f2 and @v3 via @f3). The llc checks expect
; s15 to be set to 1 before the call to @f2.
define amdgpu_kernel void @k23() {
; OPT-LABEL: @k23(
; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k23.lds) ], !alias.scope [[META5:![0-9]+]], !noalias [[META8:![0-9]+]]
; OPT-NEXT: call void @f2()
; OPT-NEXT: call void @f3()
; OPT-NEXT: ret void
;
; GCN-LABEL: k23:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_mov_b32 s20, s16
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: s_mov_b64 s[16:17], s[6:7]
; GCN-NEXT: s_mov_b64 s[18:19], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f2@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: v_or_b32_e32 v31, v0, v2
; GCN-NEXT: s_mov_b32 s15, 1
; GCN-NEXT: s_mov_b64 s[4:5], s[18:19]
; GCN-NEXT: s_mov_b32 s14, s20
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[22:23]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x0
; GCN-NEXT: s_mov_b64 s[4:5], s[18:19]
; GCN-NEXT: s_mov_b64 s[6:7], s[16:17]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[22:23]
; GCN-NEXT: s_endpgm


  call void @f2()
  call void @f3()
  ret void
}

; Access and allocate three variables (@v1 via @f1, @v3 directly, @v2 via @f2).
; The llc checks expect s15 to be set to 0 before each call.
define amdgpu_kernel void @k123() {
; OPT-LABEL: @k123(
; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds) ], !alias.scope [[META11:![0-9]+]], !noalias [[META14:![0-9]+]]
; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
; OPT-NEXT: call void @f1()
; OPT-NEXT: [[LD:%.*]] = load i8, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META14]], !noalias [[META11]]
; OPT-NEXT: [[MUL:%.*]] = mul i8 [[LD]], 8
; OPT-NEXT: store i8 [[MUL]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K123_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.k123.lds, i32 0, i32 1), align 8, !alias.scope [[META14]], !noalias [[META11]]
; OPT-NEXT: call void @f2()
; OPT-NEXT: ret void
;
; GCN-LABEL: k123:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_mov_b32 s13, s15
; GCN-NEXT: s_mov_b32 s12, s14
; GCN-NEXT: s_getpc_b64 s[14:15]
; GCN-NEXT: s_add_u32 s14, s14, f1@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s15, s15, f1@gotpcrel32@hi+12
; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: v_or_b32_e32 v31, v0, v2
; GCN-NEXT: s_mov_b32 s15, 0
; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_read_u8 v1, v0 offset:16
; GCN-NEXT: s_getpc_b64 s[14:15]
; GCN-NEXT: s_add_u32 s14, s14, f2@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s15, s15, f2@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1
; GCN-NEXT: ds_write_b8 v0, v1 offset:16
; GCN-NEXT: s_mov_b32 s15, 0
; GCN-NEXT: s_mov_b32 s14, s16
; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GCN-NEXT: s_endpgm
  call void @f1()
  %ld = load i8, ptr addrspace(3) @v3
  %mul = mul i8 %ld, 8
  store i8 %mul, ptr addrspace(3) @v3
  call void @f2()
  ret void
}

!0 = !{i32 0}
!1 = !{i32 2}
!2 = !{i32 1}

; OPT: attributes #0 = { "amdgpu-lds-size"="8" }
; OPT: attributes #1 = { "amdgpu-lds-size"="16" }
; OPT: attributes #2 = { "amdgpu-lds-size"="24" }
; OPT: attributes #3 = { nocallback nofree nosync nounwind willreturn memory(none) }
; OPT: attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

; OPT: !0 = !{i32 0, i32 1}
; OPT: !1 = !{i32 4, i32 5}
; OPT: !2 = !{i32 8, i32 9}
; OPT: !3 = !{i32 1, !"amdhsa_code_object_version", i32 500}
; OPT: !4 = !{i32 1}
; OPT: !5 = !{!6}
; OPT: !6 = distinct !{!6, !7}
; OPT: !7 = distinct !{!7}
; OPT: !8 = !{!9}
; OPT: !9 = distinct !{!9, !7}
; OPT: !10 = !{i32 0}
; OPT: !11 = !{!12}
; OPT: !12 = distinct !{!12, !13}
; OPT: !13 = distinct !{!13}
; OPT: !14 = !{!15}
; OPT: !15 = distinct !{!15, !13}

attributes #0 = { "amdgpu-lds-size"="8" }
attributes #1 = { "amdgpu-lds-size"="16" }
attributes #2 = { "amdgpu-lds-size"="24" }
attributes #3 = { nocallback nofree nosync nounwind willreturn memory(none) }
attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

; Table size is number-kernels * number-variables * sizeof(uint32_t); here
; 2 * 1 * 4 = 8 bytes, emitted as two .long entries
; GCN: .type llvm.amdgcn.lds.offset.table,@object
; GCN-NEXT: .section .data.rel.ro,"aw"
; GCN-NEXT: .p2align 2, 0x0
; GCN-NEXT: llvm.amdgcn.lds.offset.table:
; GCN-NEXT: .long 8
; GCN-NEXT: .long 0
; GCN-NEXT: .size llvm.amdgcn.lds.offset.table, 8

!llvm.module.flags = !{!3}
!3 = !{i32 1, !"amdhsa_code_object_version", i32 500}