1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 2; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -S < %s | FileCheck -check-prefix=NO-PRELOAD %s 3; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s | FileCheck -check-prefix=PRELOAD %s 4 5define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) %out) { 6; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x( 7; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { 8; NO-PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() 9; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 10; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]] 11; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 12; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 13; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 14; NO-PRELOAD-NEXT: ret void 15; 16; PRELOAD-LABEL: define amdgpu_kernel void @preload_block_count_x( 17; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]]) #[[ATTR0:[0-9]+]] { 18; PRELOAD-NEXT: [[PRELOAD_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() 19; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 20; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 21; PRELOAD-NEXT: store i32 [[_HIDDEN_BLOCK_COUNT_X]], ptr addrspace(1) [[OUT]], align 4 22; PRELOAD-NEXT: ret void 23; 24 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 25 %load = load i32, ptr addrspace(4) %imp_arg_ptr 26 store i32 %load, ptr addrspace(1) %out 27 ret void 28} 29 30define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) %out, i512) { 31; NO-PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x( 32; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]], i512 [[TMP0:%.*]]) #[[ATTR0]] { 33; NO-PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() 34; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 35; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] 36; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 37; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 38; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 39; NO-PRELOAD-NEXT: ret void 40; 41; PRELOAD-LABEL: define amdgpu_kernel void @no_free_sgprs_block_count_x( 42; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i512 inreg [[TMP0:%.*]]) #[[ATTR0]] { 43; PRELOAD-NEXT: [[NO_FREE_SGPRS_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(328) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() 44; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 45; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 46; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 47; PRELOAD-NEXT: ret void 48; 49 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 50 %load = load i32, ptr addrspace(4) %imp_arg_ptr 51 store i32 %load, ptr addrspace(1) %out 52 ret void 53} 54 55define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) %out) { 56; NO-PRELOAD-LABEL: define amdgpu_kernel void @preloadremainder_z( 57; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { 58; NO-PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() 59; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOADREMAINDER_Z_KERNARG_SEGMENT]], i64 0 60; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] 61; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 62; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 63; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 64; NO-PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[LOAD]] to i32 65; NO-PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT_LOAD]], align 4 66; NO-PRELOAD-NEXT: ret void 67; 68; PRELOAD-LABEL: define amdgpu_kernel void @preloadremainder_z( 69; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_REMAINDER_Z:%.*]]) #[[ATTR0]] { 70; PRELOAD-NEXT: [[PRELOADREMAINDER_Z_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() 71; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 72; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 22 73; PRELOAD-NEXT: [[LOAD:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 74; PRELOAD-NEXT: [[CONV:%.*]] = zext i16 [[_HIDDEN_REMAINDER_Z]] to i32 75; PRELOAD-NEXT: store i32 [[CONV]], ptr addrspace(1) [[OUT]], align 4 76; PRELOAD-NEXT: ret void 77; 78 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 79 %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22 80 %load = load i16, ptr addrspace(4) %gep 81 %conv = zext i16 %load to i32 82 store i32 %conv, ptr addrspace(1) %out 83 ret void 84} 85 86define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) %out) { 87; NO-PRELOAD-LABEL: define amdgpu_kernel void @preload_workgroup_size_xyz( 88; NO-PRELOAD-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { 89; NO-PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() 90; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT]], i64 0 91; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]] 92; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 93; NO-PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12 94; NO-PRELOAD-NEXT: [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2 95; NO-PRELOAD-NEXT: [[CONV_X:%.*]] = zext i16 [[LOAD_X]] to i32 96; NO-PRELOAD-NEXT: [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 14 97; NO-PRELOAD-NEXT: [[LOAD_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2 98; NO-PRELOAD-NEXT: [[CONV_Y:%.*]] = zext i16 [[LOAD_Y]] to i32 99; NO-PRELOAD-NEXT: [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16 100; NO-PRELOAD-NEXT: [[LOAD_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2 101; NO-PRELOAD-NEXT: [[CONV_Z:%.*]] = zext i16 [[LOAD_Z]] to i32 102; NO-PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0 103; NO-PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1 104; NO-PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2 105; NO-PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT_LOAD]], align 16 106; NO-PRELOAD-NEXT: ret void 107; 108; PRELOAD-LABEL: define amdgpu_kernel void @preload_workgroup_size_xyz( 109; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Y:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_Z:%.*]]) #[[ATTR0]] { 110; PRELOAD-NEXT: [[PRELOAD_WORKGROUP_SIZE_XYZ_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() 111; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 112; PRELOAD-NEXT: [[GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 12 113; PRELOAD-NEXT: [[LOAD_X:%.*]] = load i16, ptr addrspace(4) [[GEP_X]], align 2 114; PRELOAD-NEXT: [[CONV_X:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_X]] to i32 115; PRELOAD-NEXT: [[GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 14 116; PRELOAD-NEXT: [[LOAD_Y:%.*]] = load i16, ptr addrspace(4) [[GEP_Y]], align 2 117; PRELOAD-NEXT: [[CONV_Y:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Y]] to i32 118; PRELOAD-NEXT: [[GEP_Z:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 16 119; PRELOAD-NEXT: [[LOAD_Z:%.*]] = load i16, ptr addrspace(4) [[GEP_Z]], align 2 120; PRELOAD-NEXT: [[CONV_Z:%.*]] = zext i16 [[_HIDDEN_GROUP_SIZE_Z]] to i32 121; PRELOAD-NEXT: [[INS_0:%.*]] = insertelement <3 x i32> poison, i32 [[CONV_X]], i32 0 122; PRELOAD-NEXT: [[INS_1:%.*]] = insertelement <3 x i32> [[INS_0]], i32 [[CONV_Y]], i32 1 123; PRELOAD-NEXT: [[INS_2:%.*]] = insertelement <3 x i32> [[INS_1]], i32 [[CONV_Z]], i32 2 124; PRELOAD-NEXT: store <3 x i32> [[INS_2]], ptr addrspace(1) [[OUT]], align 16 125; PRELOAD-NEXT: ret void 126; 127 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 128 %gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12 129 %load_x = load i16, ptr addrspace(4) %gep_x 130 %conv_x = zext i16 %load_x to i32 131 %gep_y = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14 132 %load_y = load i16, ptr addrspace(4) %gep_y 133 %conv_y = zext i16 %load_y to i32 134 %gep_z = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16 135 %load_z = load i16, ptr addrspace(4) %gep_z 136 %conv_z = zext i16 %load_z to i32 137 %ins.0 = insertelement <3 x i32> poison, i32 %conv_x, i32 0 138 %ins.1 = insertelement <3 x i32> %ins.0, i32 %conv_y, i32 1 139 %ins.2 = insertelement <3 x i32> %ins.1, i32 %conv_z, i32 2 140 store <3 x i32> %ins.2, ptr addrspace(1) %out 141 ret void 142} 143 144define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inreg %out) { 145; NO-PRELOAD-LABEL: define amdgpu_kernel void @incorrect_type_i64_block_count_x( 146; NO-PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { 147; NO-PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() 148; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 149; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8 150; NO-PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8 151; NO-PRELOAD-NEXT: ret void 152; 153; PRELOAD-LABEL: define amdgpu_kernel void @incorrect_type_i64_block_count_x( 154; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { 155; PRELOAD-NEXT: [[INCORRECT_TYPE_I64_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() 156; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 157; PRELOAD-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(4) [[IMP_ARG_PTR]], align 8 158; PRELOAD-NEXT: store i64 [[LOAD]], ptr addrspace(1) [[OUT]], align 8 159; PRELOAD-NEXT: ret void 160; 161 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 162 %load = load i64, ptr addrspace(4) %imp_arg_ptr 163 store i64 %load, ptr addrspace(1) %out 164 ret void 165} 166 167define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) { 168; NO-PRELOAD-LABEL: define amdgpu_kernel void @random_incorrect_offset( 169; NO-PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { 170; NO-PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() 171; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 172; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2 173; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 174; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 175; NO-PRELOAD-NEXT: ret void 176; 177; PRELOAD-LABEL: define amdgpu_kernel void @random_incorrect_offset( 178; PRELOAD-SAME: ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] { 179; PRELOAD-NEXT: [[RANDOM_INCORRECT_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() 180; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 181; PRELOAD-NEXT: [[GEP:%.*]] = getelementptr i8, ptr addrspace(4) [[IMP_ARG_PTR]], i32 2 182; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[GEP]], align 4 183; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4 184; PRELOAD-NEXT: ret void 185; 186 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 187 %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2 188 %load = load i32, ptr addrspace(4) %gep 189 store i32 %load, ptr addrspace(1) %out 190 ret void 191} 192 193define amdgpu_kernel void @incompatible_attribute_block_count_x(ptr addrspace(1) byref(i32) %out) { 194; NO-PRELOAD-LABEL: define amdgpu_kernel void @incompatible_attribute_block_count_x( 195; NO-PRELOAD-SAME: ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0]] { 196; NO-PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() 197; NO-PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 198; NO-PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1) 199; NO-PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 200; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 201; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4 202; NO-PRELOAD-NEXT: ret void 203; 204; PRELOAD-LABEL: define amdgpu_kernel void @incompatible_attribute_block_count_x( 205; PRELOAD-SAME: ptr addrspace(1) byref(i32) [[OUT:%.*]]) #[[ATTR0]] { 206; PRELOAD-NEXT: [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() 207; PRELOAD-NEXT: [[OUT_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[INCOMPATIBLE_ATTRIBUTE_BLOCK_COUNT_X_KERNARG_SEGMENT]], i64 0 208; PRELOAD-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[OUT_BYVAL_KERNARG_OFFSET]] to ptr addrspace(1) 209; PRELOAD-NEXT: [[IMP_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 210; PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[IMP_ARG_PTR]], align 4 211; PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[TMP1]], align 4 212; PRELOAD-NEXT: ret void 213; 214 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() 215 %load = load i32, ptr addrspace(4) %imp_arg_ptr 216 store i32 %load, ptr addrspace(1) %out 217 ret void 218} 219 220;. 221; NO-PRELOAD: [[META0]] = !{} 222;. 223