xref: /llvm-project/llvm/test/CodeGen/AMDGPU/preload-kernargs-IR-lowering.ll (revision e31bfc040a240c4ba8b24df840290d6f31b83bfc)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
2; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
3; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=1 -S < %s | FileCheck -check-prefix=PRELOAD-1 %s
4; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=3 -S < %s | FileCheck -check-prefix=PRELOAD-3 %s
5; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=8 -S < %s | FileCheck -check-prefix=PRELOAD-8 %s
6
7define amdgpu_kernel void @test_preload_IR_lowering_kernel_2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
8; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
9; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
10; NO-PRELOAD-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
11; NO-PRELOAD-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 0
12; NO-PRELOAD-NEXT:    [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0:![0-9]+]]
13; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 8
14; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
15; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
16; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
17; NO-PRELOAD-NEXT:    ret void
18;
19; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
20; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
21; PRELOAD-1-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
22; PRELOAD-1-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT]], i64 8
23; PRELOAD-1-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]]
24; PRELOAD-1-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
25; PRELOAD-1-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
26; PRELOAD-1-NEXT:    ret void
27;
28; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
29; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
30; PRELOAD-3-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
31; PRELOAD-3-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
32; PRELOAD-3-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
33; PRELOAD-3-NEXT:    ret void
34;
35; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_2
36; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
37; PRELOAD-8-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
38; PRELOAD-8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
39; PRELOAD-8-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
40; PRELOAD-8-NEXT:    ret void
41;
42  %load = load i32, ptr addrspace(1) %in
43  store i32 %load, ptr addrspace(1) %out
44  ret void
45}
46
47define amdgpu_kernel void @test_preload_IR_lowering_kernel_4(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) #0 {
48; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4
49; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
50; NO-PRELOAD-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
51; NO-PRELOAD-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 0
52; NO-PRELOAD-NEXT:    [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
53; NO-PRELOAD-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 8
54; NO-PRELOAD-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
55; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 16
56; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
57; NO-PRELOAD-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24
58; NO-PRELOAD-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
59; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
60; NO-PRELOAD-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
61; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
62; NO-PRELOAD-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
63; NO-PRELOAD-NEXT:    ret void
64;
65; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4
66; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
67; PRELOAD-1-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
68; PRELOAD-1-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 8
69; PRELOAD-1-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
70; PRELOAD-1-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 16
71; PRELOAD-1-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
72; PRELOAD-1-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24
73; PRELOAD-1-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
74; PRELOAD-1-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
75; PRELOAD-1-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
76; PRELOAD-1-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
77; PRELOAD-1-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
78; PRELOAD-1-NEXT:    ret void
79;
80; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4
81; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
82; PRELOAD-3-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
83; PRELOAD-3-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT]], i64 24
84; PRELOAD-3-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]]
85; PRELOAD-3-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
86; PRELOAD-3-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
87; PRELOAD-3-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
88; PRELOAD-3-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
89; PRELOAD-3-NEXT:    ret void
90;
91; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4
92; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
93; PRELOAD-8-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
94; PRELOAD-8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
95; PRELOAD-8-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
96; PRELOAD-8-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
97; PRELOAD-8-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
98; PRELOAD-8-NEXT:    ret void
99;
100  %load = load i32, ptr addrspace(1) %in
101  %load1 = load i32, ptr addrspace(1) %in1
102  store i32 %load, ptr addrspace(1) %out
103  store i32 %load1, ptr addrspace(1) %out1
104  ret void
105}
106
107define amdgpu_kernel void @test_preload_IR_lowering_kernel_8(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3, ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) #0 {
108; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
109; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] {
110; NO-PRELOAD-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
111; NO-PRELOAD-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 0
112; NO-PRELOAD-NEXT:    [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
113; NO-PRELOAD-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 8
114; NO-PRELOAD-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
115; NO-PRELOAD-NEXT:    [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 16
116; NO-PRELOAD-NEXT:    [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
117; NO-PRELOAD-NEXT:    [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24
118; NO-PRELOAD-NEXT:    [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
119; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32
120; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
121; NO-PRELOAD-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40
122; NO-PRELOAD-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
123; NO-PRELOAD-NEXT:    [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48
124; NO-PRELOAD-NEXT:    [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
125; NO-PRELOAD-NEXT:    [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
126; NO-PRELOAD-NEXT:    [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
127; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
128; NO-PRELOAD-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
129; NO-PRELOAD-NEXT:    [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4
130; NO-PRELOAD-NEXT:    [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4
131; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
132; NO-PRELOAD-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
133; NO-PRELOAD-NEXT:    store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4
134; NO-PRELOAD-NEXT:    store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
135; NO-PRELOAD-NEXT:    ret void
136;
137; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
138; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] {
139; PRELOAD-1-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
140; PRELOAD-1-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 8
141; PRELOAD-1-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
142; PRELOAD-1-NEXT:    [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 16
143; PRELOAD-1-NEXT:    [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
144; PRELOAD-1-NEXT:    [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24
145; PRELOAD-1-NEXT:    [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
146; PRELOAD-1-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32
147; PRELOAD-1-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
148; PRELOAD-1-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40
149; PRELOAD-1-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
150; PRELOAD-1-NEXT:    [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48
151; PRELOAD-1-NEXT:    [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
152; PRELOAD-1-NEXT:    [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
153; PRELOAD-1-NEXT:    [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
154; PRELOAD-1-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
155; PRELOAD-1-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
156; PRELOAD-1-NEXT:    [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4
157; PRELOAD-1-NEXT:    [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4
158; PRELOAD-1-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
159; PRELOAD-1-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
160; PRELOAD-1-NEXT:    store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4
161; PRELOAD-1-NEXT:    store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
162; PRELOAD-1-NEXT:    ret void
163;
164; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
165; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] {
166; PRELOAD-3-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
167; PRELOAD-3-NEXT:    [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 24
168; PRELOAD-3-NEXT:    [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
169; PRELOAD-3-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 32
170; PRELOAD-3-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
171; PRELOAD-3-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 40
172; PRELOAD-3-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
173; PRELOAD-3-NEXT:    [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 48
174; PRELOAD-3-NEXT:    [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
175; PRELOAD-3-NEXT:    [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
176; PRELOAD-3-NEXT:    [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
177; PRELOAD-3-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
178; PRELOAD-3-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
179; PRELOAD-3-NEXT:    [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4
180; PRELOAD-3-NEXT:    [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4
181; PRELOAD-3-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
182; PRELOAD-3-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
183; PRELOAD-3-NEXT:    store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4
184; PRELOAD-3-NEXT:    store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
185; PRELOAD-3-NEXT:    ret void
186;
187; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_8
188; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] {
189; PRELOAD-8-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
190; PRELOAD-8-NEXT:    [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_8_KERNARG_SEGMENT]], i64 56
191; PRELOAD-8-NEXT:    [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load [[META0:![0-9]+]]
192; PRELOAD-8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
193; PRELOAD-8-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
194; PRELOAD-8-NEXT:    [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2]], align 4
195; PRELOAD-8-NEXT:    [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3]], align 4
196; PRELOAD-8-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
197; PRELOAD-8-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
198; PRELOAD-8-NEXT:    store i32 [[LOAD2]], ptr addrspace(1) [[OUT2]], align 4
199; PRELOAD-8-NEXT:    store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4
200; PRELOAD-8-NEXT:    ret void
201;
202  %load = load i32, ptr addrspace(1) %in
203  %load1 = load i32, ptr addrspace(1) %in1
204  %load2 = load i32, ptr addrspace(1) %in2
205  %load3 = load i32, ptr addrspace(1) %in3
206  store i32 %load, ptr addrspace(1) %out
207  store i32 %load1, ptr addrspace(1) %out1
208  store i32 %load2, ptr addrspace(1) %out2
209  store i32 %load3, ptr addrspace(1) %out3
210  ret void
211}
212
213; Preload args with inreg in the NO-PRELOAD case.
214
215define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 {
216; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset
217; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
218; NO-PRELOAD-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
219; NO-PRELOAD-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0
220; NO-PRELOAD-NEXT:    [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
221; NO-PRELOAD-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8
222; NO-PRELOAD-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
223; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16
224; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
225; NO-PRELOAD-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24
226; NO-PRELOAD-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
227; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
228; NO-PRELOAD-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
229; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
230; NO-PRELOAD-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
231; NO-PRELOAD-NEXT:    ret void
232;
233; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset
234; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
235; PRELOAD-1-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
236; PRELOAD-1-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8
237; PRELOAD-1-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
238; PRELOAD-1-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16
239; PRELOAD-1-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
240; PRELOAD-1-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24
241; PRELOAD-1-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
242; PRELOAD-1-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
243; PRELOAD-1-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
244; PRELOAD-1-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
245; PRELOAD-1-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
246; PRELOAD-1-NEXT:    ret void
247;
248; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset
249; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
250; PRELOAD-3-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
251; PRELOAD-3-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
252; PRELOAD-3-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
253; PRELOAD-3-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
254; PRELOAD-3-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
255; PRELOAD-3-NEXT:    ret void
256;
257; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset
258; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
259; PRELOAD-8-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
260; PRELOAD-8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
261; PRELOAD-8-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
262; PRELOAD-8-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
263; PRELOAD-8-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
264; PRELOAD-8-NEXT:    ret void
265;
266  %load = load i32, ptr addrspace(1) %in
267  %load1 = load i32, ptr addrspace(1) %in1
268  store i32 %load, ptr addrspace(1) %out
269  store i32 %load1, ptr addrspace(1) %out1
270  ret void
271}
272
273; Only preload the first sequence of arguments with the inreg attribute. In the NO-PRELOAD case this is just the first argument.
274
275define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence(ptr addrspace(1) inreg %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 {
276; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
277; NO-PRELOAD-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
278; NO-PRELOAD-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
279; NO-PRELOAD-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8
280; NO-PRELOAD-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
281; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16
282; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
283; NO-PRELOAD-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24
284; NO-PRELOAD-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
285; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
286; NO-PRELOAD-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
287; NO-PRELOAD-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
288; NO-PRELOAD-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
289; NO-PRELOAD-NEXT:    ret void
290;
291; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
292; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
293; PRELOAD-1-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
294; PRELOAD-1-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8
295; PRELOAD-1-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
296; PRELOAD-1-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16
297; PRELOAD-1-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
298; PRELOAD-1-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24
299; PRELOAD-1-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
300; PRELOAD-1-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
301; PRELOAD-1-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
302; PRELOAD-1-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4
303; PRELOAD-1-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
304; PRELOAD-1-NEXT:    ret void
305;
306; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
307; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
308; PRELOAD-3-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
309; PRELOAD-3-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
310; PRELOAD-3-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
311; PRELOAD-3-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
312; PRELOAD-3-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
313; PRELOAD-3-NEXT:    ret void
314;
315; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_inreg_offset_two_sequence
316; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
317; PRELOAD-8-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
318; PRELOAD-8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
319; PRELOAD-8-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
320; PRELOAD-8-NEXT:    store i32 [[LOAD]], ptr addrspace(1) [[OUT]], align 4
321; PRELOAD-8-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
322; PRELOAD-8-NEXT:    ret void
323;
324  %load = load i32, ptr addrspace(1) %in
325  %load1 = load i32, ptr addrspace(1) %in1
326  store i32 %load, ptr addrspace(1) %out
327  store i32 %load1, ptr addrspace(1) %out1
328  ret void
329}
330
331define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_misaligned(i16 %arg0, ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) #0 {
332; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned
333; NO-PRELOAD-SAME: (i16 [[ARG0:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
334; NO-PRELOAD-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
335; NO-PRELOAD-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 0
336; NO-PRELOAD-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]]
337; NO-PRELOAD-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
338; NO-PRELOAD-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8
339; NO-PRELOAD-NEXT:    [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
340; NO-PRELOAD-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16
341; NO-PRELOAD-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
342; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24
343; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
344; NO-PRELOAD-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32
345; NO-PRELOAD-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
346; NO-PRELOAD-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
347; NO-PRELOAD-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
348; NO-PRELOAD-NEXT:    [[EXT:%.*]] = zext i16 [[TMP2]] to i32
349; NO-PRELOAD-NEXT:    [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
350; NO-PRELOAD-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
351; NO-PRELOAD-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
352; NO-PRELOAD-NEXT:    ret void
353;
354; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned
355; PRELOAD-1-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
356; PRELOAD-1-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
357; PRELOAD-1-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8
358; PRELOAD-1-NEXT:    [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
359; PRELOAD-1-NEXT:    [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16
360; PRELOAD-1-NEXT:    [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
361; PRELOAD-1-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24
362; PRELOAD-1-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
363; PRELOAD-1-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32
364; PRELOAD-1-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
365; PRELOAD-1-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4
366; PRELOAD-1-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4
367; PRELOAD-1-NEXT:    [[EXT:%.*]] = zext i16 [[ARG0]] to i32
368; PRELOAD-1-NEXT:    [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
369; PRELOAD-1-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
370; PRELOAD-1-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
371; PRELOAD-1-NEXT:    ret void
372;
373; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned
374; PRELOAD-3-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] {
375; PRELOAD-3-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
376; PRELOAD-3-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24
377; PRELOAD-3-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
378; PRELOAD-3-NEXT:    [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32
379; PRELOAD-3-NEXT:    [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load [[META0]]
380; PRELOAD-3-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
381; PRELOAD-3-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
382; PRELOAD-3-NEXT:    [[EXT:%.*]] = zext i16 [[ARG0]] to i32
383; PRELOAD-3-NEXT:    [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
384; PRELOAD-3-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
385; PRELOAD-3-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4
386; PRELOAD-3-NEXT:    ret void
387;
388; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_misaligned
389; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] {
390; PRELOAD-8-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
391; PRELOAD-8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN]], align 4
392; PRELOAD-8-NEXT:    [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1]], align 4
393; PRELOAD-8-NEXT:    [[EXT:%.*]] = zext i16 [[ARG0]] to i32
394; PRELOAD-8-NEXT:    [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]]
395; PRELOAD-8-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
396; PRELOAD-8-NEXT:    store i32 [[LOAD1]], ptr addrspace(1) [[OUT1]], align 4
397; PRELOAD-8-NEXT:    ret void
398;
399  %load = load i32, ptr addrspace(1) %in
400  %load1 = load i32, ptr addrspace(1) %in1
401  %ext = zext i16 %arg0 to i32
402  %add = add i32 %load, %ext
403  store i32 %add, ptr addrspace(1) %out
404  store i32 %load1, ptr addrspace(1) %out1
405  ret void
406}
407
408; In this case both i16 args with be preloaded into the first SGPR.
409
410define amdgpu_kernel void @test_preload_IR_lowering_kernel_4_i16_i16(i16 %arg0, i16 %arg1, ptr addrspace(1) %out) #0 {
411; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16
412; NO-PRELOAD-SAME: (i16 [[ARG0:%.*]], i16 [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
413; NO-PRELOAD-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
414; NO-PRELOAD-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0
415; NO-PRELOAD-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]]
416; NO-PRELOAD-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
417; NO-PRELOAD-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0
418; NO-PRELOAD-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]]
419; NO-PRELOAD-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
420; NO-PRELOAD-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
421; NO-PRELOAD-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8
422; NO-PRELOAD-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
423; NO-PRELOAD-NEXT:    [[EXT:%.*]] = zext i16 [[TMP2]] to i32
424; NO-PRELOAD-NEXT:    [[EXT1:%.*]] = zext i16 [[TMP5]] to i32
425; NO-PRELOAD-NEXT:    [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
426; NO-PRELOAD-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
427; NO-PRELOAD-NEXT:    ret void
428;
429; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16
430; PRELOAD-1-SAME: (i16 inreg [[ARG0:%.*]], i16 [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
431; PRELOAD-1-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
432; PRELOAD-1-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0
433; PRELOAD-1-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load [[META0]]
434; PRELOAD-1-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
435; PRELOAD-1-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
436; PRELOAD-1-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8
437; PRELOAD-1-NEXT:    [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load [[META0]]
438; PRELOAD-1-NEXT:    [[EXT:%.*]] = zext i16 [[ARG0]] to i32
439; PRELOAD-1-NEXT:    [[EXT1:%.*]] = zext i16 [[TMP3]] to i32
440; PRELOAD-1-NEXT:    [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
441; PRELOAD-1-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4
442; PRELOAD-1-NEXT:    ret void
443;
444; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16
445; PRELOAD-3-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
446; PRELOAD-3-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
447; PRELOAD-3-NEXT:    [[EXT:%.*]] = zext i16 [[ARG0]] to i32
448; PRELOAD-3-NEXT:    [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
449; PRELOAD-3-NEXT:    [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
450; PRELOAD-3-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
451; PRELOAD-3-NEXT:    ret void
452;
453; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_IR_lowering_kernel_4_i16_i16
454; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] {
455; PRELOAD-8-NEXT:    [[TEST_PRELOAD_IR_LOWERING_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
456; PRELOAD-8-NEXT:    [[EXT:%.*]] = zext i16 [[ARG0]] to i32
457; PRELOAD-8-NEXT:    [[EXT1:%.*]] = zext i16 [[ARG1]] to i32
458; PRELOAD-8-NEXT:    [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]]
459; PRELOAD-8-NEXT:    store i32 [[ADD]], ptr addrspace(1) [[OUT]], align 4
460; PRELOAD-8-NEXT:    ret void
461;
462  %ext = zext i16 %arg0 to i32
463  %ext1 = zext i16 %arg1 to i32
464  %add = add i32 %ext, %ext1
465  store i32 %add, ptr addrspace(1) %out
466  ret void
467}
468
469attributes #0 = { nounwind }
470