; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV

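;; This test checks that OpenCL device-side enqueue (enqueue_kernel) calls are
;; translated into SPIR-V OpEnqueueKernel instructions. It covers block literals
;; built on the stack and in global memory, event wait lists, and blocks that
;; take local buffer size arguments.
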
; CHECK-SPIRV: OpEntryPoint Kernel %[[#BlockKer1:]] "__device_side_enqueue_block_invoke_kernel"
; CHECK-SPIRV: OpEntryPoint Kernel %[[#BlockKer2:]] "__device_side_enqueue_block_invoke_2_kernel"
; CHECK-SPIRV: OpEntryPoint Kernel %[[#BlockKer3:]] "__device_side_enqueue_block_invoke_3_kernel"
; CHECK-SPIRV: OpEntryPoint Kernel %[[#BlockKer4:]] "__device_side_enqueue_block_invoke_4_kernel"
; CHECK-SPIRV: OpEntryPoint Kernel %[[#BlockKer5:]] "__device_side_enqueue_block_invoke_5_kernel"
; CHECK-SPIRV: OpName %[[#BlockGlb1:]] "__block_literal_global"
; CHECK-SPIRV: OpName %[[#BlockGlb2:]] "__block_literal_global.1"

; CHECK-SPIRV: %[[#Int32Ty:]] = OpTypeInt 32
; CHECK-SPIRV: %[[#Int8Ty:]] = OpTypeInt 8
; CHECK-SPIRV: %[[#VoidTy:]] = OpTypeVoid
; CHECK-SPIRV: %[[#Int8PtrGenTy:]] = OpTypePointer Generic %[[#Int8Ty]]
; CHECK-SPIRV: %[[#EventTy:]] = OpTypeDeviceEvent
; CHECK-SPIRV: %[[#EventPtrTy:]] = OpTypePointer Generic %[[#EventTy]]
; CHECK-SPIRV: %[[#Int32LocPtrTy:]] = OpTypePointer Function %[[#Int32Ty]]
; CHECK-SPIRV: %[[#BlockStructTy:]] = OpTypeStruct
; CHECK-SPIRV: %[[#BlockStructLocPtrTy:]] = OpTypePointer Function %[[#BlockStructTy]]
; CHECK-SPIRV: %[[#BlockTy1:]] = OpTypeFunction %[[#VoidTy]] %[[#Int8PtrGenTy]]
; CHECK-SPIRV: %[[#BlockTy2:]] = OpTypeFunction %[[#VoidTy]] %[[#Int8PtrGenTy]]
; CHECK-SPIRV: %[[#BlockTy3:]] = OpTypeFunction %[[#VoidTy]] %[[#Int8PtrGenTy]]

; CHECK-SPIRV: %[[#ConstInt0:]] = OpConstant %[[#Int32Ty]] 0
; CHECK-SPIRV: %[[#EventNull:]] = OpConstantNull %[[#EventPtrTy]]
; CHECK-SPIRV: %[[#ConstInt21:]] = OpConstant %[[#Int32Ty]] 21
; CHECK-SPIRV: %[[#ConstInt8:]] = OpConstant %[[#Int32Ty]] 8
; CHECK-SPIRV: %[[#ConstInt24:]] = OpConstant %[[#Int32Ty]] 24
; CHECK-SPIRV: %[[#ConstInt12:]] = OpConstant %[[#Int32Ty]] 12
; CHECK-SPIRV: %[[#ConstInt2:]] = OpConstant %[[#Int32Ty]] 2

;; typedef struct {int a;} ndrange_t;
;; #define NULL ((void*)0)

;; kernel void device_side_enqueue(global int *a, global int *b, int i, char c0) {
;;   queue_t default_queue;
;;   unsigned flags = 0;
;;   ndrange_t ndrange;
;;   clk_event_t clk_event;
;;   clk_event_t event_wait_list;
;;   clk_event_t event_wait_list2[] = {clk_event};

;; Emits block literal on stack and block kernel.

; CHECK-SPIRV:      %[[#BlockLitPtr1:]] = OpBitcast %[[#BlockStructLocPtrTy]]
; CHECK-SPIRV-NEXT: %[[#BlockLit1:]] = OpPtrCastToGeneric %[[#Int8PtrGenTy]] %[[#BlockLitPtr1]]
; CHECK-SPIRV-NEXT: %[[#]] = OpEnqueueKernel %[[#Int32Ty]] %[[#]] %[[#]] %[[#]] %[[#ConstInt0]] %[[#EventNull]] %[[#EventNull]] %[[#BlockKer1]] %[[#BlockLit1]] %[[#ConstInt21]] %[[#ConstInt8]]

;;   enqueue_kernel(default_queue, flags, ndrange,
;;                  ^(void) {
;;                    a[i] = c0;
;;                  });

;; Emits block literal on stack and block kernel.

; CHECK-SPIRV:      %[[#Event1:]] = OpPtrCastToGeneric %[[#EventPtrTy]]
; CHECK-SPIRV:      %[[#Event2:]] = OpPtrCastToGeneric %[[#EventPtrTy]]
; CHECK-SPIRV:      %[[#BlockLitPtr2:]] = OpBitcast %[[#BlockStructLocPtrTy]]
; CHECK-SPIRV-NEXT: %[[#BlockLit2:]] = OpPtrCastToGeneric %[[#Int8PtrGenTy]] %[[#BlockLitPtr2]]
; CHECK-SPIRV-NEXT: %[[#]] = OpEnqueueKernel %[[#Int32Ty]] %[[#]] %[[#]] %[[#]] %[[#ConstInt2]] %[[#Event1]] %[[#Event2]] %[[#BlockKer2]] %[[#BlockLit2]] %[[#ConstInt24]] %[[#ConstInt8]]

;;   enqueue_kernel(default_queue, flags, ndrange, 2, &event_wait_list, &clk_event,
;;                  ^(void) {
;;                    a[i] = b[i];
;;                  });

;;   char c;
;; Emits global block literal and block kernel.

; CHECK-SPIRV: %[[#Event1:]] = OpPtrCastToGeneric %[[#EventPtrTy]]
; CHECK-SPIRV: %[[#Event2:]] = OpPtrCastToGeneric %[[#EventPtrTy]]
; CHECK-SPIRV: %[[#BlockLit3Tmp:]] = OpBitcast %[[#]] %[[#BlockGlb1]]
; CHECK-SPIRV: %[[#BlockLit3:]] = OpPtrCastToGeneric %[[#Int8PtrGenTy]] %[[#BlockLit3Tmp]]
; CHECK-SPIRV: %[[#LocalBuf31:]] = OpPtrAccessChain %[[#Int32LocPtrTy]]
; CHECK-SPIRV: %[[#]] = OpEnqueueKernel %[[#Int32Ty]] %[[#]] %[[#]] %[[#]] %[[#ConstInt2]] %[[#Event1]] %[[#Event2]] %[[#BlockKer3]] %[[#BlockLit3]] %[[#ConstInt12]] %[[#ConstInt8]] %[[#LocalBuf31]]

;;   enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event,
;;                  ^(local void *p) {
;;                    return;
;;                  },
;;                  c);

;; Emits global block literal and block kernel.

; CHECK-SPIRV:      %[[#BlockLit4Tmp:]] = OpBitcast %[[#]] %[[#BlockGlb2]]
; CHECK-SPIRV:      %[[#BlockLit4:]] = OpPtrCastToGeneric %[[#Int8PtrGenTy]] %[[#BlockLit4Tmp]]
; CHECK-SPIRV:      %[[#LocalBuf41:]] = OpPtrAccessChain %[[#Int32LocPtrTy]]
; CHECK-SPIRV-NEXT: %[[#LocalBuf42:]] = OpPtrAccessChain %[[#Int32LocPtrTy]]
; CHECK-SPIRV-NEXT: %[[#LocalBuf43:]] = OpPtrAccessChain %[[#Int32LocPtrTy]]
; CHECK-SPIRV-NEXT: %[[#]] = OpEnqueueKernel %[[#Int32Ty]] %[[#]] %[[#]] %[[#]] %[[#ConstInt0]] %[[#EventNull]] %[[#EventNull]] %[[#BlockKer4]] %[[#BlockLit4]] %[[#ConstInt12]] %[[#ConstInt8]] %[[#LocalBuf41]] %[[#LocalBuf42]] %[[#LocalBuf43]]

;;   enqueue_kernel(default_queue, flags, ndrange,
;;                  ^(local void *p1, local void *p2, local void *p3) {
;;                    return;
;;                  },
;;                  1, 2, 4);

;; Emits block literal on stack and block kernel.

; CHECK-SPIRV:      %[[#Event1:]] = OpPtrCastToGeneric %[[#EventPtrTy]]
; CHECK-SPIRV:      %[[#BlockLit5Tmp:]] = OpBitcast %[[#BlockStructLocPtrTy]]
; CHECK-SPIRV-NEXT: %[[#BlockLit5:]] = OpPtrCastToGeneric %[[#Int8PtrGenTy]] %[[#BlockLit5Tmp]]
; CHECK-SPIRV-NEXT: %[[#]] = OpEnqueueKernel %[[#Int32Ty]] %[[#]] %[[#]] %[[#]] %[[#ConstInt0]] %[[#EventNull]] %[[#Event1]] %[[#BlockKer5]] %[[#BlockLit5]] %[[#ConstInt24]] %[[#ConstInt8]]

;;   enqueue_kernel(default_queue, flags, ndrange, 0, NULL, &clk_event,
;;                  ^(void) {
;;                    a[i] = b[i];
;;                  });
;; }

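;; Each enqueued block is emitted as a separate kernel (declared as an entry
;; point above); the checks below pin every block kernel to its function type.
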
; CHECK-SPIRV-DAG: %[[#BlockKer1]] = OpFunction %[[#VoidTy]] None %[[#BlockTy1]]
; CHECK-SPIRV-DAG: %[[#BlockKer2]] = OpFunction %[[#VoidTy]] None %[[#BlockTy1]]
; CHECK-SPIRV-DAG: %[[#BlockKer3]] = OpFunction %[[#VoidTy]] None %[[#BlockTy3]]
; CHECK-SPIRV-DAG: %[[#BlockKer4]] = OpFunction %[[#VoidTy]] None %[[#BlockTy2]]
; CHECK-SPIRV-DAG: %[[#BlockKer5]] = OpFunction %[[#VoidTy]] None %[[#BlockTy1]]

%opencl.queue_t = type opaque
%struct.ndrange_t = type { i32 }
%opencl.clk_event_t = type opaque
%struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }

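;; Global block literals for the blocks that capture no variables (the variants
;; taking only local pointer parameters): { size, align, invoke function }.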
@__block_literal_global = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3 to i8*) to i8 addrspace(4)*) }, align 4
@__block_literal_global.1 = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 12, i32 4, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4 to i8*) to i8 addrspace(4)*) }, align 4

define dso_local spir_kernel void @device_side_enqueue(i32 addrspace(1)* noundef %a, i32 addrspace(1)* noundef %b, i32 noundef %i, i8 noundef signext %c0) {
entry:
  %a.addr = alloca i32 addrspace(1)*, align 4
  %b.addr = alloca i32 addrspace(1)*, align 4
  %i.addr = alloca i32, align 4
  %c0.addr = alloca i8, align 1
  %default_queue = alloca %opencl.queue_t*, align 4
  %flags = alloca i32, align 4
  %ndrange = alloca %struct.ndrange_t, align 4
  %clk_event = alloca %opencl.clk_event_t*, align 4
  %event_wait_list = alloca %opencl.clk_event_t*, align 4
  %event_wait_list2 = alloca [1 x %opencl.clk_event_t*], align 4
  %tmp = alloca %struct.ndrange_t, align 4
  %block = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, align 4
  %tmp3 = alloca %struct.ndrange_t, align 4
  %block4 = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, align 4
  %c = alloca i8, align 1
  %tmp11 = alloca %struct.ndrange_t, align 4
  %block_sizes = alloca [1 x i32], align 4
  %tmp12 = alloca %struct.ndrange_t, align 4
  %block_sizes13 = alloca [3 x i32], align 4
  %tmp14 = alloca %struct.ndrange_t, align 4
  %block15 = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, align 4
  store i32 addrspace(1)* %a, i32 addrspace(1)** %a.addr, align 4
  store i32 addrspace(1)* %b, i32 addrspace(1)** %b.addr, align 4
  store i32 %i, i32* %i.addr, align 4
  store i8 %c0, i8* %c0.addr, align 1
  store i32 0, i32* %flags, align 4
  %arrayinit.begin = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0
  %0 = load %opencl.clk_event_t*, %opencl.clk_event_t** %clk_event, align 4
  store %opencl.clk_event_t* %0, %opencl.clk_event_t** %arrayinit.begin, align 4
  %1 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
  %2 = load i32, i32* %flags, align 4
  %3 = bitcast %struct.ndrange_t* %tmp to i8*
  %4 = bitcast %struct.ndrange_t* %ndrange to i8*
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %3, i8* align 4 %4, i32 4, i1 false)
  %block.size = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 0
  store i32 21, i32* %block.size, align 4
  %block.align = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 1
  store i32 4, i32* %block.align, align 4
  %block.invoke = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 2
  store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke, align 4
  %block.captured = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 3
  %5 = load i32 addrspace(1)*, i32 addrspace(1)** %a.addr, align 4
  store i32 addrspace(1)* %5, i32 addrspace(1)** %block.captured, align 4
  %block.captured1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 4
  %6 = load i32, i32* %i.addr, align 4
  store i32 %6, i32* %block.captured1, align 4
  %block.captured2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block, i32 0, i32 5
  %7 = load i8, i8* %c0.addr, align 1
  store i8 %7, i8* %block.captured2, align 4
  %8 = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>* %block to %struct.__opencl_block_literal_generic*
  %9 = addrspacecast %struct.__opencl_block_literal_generic* %8 to i8 addrspace(4)*
  %10 = call spir_func i32 @__enqueue_kernel_basic(%opencl.queue_t* %1, i32 %2, %struct.ndrange_t* byval(%struct.ndrange_t) %tmp, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %9)
  %11 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
  %12 = load i32, i32* %flags, align 4
  %13 = bitcast %struct.ndrange_t* %tmp3 to i8*
  %14 = bitcast %struct.ndrange_t* %ndrange to i8*
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %13, i8* align 4 %14, i32 4, i1 false)
  %15 = addrspacecast %opencl.clk_event_t** %event_wait_list to %opencl.clk_event_t* addrspace(4)*
  %16 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)*
  %block.size5 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block4, i32 0, i32 0
  store i32 24, i32* %block.size5, align 4
  %block.align6 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block4, i32 0, i32 1
  store i32 4, i32* %block.align6, align 4
  %block.invoke7 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block4, i32 0, i32 2
  store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2 to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke7, align 4
  %block.captured8 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block4, i32 0, i32 3
  %17 = load i32 addrspace(1)*, i32 addrspace(1)** %a.addr, align 4
  store i32 addrspace(1)* %17, i32 addrspace(1)** %block.captured8, align 4
  %block.captured9 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block4, i32 0, i32 4
  %18 = load i32, i32* %i.addr, align 4
  store i32 %18, i32* %block.captured9, align 4
  %block.captured10 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block4, i32 0, i32 5
  %19 = load i32 addrspace(1)*, i32 addrspace(1)** %b.addr, align 4
  store i32 addrspace(1)* %19, i32 addrspace(1)** %block.captured10, align 4
  %20 = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block4 to %struct.__opencl_block_literal_generic*
  %21 = addrspacecast %struct.__opencl_block_literal_generic* %20 to i8 addrspace(4)*
  %22 = call spir_func i32 @__enqueue_kernel_basic_events(%opencl.queue_t* %11, i32 %12, %struct.ndrange_t* %tmp3, i32 2, %opencl.clk_event_t* addrspace(4)* %15, %opencl.clk_event_t* addrspace(4)* %16, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %21)
  %23 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
  %24 = load i32, i32* %flags, align 4
  %25 = bitcast %struct.ndrange_t* %tmp11 to i8*
  %26 = bitcast %struct.ndrange_t* %ndrange to i8*
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %25, i8* align 4 %26, i32 4, i1 false)
  %arraydecay = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0
  %27 = addrspacecast %opencl.clk_event_t** %arraydecay to %opencl.clk_event_t* addrspace(4)*
  %28 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)*
  %29 = getelementptr [1 x i32], [1 x i32]* %block_sizes, i32 0, i32 0
  %30 = load i8, i8* %c, align 1
  %31 = zext i8 %30 to i32
  store i32 %31, i32* %29, align 4
  %32 = call spir_func i32 @__enqueue_kernel_events_varargs(%opencl.queue_t* %23, i32 %24, %struct.ndrange_t* %tmp11, i32 2, %opencl.clk_event_t* addrspace(4)* %27, %opencl.clk_event_t* addrspace(4)* %28, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_3_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %29)
  %33 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
  %34 = load i32, i32* %flags, align 4
  %35 = bitcast %struct.ndrange_t* %tmp12 to i8*
  %36 = bitcast %struct.ndrange_t* %ndrange to i8*
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %35, i8* align 4 %36, i32 4, i1 false)
  %37 = getelementptr [3 x i32], [3 x i32]* %block_sizes13, i32 0, i32 0
  store i32 1, i32* %37, align 4
  %38 = getelementptr [3 x i32], [3 x i32]* %block_sizes13, i32 0, i32 1
  store i32 2, i32* %38, align 4
  %39 = getelementptr [3 x i32], [3 x i32]* %block_sizes13, i32 0, i32 2
  store i32 4, i32* %39, align 4
  %40 = call spir_func i32 @__enqueue_kernel_varargs(%opencl.queue_t* %33, i32 %34, %struct.ndrange_t* %tmp12, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* @__device_side_enqueue_block_invoke_4_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global.1 to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %37)
  %41 = load %opencl.queue_t*, %opencl.queue_t** %default_queue, align 4
  %42 = load i32, i32* %flags, align 4
  %43 = bitcast %struct.ndrange_t* %tmp14 to i8*
  %44 = bitcast %struct.ndrange_t* %ndrange to i8*
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %43, i8* align 4 %44, i32 4, i1 false)
  %45 = addrspacecast %opencl.clk_event_t** %clk_event to %opencl.clk_event_t* addrspace(4)*
  %block.size16 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block15, i32 0, i32 0
  store i32 24, i32* %block.size16, align 4
  %block.align17 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block15, i32 0, i32 1
  store i32 4, i32* %block.align17, align 4
  %block.invoke18 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block15, i32 0, i32 2
  store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_5 to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke18, align 4
  %block.captured19 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block15, i32 0, i32 3
  %46 = load i32 addrspace(1)*, i32 addrspace(1)** %a.addr, align 4
  store i32 addrspace(1)* %46, i32 addrspace(1)** %block.captured19, align 4
  %block.captured20 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block15, i32 0, i32 4
  %47 = load i32, i32* %i.addr, align 4
  store i32 %47, i32* %block.captured20, align 4
  %block.captured21 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block15, i32 0, i32 5
  %48 = load i32 addrspace(1)*, i32 addrspace(1)** %b.addr, align 4
  store i32 addrspace(1)* %48, i32 addrspace(1)** %block.captured21, align 4
  %49 = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block15 to %struct.__opencl_block_literal_generic*
  %50 = addrspacecast %struct.__opencl_block_literal_generic* %49 to i8 addrspace(4)*
  %51 = call spir_func i32 @__enqueue_kernel_basic_events(%opencl.queue_t* %41, i32 %42, %struct.ndrange_t* %tmp14, i32 0, %opencl.clk_event_t* addrspace(4)* null, %opencl.clk_event_t* addrspace(4)* %45, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* @__device_side_enqueue_block_invoke_5_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)* %50)
  ret void
}

declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg)

define internal spir_func void @__device_side_enqueue_block_invoke(i8 addrspace(4)* noundef %.block_descriptor) {
entry:
  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
  %block.addr = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)*, align 4
  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)*
  store <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)** %block.addr, align 4
  %block.capture.addr = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 5
  %0 = load i8, i8 addrspace(4)* %block.capture.addr, align 4
  %conv = sext i8 %0 to i32
  %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 3
  %1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr1, align 4
  %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i8 }> addrspace(4)* %block, i32 0, i32 4
  %2 = load i32, i32 addrspace(4)* %block.capture.addr2, align 4
  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 %2
  store i32 %conv, i32 addrspace(1)* %arrayidx, align 4
  ret void
}

define spir_kernel void @__device_side_enqueue_block_invoke_kernel(i8 addrspace(4)* %0) {
entry:
  call spir_func void @__device_side_enqueue_block_invoke(i8 addrspace(4)* %0)
  ret void
}

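;; The @__enqueue_kernel_* helpers declared below correspond to the enqueue_kernel
;; calls in the OpenCL source above; each call to them is expected to be translated
;; into an OpEnqueueKernel instruction (see the CHECK lines).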
declare spir_func i32 @__enqueue_kernel_basic(%opencl.queue_t*, i32, %struct.ndrange_t*, i8 addrspace(4)*, i8 addrspace(4)*)

define internal spir_func void @__device_side_enqueue_block_invoke_2(i8 addrspace(4)* noundef %.block_descriptor) {
entry:
  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
  %block.addr = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*, align 4
  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*
  store <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)** %block.addr, align 4
  %block.capture.addr = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 5
  %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr, align 4
  %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4
  %1 = load i32, i32 addrspace(4)* %block.capture.addr1, align 4
  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 %1
  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4
  %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3
  %3 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr2, align 4
  %block.capture.addr3 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4
  %4 = load i32, i32 addrspace(4)* %block.capture.addr3, align 4
  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %3, i32 %4
  store i32 %2, i32 addrspace(1)* %arrayidx4, align 4
  ret void
}

define spir_kernel void @__device_side_enqueue_block_invoke_2_kernel(i8 addrspace(4)* %0) {
entry:
  call spir_func void @__device_side_enqueue_block_invoke_2(i8 addrspace(4)* %0)
  ret void
}

declare spir_func i32 @__enqueue_kernel_basic_events(%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t* addrspace(4)*, %opencl.clk_event_t* addrspace(4)*, i8 addrspace(4)*, i8 addrspace(4)*)

define internal spir_func void @__device_side_enqueue_block_invoke_3(i8 addrspace(4)* noundef %.block_descriptor, i8 addrspace(3)* noundef %p) {
entry:
  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
  %p.addr = alloca i8 addrspace(3)*, align 4
  %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 4
  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
  store i8 addrspace(3)* %p, i8 addrspace(3)** %p.addr, align 4
  store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4
  ret void
}

define spir_kernel void @__device_side_enqueue_block_invoke_3_kernel(i8 addrspace(4)* %0, i8 addrspace(3)* %1) {
entry:
  call spir_func void @__device_side_enqueue_block_invoke_3(i8 addrspace(4)* %0, i8 addrspace(3)* %1)
  ret void
}

declare spir_func i32 @__enqueue_kernel_events_varargs(%opencl.queue_t*, i32, %struct.ndrange_t*, i32, %opencl.clk_event_t* addrspace(4)*, %opencl.clk_event_t* addrspace(4)*, i8 addrspace(4)*, i8 addrspace(4)*, i32, i32*)

define internal spir_func void @__device_side_enqueue_block_invoke_4(i8 addrspace(4)* noundef %.block_descriptor, i8 addrspace(3)* noundef %p1, i8 addrspace(3)* noundef %p2, i8 addrspace(3)* noundef %p3) {
entry:
  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
  %p1.addr = alloca i8 addrspace(3)*, align 4
  %p2.addr = alloca i8 addrspace(3)*, align 4
  %p3.addr = alloca i8 addrspace(3)*, align 4
  %block.addr = alloca <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*, align 4
  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)* }> addrspace(4)*
  store i8 addrspace(3)* %p1, i8 addrspace(3)** %p1.addr, align 4
  store i8 addrspace(3)* %p2, i8 addrspace(3)** %p2.addr, align 4
  store i8 addrspace(3)* %p3, i8 addrspace(3)** %p3.addr, align 4
  store <{ i32, i32, i8 addrspace(4)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)* }> addrspace(4)** %block.addr, align 4
  ret void
}

define spir_kernel void @__device_side_enqueue_block_invoke_4_kernel(i8 addrspace(4)* %0, i8 addrspace(3)* %1, i8 addrspace(3)* %2, i8 addrspace(3)* %3) {
entry:
  call spir_func void @__device_side_enqueue_block_invoke_4(i8 addrspace(4)* %0, i8 addrspace(3)* %1, i8 addrspace(3)* %2, i8 addrspace(3)* %3)
  ret void
}

declare spir_func i32 @__enqueue_kernel_varargs(%opencl.queue_t*, i32, %struct.ndrange_t*, i8 addrspace(4)*, i8 addrspace(4)*, i32, i32*)

define internal spir_func void @__device_side_enqueue_block_invoke_5(i8 addrspace(4)* noundef %.block_descriptor) {
entry:
  %.block_descriptor.addr = alloca i8 addrspace(4)*, align 4
  %block.addr = alloca <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*, align 4
  store i8 addrspace(4)* %.block_descriptor, i8 addrspace(4)** %.block_descriptor.addr, align 4
  %block = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)*
  store <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)** %block.addr, align 4
  %block.capture.addr = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 5
  %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr, align 4
  %block.capture.addr1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4
  %1 = load i32, i32 addrspace(4)* %block.capture.addr1, align 4
  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 %1
  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4
  %block.capture.addr2 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 3
  %3 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %block.capture.addr2, align 4
  %block.capture.addr3 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>, <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }> addrspace(4)* %block, i32 0, i32 4
  %4 = load i32, i32 addrspace(4)* %block.capture.addr3, align 4
  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %3, i32 %4
  store i32 %2, i32 addrspace(1)* %arrayidx4, align 4
  ret void
}

define spir_kernel void @__device_side_enqueue_block_invoke_5_kernel(i8 addrspace(4)* %0) {
entry:
  call spir_func void @__device_side_enqueue_block_invoke_5(i8 addrspace(4)* %0)
  ret void
}