xref: /llvm-project/llvm/test/CodeGen/AMDGPU/udivrem24.ll (revision 9e9907f1cfa424366fba58d9520f9305b537cec9)
1; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
3; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
4
; udiv24_i8 - unsigned i8 division. The numerator is loaded from %in, the
; denominator from the byte right after it, and the quotient is stored to %out.
; The amdgcn checks expect two v_cvt_f32_ubyte conversions, a v_rcp_iflag_f32
; and a closing v_cvt_u32_f32; the r600 checks expect UINT_TO_FLT (twice),
; RECIP_IEEE and FLT_TO_UINT.
5; FUNC-LABEL: {{^}}udiv24_i8:
6; SI: v_cvt_f32_ubyte
7; SI-DAG: v_cvt_f32_ubyte
8; SI-DAG: v_rcp_iflag_f32
9; SI: v_cvt_u32_f32
10
11; EG: UINT_TO_FLT
12; EG-DAG: UINT_TO_FLT
13; EG-DAG: RECIP_IEEE
14; EG: FLT_TO_UINT
15define amdgpu_kernel void @udiv24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i8 element at index 1 of %in.
16  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
17  %num = load i8, ptr addrspace(1) %in
18  %den = load i8, ptr addrspace(1) %den_ptr
19  %result = udiv i8 %num, %den
20  store i8 %result, ptr addrspace(1) %out
21  ret void
22}
23
; udiv24_i8_denorm_flush_in_out - the same i8 udiv body as udiv24_i8, but the
; kernel carries attribute group #0, which this file declares as
; "denormal-fp-math-f32"="preserve-sign,preserve-sign".
; The expected instructions are the same as for the attribute-free variant:
; two integer-to-float conversions, a reciprocal and a float-to-unsigned
; conversion on both targets.
24; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_in_out:
25; SI: v_cvt_f32_ubyte
26; SI-DAG: v_cvt_f32_ubyte
27; SI-DAG: v_rcp_iflag_f32
28; SI: v_cvt_u32_f32
29
30; EG: UINT_TO_FLT
31; EG-DAG: UINT_TO_FLT
32; EG-DAG: RECIP_IEEE
33; EG: FLT_TO_UINT
34define amdgpu_kernel void @udiv24_i8_denorm_flush_in_out(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  ; %den_ptr addresses the i8 element at index 1 of %in.
35  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
36  %num = load i8, ptr addrspace(1) %in
37  %den = load i8, ptr addrspace(1) %den_ptr
38  %result = udiv i8 %num, %den
39  store i8 %result, ptr addrspace(1) %out
40  ret void
41}
42
; udiv24_i8_denorm_flush_in - the same i8 udiv body as udiv24_i8, but the
; kernel carries attribute group #1, which this file declares as
; "denormal-fp-math-f32"="ieee,preserve-sign".
; The expected instructions are the same as for the attribute-free variant:
; two integer-to-float conversions, a reciprocal and a float-to-unsigned
; conversion on both targets.
43; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_in:
44; SI: v_cvt_f32_ubyte
45; SI-DAG: v_cvt_f32_ubyte
46; SI-DAG: v_rcp_iflag_f32
47; SI: v_cvt_u32_f32
48
49; EG: UINT_TO_FLT
50; EG-DAG: UINT_TO_FLT
51; EG-DAG: RECIP_IEEE
52; EG: FLT_TO_UINT
53define amdgpu_kernel void @udiv24_i8_denorm_flush_in(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
  ; %den_ptr addresses the i8 element at index 1 of %in.
54  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
55  %num = load i8, ptr addrspace(1) %in
56  %den = load i8, ptr addrspace(1) %den_ptr
57  %result = udiv i8 %num, %den
58  store i8 %result, ptr addrspace(1) %out
59  ret void
60}
61
; udiv24_i8_denorm_flush_out - the same i8 udiv body as udiv24_i8, but the
; kernel carries attribute group #2, which this file declares as
; "denormal-fp-math-f32"="preserve-sign,ieee".
; The expected instructions are the same as for the attribute-free variant:
; two integer-to-float conversions, a reciprocal and a float-to-unsigned
; conversion on both targets.
62; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_out:
63; SI: v_cvt_f32_ubyte
64; SI-DAG: v_cvt_f32_ubyte
65; SI-DAG: v_rcp_iflag_f32
66; SI: v_cvt_u32_f32
67
68; EG: UINT_TO_FLT
69; EG-DAG: UINT_TO_FLT
70; EG-DAG: RECIP_IEEE
71; EG: FLT_TO_UINT
72define amdgpu_kernel void @udiv24_i8_denorm_flush_out(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 {
  ; %den_ptr addresses the i8 element at index 1 of %in.
73  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
74  %num = load i8, ptr addrspace(1) %in
75  %den = load i8, ptr addrspace(1) %den_ptr
76  %result = udiv i8 %num, %den
77  store i8 %result, ptr addrspace(1) %out
78  ret void
79}
80
; udiv24_i16 - unsigned i16 division of two consecutive 2-byte-aligned i16
; values read from %in; the quotient is stored to %out.
; The amdgcn checks expect, in this order, two v_cvt_f32_u32, a
; v_rcp_iflag_f32 and a v_cvt_u32_f32 (plain ordered checks, no DAG group).
; The r600 checks expect UINT_TO_FLT (twice), RECIP_IEEE and FLT_TO_UINT.
81; FUNC-LABEL: {{^}}udiv24_i16:
82; SI: v_cvt_f32_u32
83; SI: v_cvt_f32_u32
84; SI: v_rcp_iflag_f32
85; SI: v_cvt_u32_f32
86
87; EG: UINT_TO_FLT
88; EG-DAG: UINT_TO_FLT
89; EG-DAG: RECIP_IEEE
90; EG: FLT_TO_UINT
91define amdgpu_kernel void @udiv24_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i16 element at index 1 of %in.
92  %den_ptr = getelementptr i16, ptr addrspace(1) %in, i16 1
93  %num = load i16, ptr addrspace(1) %in, align 2
94  %den = load i16, ptr addrspace(1) %den_ptr, align 2
95  %result = udiv i16 %num, %den
96  store i16 %result, ptr addrspace(1) %out, align 2
97  ret void
98}
99
; udiv23_i32 - unsigned i32 division where both operands are first reduced to
; their low 23 bits. The amdgcn checks expect two v_cvt_f32_u32 conversions,
; a v_rcp_iflag_f32 and a closing v_cvt_u32_f32; the r600 checks expect
; UINT_TO_FLT (twice), RECIP_IEEE and FLT_TO_UINT.
100; FUNC-LABEL: {{^}}udiv23_i32:
101; SI: v_cvt_f32_u32
102; SI-DAG: v_cvt_f32_u32
103; SI-DAG: v_rcp_iflag_f32
104; SI: v_cvt_u32_f32
105
106; EG: UINT_TO_FLT
107; EG-DAG: UINT_TO_FLT
108; EG-DAG: RECIP_IEEE
109; EG: FLT_TO_UINT
110define amdgpu_kernel void @udiv23_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i32 element at index 1 of %in.
111  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
112  %num = load i32, ptr addrspace(1) %in, align 4
113  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; shl followed by lshr by 9 zeroes the top 9 bits, leaving 23-bit values.
114  %num.i23.0 = shl i32 %num, 9
115  %den.i23.0 = shl i32 %den, 9
116  %num.i23 = lshr i32 %num.i23.0, 9
117  %den.i23 = lshr i32 %den.i23.0, 9
118  %result = udiv i32 %num.i23, %den.i23
119  store i32 %result, ptr addrspace(1) %out, align 4
120  ret void
121}
122
; udiv24_i32 - unsigned i32 division where both operands are first reduced to
; their low 24 bits. The amdgcn checks require a v_rcp_iflag match and forbid
; any v_rcp_f32 after it; the r600 checks forbid RECIP_IEEE.
; NOTE(review): unlike udiv23_i32, the r600 side must not contain RECIP_IEEE
; here - presumably 24 significant bits in both operands is not eligible for
; the float-based expansion; confirm against the lowering code.
123; FUNC-LABEL: {{^}}udiv24_i32:
124; SI: v_rcp_iflag
125; SI-NOT: v_rcp_f32
126; EG-NOT: RECIP_IEEE
127define amdgpu_kernel void @udiv24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i32 element at index 1 of %in.
128  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
129  %num = load i32, ptr addrspace(1) %in, align 4
130  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; shl followed by lshr by 8 zeroes the top 8 bits, leaving 24-bit values.
131  %num.i24.0 = shl i32 %num, 8
132  %den.i24.0 = shl i32 %den, 8
133  %num.i24 = lshr i32 %num.i24.0, 8
134  %den.i24 = lshr i32 %den.i24.0, 8
135  %result = udiv i32 %num.i24, %den.i24
136  store i32 %result, ptr addrspace(1) %out, align 4
137  ret void
138}
139
; no_udiv24_u23_u24_i32 - unsigned i32 division with a 23-bit numerator and a
; 24-bit denominator. The amdgcn checks require a v_rcp_iflag match and forbid
; any v_rcp_f32 after it; the r600 checks forbid RECIP_IEEE.
140; FUNC-LABEL: {{^}}no_udiv24_u23_u24_i32:
141; SI: v_rcp_iflag
142; SI-NOT: v_rcp_f32
143; EG-NOT: RECIP_IEEE
144define amdgpu_kernel void @no_udiv24_u23_u24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i32 element at index 1 of %in.
145  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
146  %num = load i32, ptr addrspace(1) %in, align 4
147  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; Numerator: top 9 bits cleared (23 bits kept).
  ; Denominator: top 8 bits cleared (24 bits kept).
148  %num.i23.0 = shl i32 %num, 9
149  %den.i24.0 = shl i32 %den, 8
150  %num.i23 = lshr i32 %num.i23.0, 9
151  %den.i24 = lshr i32 %den.i24.0, 8
152  %result = udiv i32 %num.i23, %den.i24
153  store i32 %result, ptr addrspace(1) %out, align 4
154  ret void
155}
156
; no_udiv24_u24_u23_i32 - unsigned i32 division with a 24-bit numerator and a
; 23-bit denominator. The amdgcn checks require a v_rcp_iflag match and forbid
; any v_rcp_f32 after it; the r600 checks forbid RECIP_IEEE.
157; FUNC-LABEL: {{^}}no_udiv24_u24_u23_i32:
158; SI: v_rcp_iflag
159; SI-NOT: v_rcp_f32
160; EG-NOT: RECIP_IEEE
161define amdgpu_kernel void @no_udiv24_u24_u23_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i32 element at index 1 of %in.
162  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
163  %num = load i32, ptr addrspace(1) %in, align 4
164  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; Numerator: top 8 bits cleared (24 bits kept).
  ; Denominator: top 9 bits cleared (23 bits kept).
165  %num.i24.0 = shl i32 %num, 8
166  %den.i23.0 = shl i32 %den, 9
167  %num.i24 = lshr i32 %num.i24.0, 8
168  %den.i23 = lshr i32 %den.i23.0, 9
169  %result = udiv i32 %num.i24, %den.i23
170  store i32 %result, ptr addrspace(1) %out, align 4
171  ret void
172}
173
; udiv25_i32 - unsigned i32 division where both operands keep their low 25
; bits. The amdgcn checks require a v_rcp_iflag match and forbid any v_rcp_f32
; after it; the r600 checks forbid both UINT_TO_FLT and RECIP_IEEE.
174; FUNC-LABEL: {{^}}udiv25_i32:
175; RCP_IFLAG is for URECIP in the full 32b alg
176; SI: v_rcp_iflag
177; SI-NOT: v_rcp_f32
178
179; EG-NOT: UINT_TO_FLT
180; EG-NOT: RECIP_IEEE
181define amdgpu_kernel void @udiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i32 element at index 1 of %in.
182  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
183  %num = load i32, ptr addrspace(1) %in, align 4
184  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; shl followed by lshr by 7 zeroes the top 7 bits, leaving 25-bit values.
185  %num.i25.0 = shl i32 %num, 7
186  %den.i25.0 = shl i32 %den, 7
187  %num.i25 = lshr i32 %num.i25.0, 7
188  %den.i25 = lshr i32 %den.i25.0, 7
189  %result = udiv i32 %num.i25, %den.i25
190  store i32 %result, ptr addrspace(1) %out, align 4
191  ret void
192}
193
; test_no_udiv24_i32_1 - unsigned i32 division with a 24-bit numerator and a
; 25-bit denominator. The amdgcn checks require a v_rcp_iflag match and forbid
; any v_rcp_f32 after it; the r600 checks forbid both UINT_TO_FLT and
; RECIP_IEEE.
194; FUNC-LABEL: {{^}}test_no_udiv24_i32_1:
195; RCP_IFLAG is for URECIP in the full 32b alg
196; SI: v_rcp_iflag
197; SI-NOT: v_rcp_f32
198
199; EG-NOT: UINT_TO_FLT
200; EG-NOT: RECIP_IEEE
201define amdgpu_kernel void @test_no_udiv24_i32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i32 element at index 1 of %in.
202  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
203  %num = load i32, ptr addrspace(1) %in, align 4
204  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; Numerator: top 8 bits cleared (24 bits kept).
  ; Denominator: only the top 7 bits cleared, so 25 bits are kept; the value
  ; names carry the real width.
205  %num.i24.0 = shl i32 %num, 8
206  %den.i25.0 = shl i32 %den, 7
207  %num.i24 = lshr i32 %num.i24.0, 8
208  %den.i25 = lshr i32 %den.i25.0, 7
209  %result = udiv i32 %num.i24, %den.i25
210  store i32 %result, ptr addrspace(1) %out, align 4
211  ret void
212}
213
; test_no_udiv24_i32_2 - unsigned i32 division with a 25-bit numerator and a
; 24-bit denominator. The amdgcn checks require a v_rcp_iflag match and forbid
; any v_rcp_f32 after it; the r600 checks forbid both UINT_TO_FLT and
; RECIP_IEEE.
214; FUNC-LABEL: {{^}}test_no_udiv24_i32_2:
215; RCP_IFLAG is for URECIP in the full 32b alg
216; SI: v_rcp_iflag
217; SI-NOT: v_rcp_f32
218
219; EG-NOT: UINT_TO_FLT
220; EG-NOT: RECIP_IEEE
221define amdgpu_kernel void @test_no_udiv24_i32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i32 element at index 1 of %in.
222  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
223  %num = load i32, ptr addrspace(1) %in, align 4
224  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; Numerator: only the top 7 bits cleared, so 25 bits are kept; the value
  ; names carry the real width.
  ; Denominator: top 8 bits cleared (24 bits kept).
225  %num.i25.0 = shl i32 %num, 7
226  %den.i24.0 = shl i32 %den, 8
227  %num.i25 = lshr i32 %num.i25.0, 7
228  %den.i24 = lshr i32 %den.i24.0, 8
229  %result = udiv i32 %num.i25, %den.i24
230  store i32 %result, ptr addrspace(1) %out, align 4
231  ret void
232}
233
; urem24_i8 - unsigned i8 remainder. The numerator is loaded from %in, the
; denominator from the byte right after it, and the remainder is stored to
; %out. The amdgcn checks expect two v_cvt_f32_ubyte conversions, a
; v_rcp_iflag_f32 and a closing v_cvt_u32_f32; the r600 checks expect
; UINT_TO_FLT (twice), RECIP_IEEE and FLT_TO_UINT.
234; FUNC-LABEL: {{^}}urem24_i8:
235; SI: v_cvt_f32_ubyte
236; SI-DAG: v_cvt_f32_ubyte
237; SI-DAG: v_rcp_iflag_f32
238; SI: v_cvt_u32_f32
239
240; EG: UINT_TO_FLT
241; EG-DAG: UINT_TO_FLT
242; EG-DAG: RECIP_IEEE
243; EG: FLT_TO_UINT
244define amdgpu_kernel void @urem24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i8 element at index 1 of %in.
245  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
246  %num = load i8, ptr addrspace(1) %in
247  %den = load i8, ptr addrspace(1) %den_ptr
248  %result = urem i8 %num, %den
249  store i8 %result, ptr addrspace(1) %out
250  ret void
251}
252
; urem24_i16 - unsigned i16 remainder of two consecutive 2-byte-aligned i16
; values read from %in; the remainder is stored to %out.
; The amdgcn checks expect, in this order, two v_cvt_f32_u32, a
; v_rcp_iflag_f32 and a v_cvt_u32_f32 (plain ordered checks, no DAG group).
; The r600 checks expect UINT_TO_FLT (twice), RECIP_IEEE and FLT_TO_UINT.
253; FUNC-LABEL: {{^}}urem24_i16:
254; SI: v_cvt_f32_u32
255; SI: v_cvt_f32_u32
256; SI: v_rcp_iflag_f32
257; SI: v_cvt_u32_f32
258
259; EG: UINT_TO_FLT
260; EG-DAG: UINT_TO_FLT
261; EG-DAG: RECIP_IEEE
262; EG: FLT_TO_UINT
263define amdgpu_kernel void @urem24_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i16 element at index 1 of %in.
264  %den_ptr = getelementptr i16, ptr addrspace(1) %in, i16 1
265  %num = load i16, ptr addrspace(1) %in, align 2
266  %den = load i16, ptr addrspace(1) %den_ptr, align 2
267  %result = urem i16 %num, %den
268  store i16 %result, ptr addrspace(1) %out, align 2
269  ret void
270}
271
; urem24_i32 - unsigned i32 remainder where both operands are first reduced to
; their low 24 bits. Only negative checks are present: the amdgcn output must
; not contain v_rcp_f32 and the r600 output must not contain RECIP_IEEE.
; NOTE(review): the sibling udiv24_i32 test also has a positive v_rcp_iflag
; check that this one lacks - confirm whether that omission is intentional.
272; FUNC-LABEL: {{^}}urem24_i32:
273; SI-NOT: v_rcp_f32
274; EG-NOT: RECIP_IEEE
275define amdgpu_kernel void @urem24_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i32 element at index 1 of %in.
276  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
277  %num = load i32, ptr addrspace(1) %in, align 4
278  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; shl followed by lshr by 8 zeroes the top 8 bits, leaving 24-bit values.
279  %num.i24.0 = shl i32 %num, 8
280  %den.i24.0 = shl i32 %den, 8
281  %num.i24 = lshr i32 %num.i24.0, 8
282  %den.i24 = lshr i32 %den.i24.0, 8
283  %result = urem i32 %num.i24, %den.i24
284  store i32 %result, ptr addrspace(1) %out, align 4
285  ret void
286}
287
; urem25_i32 - unsigned i32 remainder where both operands keep their low 25
; bits. The amdgcn checks require a v_rcp_iflag match and forbid any v_rcp_f32
; after it; the r600 checks forbid both UINT_TO_FLT and RECIP_IEEE.
288; FUNC-LABEL: {{^}}urem25_i32:
289; RCP_IFLAG is for URECIP in the full 32b alg
290; SI: v_rcp_iflag
291; SI-NOT: v_rcp_f32
292
293; EG-NOT: UINT_TO_FLT
294; EG-NOT: RECIP_IEEE
295define amdgpu_kernel void @urem25_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i32 element at index 1 of %in.
296  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
297  %num = load i32, ptr addrspace(1) %in, align 4
298  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; shl followed by lshr by 7 zeroes the top 7 bits, leaving 25-bit values;
  ; the value names carry that width.
299  %num.i25.0 = shl i32 %num, 7
300  %den.i25.0 = shl i32 %den, 7
301  %num.i25 = lshr i32 %num.i25.0, 7
302  %den.i25 = lshr i32 %den.i25.0, 7
303  %result = urem i32 %num.i25, %den.i25
304  store i32 %result, ptr addrspace(1) %out, align 4
305  ret void
306}
307
; test_no_urem24_i32_1 - unsigned i32 remainder with a 24-bit numerator and a
; 25-bit denominator. The amdgcn checks require a v_rcp_iflag match and forbid
; any v_rcp_f32 after it; the r600 checks forbid both UINT_TO_FLT and
; RECIP_IEEE.
308; FUNC-LABEL: {{^}}test_no_urem24_i32_1:
309; RCP_IFLAG is for URECIP in the full 32b alg
310; SI: v_rcp_iflag
311; SI-NOT: v_rcp_f32
312
313; EG-NOT: UINT_TO_FLT
314; EG-NOT: RECIP_IEEE
315define amdgpu_kernel void @test_no_urem24_i32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i32 element at index 1 of %in.
316  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
317  %num = load i32, ptr addrspace(1) %in, align 4
318  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; Numerator: top 8 bits cleared (24 bits kept).
  ; Denominator: only the top 7 bits cleared, so 25 bits are kept; the value
  ; names carry the real width.
319  %num.i24.0 = shl i32 %num, 8
320  %den.i25.0 = shl i32 %den, 7
321  %num.i24 = lshr i32 %num.i24.0, 8
322  %den.i25 = lshr i32 %den.i25.0, 7
323  %result = urem i32 %num.i24, %den.i25
324  store i32 %result, ptr addrspace(1) %out, align 4
325  ret void
326}
327
; test_no_urem24_i32_2 - unsigned i32 remainder with a 25-bit numerator and a
; 24-bit denominator. The amdgcn checks require a v_rcp_iflag match and forbid
; any v_rcp_f32 after it; the r600 checks forbid both UINT_TO_FLT and
; RECIP_IEEE.
328; FUNC-LABEL: {{^}}test_no_urem24_i32_2:
329; RCP_IFLAG is for URECIP in the full 32b alg
330; SI: v_rcp_iflag
331; SI-NOT: v_rcp_f32
332
333; EG-NOT: UINT_TO_FLT
334; EG-NOT: RECIP_IEEE
335define amdgpu_kernel void @test_no_urem24_i32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i32 element at index 1 of %in.
336  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
337  %num = load i32, ptr addrspace(1) %in, align 4
338  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; Numerator: only the top 7 bits cleared, so 25 bits are kept; the value
  ; names carry the real width.
  ; Denominator: top 8 bits cleared (24 bits kept).
339  %num.i25.0 = shl i32 %num, 7
340  %den.i24.0 = shl i32 %den, 8
341  %num.i25 = lshr i32 %num.i25.0, 7
342  %den.i24 = lshr i32 %den.i24.0, 8
343  %result = urem i32 %num.i25, %den.i24
344  store i32 %result, ptr addrspace(1) %out, align 4
345  ret void
346}
347
; test_udiv24_u16_u23_i32 - unsigned i32 division with a 16-bit numerator and
; a 23-bit denominator. The amdgcn checks expect a v_rcp_iflag_f32 followed by
; a v_and_b32_e32 whose first source is the literal 0x7fffff (a 23-bit mask);
; the r600 checks expect RECIP_IEEE.
348; FUNC-LABEL: {{^}}test_udiv24_u16_u23_i32:
349; SI: v_rcp_iflag_f32
350; SI: v_and_b32_e32 v{{[0-9]+}}, 0x7fffff,
351
352; EG: RECIP_IEEE
353define amdgpu_kernel void @test_udiv24_u16_u23_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i32 element at index 1 of %in.
354  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
355  %num = load i32, ptr addrspace(1) %in, align 4
356  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; Numerator: top 16 bits cleared (16 bits kept).
  ; Denominator: top 9 bits cleared (23 bits kept).
357  %num.i16.0 = shl i32 %num, 16
358  %den.i23.0 = shl i32 %den, 9
359  %num.i16 = lshr i32 %num.i16.0, 16
360  %den.i23 = lshr i32 %den.i23.0, 9
361  %result = udiv i32 %num.i16, %den.i23
362  store i32 %result, ptr addrspace(1) %out, align 4
363  ret void
364}
365
; test_udiv24_u23_u16_i32 - unsigned i32 division with a 23-bit numerator and
; a 16-bit denominator. The amdgcn checks expect a v_rcp_iflag_f32 followed by
; a v_and_b32_e32 whose first source is the literal 0x7fffff (a 23-bit mask);
; the r600 checks expect RECIP_IEEE.
366; FUNC-LABEL: {{^}}test_udiv24_u23_u16_i32:
367; SI: v_rcp_iflag_f32
368; SI: v_and_b32_e32 v{{[0-9]+}}, 0x7fffff,
369
370; EG: RECIP_IEEE
371define amdgpu_kernel void @test_udiv24_u23_u16_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
  ; %den_ptr addresses the i32 element at index 1 of %in.
372  %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
373  %num = load i32, ptr addrspace(1) %in, align 4
374  %den = load i32, ptr addrspace(1) %den_ptr, align 4
  ; Numerator: top 9 bits cleared (23 bits kept).
  ; Denominator: top 16 bits cleared (16 bits kept).
375  %num.i23.0 = shl i32 %num, 9
376  %den.i16.0 = shl i32 %den, 16
377  %num.i23 = lshr i32 %num.i23.0, 9
378  %den.i16 = lshr i32 %den.i16.0, 16
379  %result = udiv i32 %num.i23, %den.i16
380  store i32 %result, ptr addrspace(1) %out, align 4
381  ret void
382}
383
; Attribute groups used by the udiv24_i8_denorm_* kernels. Per the LLVM
; Language Reference, a denormal-fp-math value is a pair: the first mode
; applies to results of floating-point operations, the second to their inputs.
;   #0 - udiv24_i8_denorm_flush_in_out: preserve-sign for results and inputs.
;   #1 - udiv24_i8_denorm_flush_in:     ieee results, preserve-sign inputs.
;   #2 - udiv24_i8_denorm_flush_out:    preserve-sign results, ieee inputs.
384attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
385attributes #1 = { "denormal-fp-math-f32"="ieee,preserve-sign" }
386attributes #2 = { "denormal-fp-math-f32"="preserve-sign,ieee" }
387