xref: /llvm-project/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll (revision 310e79875752886a7713911e2a1ec14bc75bd4b3)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2; ## Support i8x4 instructions
3; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 \
4; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
5; RUN: | FileCheck -allow-deprecated-dag-overlap %s
6; RUN: %if ptxas %{                                                           \
7; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 \
8; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
9; RUN:   | %ptxas-verify -arch=sm_90                                          \
10; RUN: %}
11
12target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
13
; Returning a constant <4 x i8>: the checks expect the whole vector to be
; materialized as one 32-bit immediate (-66911489 == 0xFC0302FF, i.e. bytes
; -1, 2, 3, -4 from the low byte upwards) and stored with one st.param.b32.
14define <4 x i8> @test_ret_const() #0 {
15; CHECK-LABEL: test_ret_const(
16; CHECK:       {
17; CHECK-NEXT:    .reg .b32 %r<2>;
18; CHECK-EMPTY:
19; CHECK-NEXT:  // %bb.0:
20; CHECK-NEXT:    mov.b32 %r1, -66911489;
21; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
22; CHECK-NEXT:    ret;
23  ret <4 x i8> <i8 -1, i8 2, i8 3, i8 -4>
24}
25
; extractelement with constant index 0: the checks expect a single bfe.u32 on
; the 32-bit packed parameter at bit offset 0, width 8.
26define i8 @test_extract_0(<4 x i8> %a) #0 {
27; CHECK-LABEL: test_extract_0(
28; CHECK:       {
29; CHECK-NEXT:    .reg .b32 %r<3>;
30; CHECK-EMPTY:
31; CHECK-NEXT:  // %bb.0:
32; CHECK-NEXT:    ld.param.u32 %r1, [test_extract_0_param_0];
33; CHECK-NEXT:    bfe.u32 %r2, %r1, 0, 8;
34; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
35; CHECK-NEXT:    ret;
36  %e = extractelement <4 x i8> %a, i32 0
37  ret i8 %e
38}
39
; extractelement with constant index 1: the checks expect a single bfe.u32 on
; the 32-bit packed parameter at bit offset 8, width 8.
40define i8 @test_extract_1(<4 x i8> %a) #0 {
41; CHECK-LABEL: test_extract_1(
42; CHECK:       {
43; CHECK-NEXT:    .reg .b32 %r<3>;
44; CHECK-EMPTY:
45; CHECK-NEXT:  // %bb.0:
46; CHECK-NEXT:    ld.param.u32 %r1, [test_extract_1_param_0];
47; CHECK-NEXT:    bfe.u32 %r2, %r1, 8, 8;
48; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
49; CHECK-NEXT:    ret;
50  %e = extractelement <4 x i8> %a, i32 1
51  ret i8 %e
52}
53
; extractelement with constant index 2: the checks expect a single bfe.u32 on
; the 32-bit packed parameter at bit offset 16, width 8.
54define i8 @test_extract_2(<4 x i8> %a) #0 {
55; CHECK-LABEL: test_extract_2(
56; CHECK:       {
57; CHECK-NEXT:    .reg .b32 %r<3>;
58; CHECK-EMPTY:
59; CHECK-NEXT:  // %bb.0:
60; CHECK-NEXT:    ld.param.u32 %r1, [test_extract_2_param_0];
61; CHECK-NEXT:    bfe.u32 %r2, %r1, 16, 8;
62; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
63; CHECK-NEXT:    ret;
64  %e = extractelement <4 x i8> %a, i32 2
65  ret i8 %e
66}
67
; extractelement with constant index 3: the checks expect a single bfe.u32 on
; the 32-bit packed parameter at bit offset 24, width 8.
68define i8 @test_extract_3(<4 x i8> %a) #0 {
69; CHECK-LABEL: test_extract_3(
70; CHECK:       {
71; CHECK-NEXT:    .reg .b32 %r<3>;
72; CHECK-EMPTY:
73; CHECK-NEXT:  // %bb.0:
74; CHECK-NEXT:    ld.param.u32 %r1, [test_extract_3_param_0];
75; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
76; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
77; CHECK-NEXT:    ret;
78  %e = extractelement <4 x i8> %a, i32 3
79  ret i8 %e
80}
81
; extractelement with a runtime i64 index: the checks expect the index to be
; narrowed to 32 bits (cvt.u32.u64), shifted left by 3 (index * 8) and then
; used as the bit offset of an 8-bit-wide bfe.u32.
82define i8 @test_extract_i(<4 x i8> %a, i64 %idx) #0 {
83; CHECK-LABEL: test_extract_i(
84; CHECK:       {
85; CHECK-NEXT:    .reg .b32 %r<5>;
86; CHECK-NEXT:    .reg .b64 %rd<2>;
87; CHECK-EMPTY:
88; CHECK-NEXT:  // %bb.0:
89; CHECK-NEXT:    ld.param.u64 %rd1, [test_extract_i_param_1];
90; CHECK-NEXT:    ld.param.u32 %r1, [test_extract_i_param_0];
91; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
92; CHECK-NEXT:    shl.b32 %r3, %r2, 3;
93; CHECK-NEXT:    bfe.u32 %r4, %r1, %r3, 8;
94; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
95; CHECK-NEXT:    ret;
96  %e = extractelement <4 x i8> %a, i64 %idx
97  ret i8 %e
98}
99
; <4 x i8> add: the checks expect a per-lane expansion rather than one packed
; operation. Each lane of both operands is extracted with bfe.u32, narrowed to
; 16 bits, added with add.s16, widened back to 32 bits, and the four results
; are recombined with three prmt.b32 instructions.
100define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 {
101; CHECK-LABEL: test_add(
102; CHECK:       {
103; CHECK-NEXT:    .reg .b16 %rs<13>;
104; CHECK-NEXT:    .reg .b32 %r<18>;
105; CHECK-EMPTY:
106; CHECK-NEXT:  // %bb.0:
107; CHECK-NEXT:    ld.param.u32 %r2, [test_add_param_1];
108; CHECK-NEXT:    ld.param.u32 %r1, [test_add_param_0];
109; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
110; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
111; CHECK-NEXT:    bfe.u32 %r4, %r1, 24, 8;
112; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
113; CHECK-NEXT:    add.s16 %rs3, %rs2, %rs1;
114; CHECK-NEXT:    cvt.u32.u16 %r5, %rs3;
115; CHECK-NEXT:    bfe.u32 %r6, %r2, 16, 8;
116; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
117; CHECK-NEXT:    bfe.u32 %r7, %r1, 16, 8;
118; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
119; CHECK-NEXT:    add.s16 %rs6, %rs5, %rs4;
120; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
121; CHECK-NEXT:    prmt.b32 %r9, %r8, %r5, 0x3340U;
122; CHECK-NEXT:    bfe.u32 %r10, %r2, 8, 8;
123; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
124; CHECK-NEXT:    bfe.u32 %r11, %r1, 8, 8;
125; CHECK-NEXT:    cvt.u16.u32 %rs8, %r11;
126; CHECK-NEXT:    add.s16 %rs9, %rs8, %rs7;
127; CHECK-NEXT:    cvt.u32.u16 %r12, %rs9;
128; CHECK-NEXT:    bfe.u32 %r13, %r2, 0, 8;
129; CHECK-NEXT:    cvt.u16.u32 %rs10, %r13;
130; CHECK-NEXT:    bfe.u32 %r14, %r1, 0, 8;
131; CHECK-NEXT:    cvt.u16.u32 %rs11, %r14;
132; CHECK-NEXT:    add.s16 %rs12, %rs11, %rs10;
133; CHECK-NEXT:    cvt.u32.u16 %r15, %rs12;
134; CHECK-NEXT:    prmt.b32 %r16, %r15, %r12, 0x3340U;
135; CHECK-NEXT:    prmt.b32 %r17, %r16, %r9, 0x5410U;
136; CHECK-NEXT:    st.param.b32 [func_retval0], %r17;
137; CHECK-NEXT:    ret;
138  %r = add <4 x i8> %a, %b
139  ret <4 x i8> %r
140}
141
; <4 x i8> add with the constant vector <1, 2, 3, 4> as the left-hand operand.
; The checks expect a per-lane add.s16 in which the lane constants appear as
; immediates (4, 3, 2, 1 in emission order, i.e. highest lane first), followed
; by repacking with prmt.b32.
142define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 {
143; CHECK-LABEL: test_add_imm_0(
144; CHECK:       {
145; CHECK-NEXT:    .reg .b16 %rs<9>;
146; CHECK-NEXT:    .reg .b32 %r<13>;
147; CHECK-EMPTY:
148; CHECK-NEXT:  // %bb.0:
149; CHECK-NEXT:    ld.param.u32 %r1, [test_add_imm_0_param_0];
150; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
151; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
152; CHECK-NEXT:    add.s16 %rs2, %rs1, 4;
153; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
154; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
155; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
156; CHECK-NEXT:    add.s16 %rs4, %rs3, 3;
157; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
158; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
159; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
160; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
161; CHECK-NEXT:    add.s16 %rs6, %rs5, 2;
162; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
163; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
164; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
165; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
166; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
167; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
168; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
169; CHECK-NEXT:    st.param.b32 [func_retval0], %r12;
170; CHECK-NEXT:    ret;
171  %r = add <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
172  ret <4 x i8> %r
173}
174
; <4 x i8> add with the constant vector <1, 2, 3, 4> as the right-hand operand.
; The checks expect a per-lane add.s16 with the lane constants as immediates
; (4, 3, 2, 1 in emission order, i.e. highest lane first), followed by
; repacking with prmt.b32.
175define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 {
176; CHECK-LABEL: test_add_imm_1(
177; CHECK:       {
178; CHECK-NEXT:    .reg .b16 %rs<9>;
179; CHECK-NEXT:    .reg .b32 %r<13>;
180; CHECK-EMPTY:
181; CHECK-NEXT:  // %bb.0:
182; CHECK-NEXT:    ld.param.u32 %r1, [test_add_imm_1_param_0];
183; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
184; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
185; CHECK-NEXT:    add.s16 %rs2, %rs1, 4;
186; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
187; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
188; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
189; CHECK-NEXT:    add.s16 %rs4, %rs3, 3;
190; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
191; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
192; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
193; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
194; CHECK-NEXT:    add.s16 %rs6, %rs5, 2;
195; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
196; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
197; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
198; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
199; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
200; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
201; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
202; CHECK-NEXT:    st.param.b32 [func_retval0], %r12;
203; CHECK-NEXT:    ret;
204  %r = add <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
205  ret <4 x i8> %r
206}
207
; <4 x i8> sub: the checks expect a per-lane expansion. Each lane of both
; operands is extracted with bfe.u32, narrowed to 16 bits, subtracted with
; sub.s16 (%a lane minus %b lane), widened back to 32 bits, and the four
; results are recombined with three prmt.b32 instructions.
208define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 {
209; CHECK-LABEL: test_sub(
210; CHECK:       {
211; CHECK-NEXT:    .reg .b16 %rs<13>;
212; CHECK-NEXT:    .reg .b32 %r<18>;
213; CHECK-EMPTY:
214; CHECK-NEXT:  // %bb.0:
215; CHECK-NEXT:    ld.param.u32 %r2, [test_sub_param_1];
216; CHECK-NEXT:    ld.param.u32 %r1, [test_sub_param_0];
217; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
218; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
219; CHECK-NEXT:    bfe.u32 %r4, %r1, 24, 8;
220; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
221; CHECK-NEXT:    sub.s16 %rs3, %rs2, %rs1;
222; CHECK-NEXT:    cvt.u32.u16 %r5, %rs3;
223; CHECK-NEXT:    bfe.u32 %r6, %r2, 16, 8;
224; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
225; CHECK-NEXT:    bfe.u32 %r7, %r1, 16, 8;
226; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
227; CHECK-NEXT:    sub.s16 %rs6, %rs5, %rs4;
228; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
229; CHECK-NEXT:    prmt.b32 %r9, %r8, %r5, 0x3340U;
230; CHECK-NEXT:    bfe.u32 %r10, %r2, 8, 8;
231; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
232; CHECK-NEXT:    bfe.u32 %r11, %r1, 8, 8;
233; CHECK-NEXT:    cvt.u16.u32 %rs8, %r11;
234; CHECK-NEXT:    sub.s16 %rs9, %rs8, %rs7;
235; CHECK-NEXT:    cvt.u32.u16 %r12, %rs9;
236; CHECK-NEXT:    bfe.u32 %r13, %r2, 0, 8;
237; CHECK-NEXT:    cvt.u16.u32 %rs10, %r13;
238; CHECK-NEXT:    bfe.u32 %r14, %r1, 0, 8;
239; CHECK-NEXT:    cvt.u16.u32 %rs11, %r14;
240; CHECK-NEXT:    sub.s16 %rs12, %rs11, %rs10;
241; CHECK-NEXT:    cvt.u32.u16 %r15, %rs12;
242; CHECK-NEXT:    prmt.b32 %r16, %r15, %r12, 0x3340U;
243; CHECK-NEXT:    prmt.b32 %r17, %r16, %r9, 0x5410U;
244; CHECK-NEXT:    st.param.b32 [func_retval0], %r17;
245; CHECK-NEXT:    ret;
246  %r = sub <4 x i8> %a, %b
247  ret <4 x i8> %r
248}
249
; Signed max written as icmp sgt + select. The checks expect the lanes to be
; sign-extended with bfe.s32 for the four setp.gt.s32 compares, extracted again
; with bfe.u32 for the selp.b32 selects, and the selected lanes to be repacked
; with prmt.b32.
250define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
251; CHECK-LABEL: test_smax(
252; CHECK:       {
253; CHECK-NEXT:    .reg .pred %p<5>;
254; CHECK-NEXT:    .reg .b32 %r<26>;
255; CHECK-EMPTY:
256; CHECK-NEXT:  // %bb.0:
257; CHECK-NEXT:    ld.param.u32 %r2, [test_smax_param_1];
258; CHECK-NEXT:    ld.param.u32 %r1, [test_smax_param_0];
259; CHECK-NEXT:    bfe.s32 %r3, %r2, 0, 8;
260; CHECK-NEXT:    bfe.s32 %r4, %r1, 0, 8;
261; CHECK-NEXT:    setp.gt.s32 %p1, %r4, %r3;
262; CHECK-NEXT:    bfe.s32 %r5, %r2, 8, 8;
263; CHECK-NEXT:    bfe.s32 %r6, %r1, 8, 8;
264; CHECK-NEXT:    setp.gt.s32 %p2, %r6, %r5;
265; CHECK-NEXT:    bfe.s32 %r7, %r2, 16, 8;
266; CHECK-NEXT:    bfe.s32 %r8, %r1, 16, 8;
267; CHECK-NEXT:    setp.gt.s32 %p3, %r8, %r7;
268; CHECK-NEXT:    bfe.s32 %r9, %r2, 24, 8;
269; CHECK-NEXT:    bfe.s32 %r10, %r1, 24, 8;
270; CHECK-NEXT:    setp.gt.s32 %p4, %r10, %r9;
271; CHECK-NEXT:    bfe.u32 %r11, %r1, 0, 8;
272; CHECK-NEXT:    bfe.u32 %r12, %r1, 8, 8;
273; CHECK-NEXT:    bfe.u32 %r13, %r1, 16, 8;
274; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
275; CHECK-NEXT:    bfe.u32 %r15, %r2, 24, 8;
276; CHECK-NEXT:    selp.b32 %r16, %r14, %r15, %p4;
277; CHECK-NEXT:    bfe.u32 %r17, %r2, 16, 8;
278; CHECK-NEXT:    selp.b32 %r18, %r13, %r17, %p3;
279; CHECK-NEXT:    prmt.b32 %r19, %r18, %r16, 0x3340U;
280; CHECK-NEXT:    bfe.u32 %r20, %r2, 8, 8;
281; CHECK-NEXT:    selp.b32 %r21, %r12, %r20, %p2;
282; CHECK-NEXT:    bfe.u32 %r22, %r2, 0, 8;
283; CHECK-NEXT:    selp.b32 %r23, %r11, %r22, %p1;
284; CHECK-NEXT:    prmt.b32 %r24, %r23, %r21, 0x3340U;
285; CHECK-NEXT:    prmt.b32 %r25, %r24, %r19, 0x5410U;
286; CHECK-NEXT:    st.param.b32 [func_retval0], %r25;
287; CHECK-NEXT:    ret;
288  %cmp = icmp sgt <4 x i8> %a, %b
289  %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
290  ret <4 x i8> %r
291}
292
; Unsigned max written as icmp ugt + select. The checks expect each lane to be
; extracted once with bfe.u32; the same registers feed both the setp.hi.u32
; compares and the selp.b32 selects, and the results are repacked with
; prmt.b32.
293define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 {
294; CHECK-LABEL: test_umax(
295; CHECK:       {
296; CHECK-NEXT:    .reg .pred %p<5>;
297; CHECK-NEXT:    .reg .b32 %r<18>;
298; CHECK-EMPTY:
299; CHECK-NEXT:  // %bb.0:
300; CHECK-NEXT:    ld.param.u32 %r2, [test_umax_param_1];
301; CHECK-NEXT:    ld.param.u32 %r1, [test_umax_param_0];
302; CHECK-NEXT:    bfe.u32 %r3, %r2, 0, 8;
303; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
304; CHECK-NEXT:    setp.hi.u32 %p1, %r4, %r3;
305; CHECK-NEXT:    bfe.u32 %r5, %r2, 8, 8;
306; CHECK-NEXT:    bfe.u32 %r6, %r1, 8, 8;
307; CHECK-NEXT:    setp.hi.u32 %p2, %r6, %r5;
308; CHECK-NEXT:    bfe.u32 %r7, %r2, 16, 8;
309; CHECK-NEXT:    bfe.u32 %r8, %r1, 16, 8;
310; CHECK-NEXT:    setp.hi.u32 %p3, %r8, %r7;
311; CHECK-NEXT:    bfe.u32 %r9, %r2, 24, 8;
312; CHECK-NEXT:    bfe.u32 %r10, %r1, 24, 8;
313; CHECK-NEXT:    setp.hi.u32 %p4, %r10, %r9;
314; CHECK-NEXT:    selp.b32 %r11, %r10, %r9, %p4;
315; CHECK-NEXT:    selp.b32 %r12, %r8, %r7, %p3;
316; CHECK-NEXT:    prmt.b32 %r13, %r12, %r11, 0x3340U;
317; CHECK-NEXT:    selp.b32 %r14, %r6, %r5, %p2;
318; CHECK-NEXT:    selp.b32 %r15, %r4, %r3, %p1;
319; CHECK-NEXT:    prmt.b32 %r16, %r15, %r14, 0x3340U;
320; CHECK-NEXT:    prmt.b32 %r17, %r16, %r13, 0x5410U;
321; CHECK-NEXT:    st.param.b32 [func_retval0], %r17;
322; CHECK-NEXT:    ret;
323  %cmp = icmp ugt <4 x i8> %a, %b
324  %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
325  ret <4 x i8> %r
326}
327
; Signed min written as icmp sle + select. The checks expect the lanes to be
; sign-extended with bfe.s32 for the four setp.le.s32 compares, extracted again
; with bfe.u32 for the selp.b32 selects, and the selected lanes to be repacked
; with prmt.b32.
328define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
329; CHECK-LABEL: test_smin(
330; CHECK:       {
331; CHECK-NEXT:    .reg .pred %p<5>;
332; CHECK-NEXT:    .reg .b32 %r<26>;
333; CHECK-EMPTY:
334; CHECK-NEXT:  // %bb.0:
335; CHECK-NEXT:    ld.param.u32 %r2, [test_smin_param_1];
336; CHECK-NEXT:    ld.param.u32 %r1, [test_smin_param_0];
337; CHECK-NEXT:    bfe.s32 %r3, %r2, 0, 8;
338; CHECK-NEXT:    bfe.s32 %r4, %r1, 0, 8;
339; CHECK-NEXT:    setp.le.s32 %p1, %r4, %r3;
340; CHECK-NEXT:    bfe.s32 %r5, %r2, 8, 8;
341; CHECK-NEXT:    bfe.s32 %r6, %r1, 8, 8;
342; CHECK-NEXT:    setp.le.s32 %p2, %r6, %r5;
343; CHECK-NEXT:    bfe.s32 %r7, %r2, 16, 8;
344; CHECK-NEXT:    bfe.s32 %r8, %r1, 16, 8;
345; CHECK-NEXT:    setp.le.s32 %p3, %r8, %r7;
346; CHECK-NEXT:    bfe.s32 %r9, %r2, 24, 8;
347; CHECK-NEXT:    bfe.s32 %r10, %r1, 24, 8;
348; CHECK-NEXT:    setp.le.s32 %p4, %r10, %r9;
349; CHECK-NEXT:    bfe.u32 %r11, %r1, 0, 8;
350; CHECK-NEXT:    bfe.u32 %r12, %r1, 8, 8;
351; CHECK-NEXT:    bfe.u32 %r13, %r1, 16, 8;
352; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
353; CHECK-NEXT:    bfe.u32 %r15, %r2, 24, 8;
354; CHECK-NEXT:    selp.b32 %r16, %r14, %r15, %p4;
355; CHECK-NEXT:    bfe.u32 %r17, %r2, 16, 8;
356; CHECK-NEXT:    selp.b32 %r18, %r13, %r17, %p3;
357; CHECK-NEXT:    prmt.b32 %r19, %r18, %r16, 0x3340U;
358; CHECK-NEXT:    bfe.u32 %r20, %r2, 8, 8;
359; CHECK-NEXT:    selp.b32 %r21, %r12, %r20, %p2;
360; CHECK-NEXT:    bfe.u32 %r22, %r2, 0, 8;
361; CHECK-NEXT:    selp.b32 %r23, %r11, %r22, %p1;
362; CHECK-NEXT:    prmt.b32 %r24, %r23, %r21, 0x3340U;
363; CHECK-NEXT:    prmt.b32 %r25, %r24, %r19, 0x5410U;
364; CHECK-NEXT:    st.param.b32 [func_retval0], %r25;
365; CHECK-NEXT:    ret;
366  %cmp = icmp sle <4 x i8> %a, %b
367  %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
368  ret <4 x i8> %r
369}
370
; Unsigned min written as icmp ule + select. The checks expect each lane to be
; extracted once with bfe.u32; the same registers feed both the setp.ls.u32
; compares and the selp.b32 selects, and the results are repacked with
; prmt.b32.
371define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 {
372; CHECK-LABEL: test_umin(
373; CHECK:       {
374; CHECK-NEXT:    .reg .pred %p<5>;
375; CHECK-NEXT:    .reg .b32 %r<18>;
376; CHECK-EMPTY:
377; CHECK-NEXT:  // %bb.0:
378; CHECK-NEXT:    ld.param.u32 %r2, [test_umin_param_1];
379; CHECK-NEXT:    ld.param.u32 %r1, [test_umin_param_0];
380; CHECK-NEXT:    bfe.u32 %r3, %r2, 0, 8;
381; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
382; CHECK-NEXT:    setp.ls.u32 %p1, %r4, %r3;
383; CHECK-NEXT:    bfe.u32 %r5, %r2, 8, 8;
384; CHECK-NEXT:    bfe.u32 %r6, %r1, 8, 8;
385; CHECK-NEXT:    setp.ls.u32 %p2, %r6, %r5;
386; CHECK-NEXT:    bfe.u32 %r7, %r2, 16, 8;
387; CHECK-NEXT:    bfe.u32 %r8, %r1, 16, 8;
388; CHECK-NEXT:    setp.ls.u32 %p3, %r8, %r7;
389; CHECK-NEXT:    bfe.u32 %r9, %r2, 24, 8;
390; CHECK-NEXT:    bfe.u32 %r10, %r1, 24, 8;
391; CHECK-NEXT:    setp.ls.u32 %p4, %r10, %r9;
392; CHECK-NEXT:    selp.b32 %r11, %r10, %r9, %p4;
393; CHECK-NEXT:    selp.b32 %r12, %r8, %r7, %p3;
394; CHECK-NEXT:    prmt.b32 %r13, %r12, %r11, 0x3340U;
395; CHECK-NEXT:    selp.b32 %r14, %r6, %r5, %p2;
396; CHECK-NEXT:    selp.b32 %r15, %r4, %r3, %p1;
397; CHECK-NEXT:    prmt.b32 %r16, %r15, %r14, 0x3340U;
398; CHECK-NEXT:    prmt.b32 %r17, %r16, %r13, 0x5410U;
399; CHECK-NEXT:    st.param.b32 [func_retval0], %r17;
400; CHECK-NEXT:    ret;
401  %cmp = icmp ule <4 x i8> %a, %b
402  %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c
403  ret <4 x i8> %r
404}
405
; icmp eq on %a and %b, then a per-lane select between %a and a third vector
; %c. The checks expect bfe.u32 lane extraction, four setp.eq.u32 compares,
; selp.b32 choosing the %a lane or the %c lane, and repacking with prmt.b32.
406define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
407; CHECK-LABEL: test_eq(
408; CHECK:       {
409; CHECK-NEXT:    .reg .pred %p<5>;
410; CHECK-NEXT:    .reg .b32 %r<23>;
411; CHECK-EMPTY:
412; CHECK-NEXT:  // %bb.0:
413; CHECK-NEXT:    ld.param.u32 %r3, [test_eq_param_2];
414; CHECK-NEXT:    ld.param.u32 %r2, [test_eq_param_1];
415; CHECK-NEXT:    ld.param.u32 %r1, [test_eq_param_0];
416; CHECK-NEXT:    bfe.u32 %r4, %r2, 0, 8;
417; CHECK-NEXT:    bfe.u32 %r5, %r1, 0, 8;
418; CHECK-NEXT:    setp.eq.u32 %p1, %r5, %r4;
419; CHECK-NEXT:    bfe.u32 %r6, %r2, 8, 8;
420; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
421; CHECK-NEXT:    setp.eq.u32 %p2, %r7, %r6;
422; CHECK-NEXT:    bfe.u32 %r8, %r2, 16, 8;
423; CHECK-NEXT:    bfe.u32 %r9, %r1, 16, 8;
424; CHECK-NEXT:    setp.eq.u32 %p3, %r9, %r8;
425; CHECK-NEXT:    bfe.u32 %r10, %r2, 24, 8;
426; CHECK-NEXT:    bfe.u32 %r11, %r1, 24, 8;
427; CHECK-NEXT:    setp.eq.u32 %p4, %r11, %r10;
428; CHECK-NEXT:    bfe.u32 %r12, %r3, 24, 8;
429; CHECK-NEXT:    selp.b32 %r13, %r11, %r12, %p4;
430; CHECK-NEXT:    bfe.u32 %r14, %r3, 16, 8;
431; CHECK-NEXT:    selp.b32 %r15, %r9, %r14, %p3;
432; CHECK-NEXT:    prmt.b32 %r16, %r15, %r13, 0x3340U;
433; CHECK-NEXT:    bfe.u32 %r17, %r3, 8, 8;
434; CHECK-NEXT:    selp.b32 %r18, %r7, %r17, %p2;
435; CHECK-NEXT:    bfe.u32 %r19, %r3, 0, 8;
436; CHECK-NEXT:    selp.b32 %r20, %r5, %r19, %p1;
437; CHECK-NEXT:    prmt.b32 %r21, %r20, %r18, 0x3340U;
438; CHECK-NEXT:    prmt.b32 %r22, %r21, %r16, 0x5410U;
439; CHECK-NEXT:    st.param.b32 [func_retval0], %r22;
440; CHECK-NEXT:    ret;
441  %cmp = icmp eq <4 x i8> %a, %b
442  %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c
443  ret <4 x i8> %r
444}
445
; icmp ne on %a and %b, then a per-lane select between %a and a third vector
; %c. The checks expect bfe.u32 lane extraction, four setp.ne.u32 compares,
; selp.b32 choosing the %a lane or the %c lane, and repacking with prmt.b32.
446define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
447; CHECK-LABEL: test_ne(
448; CHECK:       {
449; CHECK-NEXT:    .reg .pred %p<5>;
450; CHECK-NEXT:    .reg .b32 %r<23>;
451; CHECK-EMPTY:
452; CHECK-NEXT:  // %bb.0:
453; CHECK-NEXT:    ld.param.u32 %r3, [test_ne_param_2];
454; CHECK-NEXT:    ld.param.u32 %r2, [test_ne_param_1];
455; CHECK-NEXT:    ld.param.u32 %r1, [test_ne_param_0];
456; CHECK-NEXT:    bfe.u32 %r4, %r2, 0, 8;
457; CHECK-NEXT:    bfe.u32 %r5, %r1, 0, 8;
458; CHECK-NEXT:    setp.ne.u32 %p1, %r5, %r4;
459; CHECK-NEXT:    bfe.u32 %r6, %r2, 8, 8;
460; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
461; CHECK-NEXT:    setp.ne.u32 %p2, %r7, %r6;
462; CHECK-NEXT:    bfe.u32 %r8, %r2, 16, 8;
463; CHECK-NEXT:    bfe.u32 %r9, %r1, 16, 8;
464; CHECK-NEXT:    setp.ne.u32 %p3, %r9, %r8;
465; CHECK-NEXT:    bfe.u32 %r10, %r2, 24, 8;
466; CHECK-NEXT:    bfe.u32 %r11, %r1, 24, 8;
467; CHECK-NEXT:    setp.ne.u32 %p4, %r11, %r10;
468; CHECK-NEXT:    bfe.u32 %r12, %r3, 24, 8;
469; CHECK-NEXT:    selp.b32 %r13, %r11, %r12, %p4;
470; CHECK-NEXT:    bfe.u32 %r14, %r3, 16, 8;
471; CHECK-NEXT:    selp.b32 %r15, %r9, %r14, %p3;
472; CHECK-NEXT:    prmt.b32 %r16, %r15, %r13, 0x3340U;
473; CHECK-NEXT:    bfe.u32 %r17, %r3, 8, 8;
474; CHECK-NEXT:    selp.b32 %r18, %r7, %r17, %p2;
475; CHECK-NEXT:    bfe.u32 %r19, %r3, 0, 8;
476; CHECK-NEXT:    selp.b32 %r20, %r5, %r19, %p1;
477; CHECK-NEXT:    prmt.b32 %r21, %r20, %r18, 0x3340U;
478; CHECK-NEXT:    prmt.b32 %r22, %r21, %r16, 0x5410U;
479; CHECK-NEXT:    st.param.b32 [func_retval0], %r22;
480; CHECK-NEXT:    ret;
481  %cmp = icmp ne <4 x i8> %a, %b
482  %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c
483  ret <4 x i8> %r
484}
485
; <4 x i8> mul: the checks expect a per-lane expansion. Each lane of both
; operands is extracted with bfe.u32, narrowed to 16 bits, multiplied with
; mul.lo.s16, widened back to 32 bits, and the four results are recombined
; with three prmt.b32 instructions.
486define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 {
487; CHECK-LABEL: test_mul(
488; CHECK:       {
489; CHECK-NEXT:    .reg .b16 %rs<13>;
490; CHECK-NEXT:    .reg .b32 %r<18>;
491; CHECK-EMPTY:
492; CHECK-NEXT:  // %bb.0:
493; CHECK-NEXT:    ld.param.u32 %r2, [test_mul_param_1];
494; CHECK-NEXT:    ld.param.u32 %r1, [test_mul_param_0];
495; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
496; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
497; CHECK-NEXT:    bfe.u32 %r4, %r1, 24, 8;
498; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
499; CHECK-NEXT:    mul.lo.s16 %rs3, %rs2, %rs1;
500; CHECK-NEXT:    cvt.u32.u16 %r5, %rs3;
501; CHECK-NEXT:    bfe.u32 %r6, %r2, 16, 8;
502; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
503; CHECK-NEXT:    bfe.u32 %r7, %r1, 16, 8;
504; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
505; CHECK-NEXT:    mul.lo.s16 %rs6, %rs5, %rs4;
506; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
507; CHECK-NEXT:    prmt.b32 %r9, %r8, %r5, 0x3340U;
508; CHECK-NEXT:    bfe.u32 %r10, %r2, 8, 8;
509; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
510; CHECK-NEXT:    bfe.u32 %r11, %r1, 8, 8;
511; CHECK-NEXT:    cvt.u16.u32 %rs8, %r11;
512; CHECK-NEXT:    mul.lo.s16 %rs9, %rs8, %rs7;
513; CHECK-NEXT:    cvt.u32.u16 %r12, %rs9;
514; CHECK-NEXT:    bfe.u32 %r13, %r2, 0, 8;
515; CHECK-NEXT:    cvt.u16.u32 %rs10, %r13;
516; CHECK-NEXT:    bfe.u32 %r14, %r1, 0, 8;
517; CHECK-NEXT:    cvt.u16.u32 %rs11, %r14;
518; CHECK-NEXT:    mul.lo.s16 %rs12, %rs11, %rs10;
519; CHECK-NEXT:    cvt.u32.u16 %r15, %rs12;
520; CHECK-NEXT:    prmt.b32 %r16, %r15, %r12, 0x3340U;
521; CHECK-NEXT:    prmt.b32 %r17, %r16, %r9, 0x5410U;
522; CHECK-NEXT:    st.param.b32 [func_retval0], %r17;
523; CHECK-NEXT:    ret;
524  %r = mul <4 x i8> %a, %b
525  ret <4 x i8> %r
526}
527
; <4 x i8> or of two parameters: the checks expect a single or.b32 on the two
; packed 32-bit values, with no per-lane unpacking.
528define <4 x i8> @test_or(<4 x i8> %a, <4 x i8> %b) #0 {
529; CHECK-LABEL: test_or(
530; CHECK:       {
531; CHECK-NEXT:    .reg .b32 %r<4>;
532; CHECK-EMPTY:
533; CHECK-NEXT:  // %bb.0:
534; CHECK-NEXT:    ld.param.u32 %r2, [test_or_param_1];
535; CHECK-NEXT:    ld.param.u32 %r1, [test_or_param_0];
536; CHECK-NEXT:    or.b32 %r3, %r1, %r2;
537; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
538; CHECK-NEXT:    ret;
539  %r = or <4 x i8> %a, %b
540  ret <4 x i8> %r
541}
542
; or of two vectors built with insertelement: %ins.0 is zeroinitializer with %a
; in lane 0, and %ins.1 is %ins.0 with the constant 5 in lane 1. The checks
; expect the vectors to be assembled with prmt.b32 / bfi.b32 and then combined
; by a single or.b32.
543define <4 x i8> @test_or_computed(i8 %a) {
544; CHECK-LABEL: test_or_computed(
545; CHECK:       {
546; CHECK-NEXT:    .reg .b16 %rs<2>;
547; CHECK-NEXT:    .reg .b32 %r<8>;
548; CHECK-EMPTY:
549; CHECK-NEXT:  // %bb.0:
550; CHECK-NEXT:    ld.param.u8 %rs1, [test_or_computed_param_0];
551; CHECK-NEXT:    mov.b32 %r1, 0;
552; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x3340U;
553; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
554; CHECK-NEXT:    prmt.b32 %r4, %r3, 0, 0x3340U;
555; CHECK-NEXT:    prmt.b32 %r5, %r4, %r2, 0x5410U;
556; CHECK-NEXT:    bfi.b32 %r6, 5, %r5, 8, 8;
557; CHECK-NEXT:    or.b32 %r7, %r6, %r5;
558; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
559; CHECK-NEXT:    ret;
560  %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0
561  %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1
562  %r = or <4 x i8> %ins.1, %ins.0
563  ret <4 x i8> %r
564}
565
; or with the constant vector <1, 2, 3, 4> as the left-hand operand: the checks
; expect the constant to appear as one 32-bit immediate (67305985 ==
; 0x04030201) in a single or.b32.
566define <4 x i8> @test_or_imm_0(<4 x i8> %a) #0 {
567; CHECK-LABEL: test_or_imm_0(
568; CHECK:       {
569; CHECK-NEXT:    .reg .b32 %r<3>;
570; CHECK-EMPTY:
571; CHECK-NEXT:  // %bb.0:
572; CHECK-NEXT:    ld.param.u32 %r1, [test_or_imm_0_param_0];
573; CHECK-NEXT:    or.b32 %r2, %r1, 67305985;
574; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
575; CHECK-NEXT:    ret;
576  %r = or <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
577  ret <4 x i8> %r
578}
579
; or with the constant vector <1, 2, 3, 4> as the right-hand operand: the
; checks expect the constant to appear as one 32-bit immediate (67305985 ==
; 0x04030201) in a single or.b32.
580define <4 x i8> @test_or_imm_1(<4 x i8> %a) #0 {
581; CHECK-LABEL: test_or_imm_1(
582; CHECK:       {
583; CHECK-NEXT:    .reg .b32 %r<3>;
584; CHECK-EMPTY:
585; CHECK-NEXT:  // %bb.0:
586; CHECK-NEXT:    ld.param.u32 %r1, [test_or_imm_1_param_0];
587; CHECK-NEXT:    or.b32 %r2, %r1, 67305985;
588; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
589; CHECK-NEXT:    ret;
590  %r = or <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
591  ret <4 x i8> %r
592}
593
; <4 x i8> xor of two parameters: the checks expect a single xor.b32 on the two
; packed 32-bit values, with no per-lane unpacking.
594define <4 x i8> @test_xor(<4 x i8> %a, <4 x i8> %b) #0 {
595; CHECK-LABEL: test_xor(
596; CHECK:       {
597; CHECK-NEXT:    .reg .b32 %r<4>;
598; CHECK-EMPTY:
599; CHECK-NEXT:  // %bb.0:
600; CHECK-NEXT:    ld.param.u32 %r2, [test_xor_param_1];
601; CHECK-NEXT:    ld.param.u32 %r1, [test_xor_param_0];
602; CHECK-NEXT:    xor.b32 %r3, %r1, %r2;
603; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
604; CHECK-NEXT:    ret;
605  %r = xor <4 x i8> %a, %b
606  ret <4 x i8> %r
607}
608
; xor of two vectors built with insertelement: %ins.0 is zeroinitializer with
; %a in lane 0, and %ins.1 is %ins.0 with the constant 5 in lane 1. The checks
; expect the vectors to be assembled with prmt.b32 / bfi.b32 and then combined
; by a single xor.b32.
609define <4 x i8> @test_xor_computed(i8 %a) {
610; CHECK-LABEL: test_xor_computed(
611; CHECK:       {
612; CHECK-NEXT:    .reg .b16 %rs<2>;
613; CHECK-NEXT:    .reg .b32 %r<8>;
614; CHECK-EMPTY:
615; CHECK-NEXT:  // %bb.0:
616; CHECK-NEXT:    ld.param.u8 %rs1, [test_xor_computed_param_0];
617; CHECK-NEXT:    mov.b32 %r1, 0;
618; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x3340U;
619; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
620; CHECK-NEXT:    prmt.b32 %r4, %r3, 0, 0x3340U;
621; CHECK-NEXT:    prmt.b32 %r5, %r4, %r2, 0x5410U;
622; CHECK-NEXT:    bfi.b32 %r6, 5, %r5, 8, 8;
623; CHECK-NEXT:    xor.b32 %r7, %r6, %r5;
624; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
625; CHECK-NEXT:    ret;
626  %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0
627  %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1
628  %r = xor <4 x i8> %ins.1, %ins.0
629  ret <4 x i8> %r
630}
631
; xor with the constant vector <1, 2, 3, 4> as the left-hand operand: the
; checks expect the constant to appear as one 32-bit immediate (67305985 ==
; 0x04030201) in a single xor.b32.
632define <4 x i8> @test_xor_imm_0(<4 x i8> %a) #0 {
633; CHECK-LABEL: test_xor_imm_0(
634; CHECK:       {
635; CHECK-NEXT:    .reg .b32 %r<3>;
636; CHECK-EMPTY:
637; CHECK-NEXT:  // %bb.0:
638; CHECK-NEXT:    ld.param.u32 %r1, [test_xor_imm_0_param_0];
639; CHECK-NEXT:    xor.b32 %r2, %r1, 67305985;
640; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
641; CHECK-NEXT:    ret;
642  %r = xor <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
643  ret <4 x i8> %r
644}
645
; xor with the constant vector <1, 2, 3, 4> as the right-hand operand: the
; checks expect the constant to appear as one 32-bit immediate (67305985 ==
; 0x04030201) in a single xor.b32.
646define <4 x i8> @test_xor_imm_1(<4 x i8> %a) #0 {
647; CHECK-LABEL: test_xor_imm_1(
648; CHECK:       {
649; CHECK-NEXT:    .reg .b32 %r<3>;
650; CHECK-EMPTY:
651; CHECK-NEXT:  // %bb.0:
652; CHECK-NEXT:    ld.param.u32 %r1, [test_xor_imm_1_param_0];
653; CHECK-NEXT:    xor.b32 %r2, %r1, 67305985;
654; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
655; CHECK-NEXT:    ret;
656  %r = xor <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
657  ret <4 x i8> %r
658}
659
; <4 x i8> and of two parameters: the checks expect a single and.b32 on the two
; packed 32-bit values, with no per-lane unpacking.
660define <4 x i8> @test_and(<4 x i8> %a, <4 x i8> %b) #0 {
661; CHECK-LABEL: test_and(
662; CHECK:       {
663; CHECK-NEXT:    .reg .b32 %r<4>;
664; CHECK-EMPTY:
665; CHECK-NEXT:  // %bb.0:
666; CHECK-NEXT:    ld.param.u32 %r2, [test_and_param_1];
667; CHECK-NEXT:    ld.param.u32 %r1, [test_and_param_0];
668; CHECK-NEXT:    and.b32 %r3, %r1, %r2;
669; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
670; CHECK-NEXT:    ret;
671  %r = and <4 x i8> %a, %b
672  ret <4 x i8> %r
673}
674
; and of two vectors built with insertelement: %ins.0 is zeroinitializer with
; %a in lane 0, and %ins.1 is %ins.0 with the constant 5 in lane 1. The checks
; expect the vectors to be assembled with prmt.b32 / bfi.b32 and then combined
; by a single and.b32.
675define <4 x i8> @test_and_computed(i8 %a) {
676; CHECK-LABEL: test_and_computed(
677; CHECK:       {
678; CHECK-NEXT:    .reg .b16 %rs<2>;
679; CHECK-NEXT:    .reg .b32 %r<8>;
680; CHECK-EMPTY:
681; CHECK-NEXT:  // %bb.0:
682; CHECK-NEXT:    ld.param.u8 %rs1, [test_and_computed_param_0];
683; CHECK-NEXT:    mov.b32 %r1, 0;
684; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x3340U;
685; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
686; CHECK-NEXT:    prmt.b32 %r4, %r3, 0, 0x3340U;
687; CHECK-NEXT:    prmt.b32 %r5, %r4, %r2, 0x5410U;
688; CHECK-NEXT:    bfi.b32 %r6, 5, %r5, 8, 8;
689; CHECK-NEXT:    and.b32 %r7, %r6, %r5;
690; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
691; CHECK-NEXT:    ret;
692  %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0
693  %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1
694  %r = and <4 x i8> %ins.1, %ins.0
695  ret <4 x i8> %r
696}
697
; and with the constant vector <1, 2, 3, 4> as the left-hand operand: the
; checks expect the constant to appear as one 32-bit immediate (67305985 ==
; 0x04030201) in a single and.b32.
698define <4 x i8> @test_and_imm_0(<4 x i8> %a) #0 {
699; CHECK-LABEL: test_and_imm_0(
700; CHECK:       {
701; CHECK-NEXT:    .reg .b32 %r<3>;
702; CHECK-EMPTY:
703; CHECK-NEXT:  // %bb.0:
704; CHECK-NEXT:    ld.param.u32 %r1, [test_and_imm_0_param_0];
705; CHECK-NEXT:    and.b32 %r2, %r1, 67305985;
706; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
707; CHECK-NEXT:    ret;
708  %r = and <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
709  ret <4 x i8> %r
710}
711
; and with the constant vector <1, 2, 3, 4> as the right-hand operand: the
; checks expect the constant to appear as one 32-bit immediate (67305985 ==
; 0x04030201) in a single and.b32.
712define <4 x i8> @test_and_imm_1(<4 x i8> %a) #0 {
713; CHECK-LABEL: test_and_imm_1(
714; CHECK:       {
715; CHECK-NEXT:    .reg .b32 %r<3>;
716; CHECK-EMPTY:
717; CHECK-NEXT:  // %bb.0:
718; CHECK-NEXT:    ld.param.u32 %r1, [test_and_imm_1_param_0];
719; CHECK-NEXT:    and.b32 %r2, %r1, 67305985;
720; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
721; CHECK-NEXT:    ret;
722  %r = and <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
723  ret <4 x i8> %r
724}
725
; Load/store round trip: the checks expect one ld.u32 and one st.u32.
; NOTE(review): despite the "v2i8" in the name, the IR below loads and stores
; <4 x i8>, so this exercises the same path as a 4-element load/store and no
; <2 x i8> memory access is covered here. Presumably <2 x i8> was intended;
; confirm, and regenerate the CHECK lines with update_llc_test_checks.py if the
; types are changed.
726define void @test_ldst_v2i8(ptr %a, ptr %b) {
727; CHECK-LABEL: test_ldst_v2i8(
728; CHECK:       {
729; CHECK-NEXT:    .reg .b32 %r<2>;
730; CHECK-NEXT:    .reg .b64 %rd<3>;
731; CHECK-EMPTY:
732; CHECK-NEXT:  // %bb.0:
733; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v2i8_param_1];
734; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v2i8_param_0];
735; CHECK-NEXT:    ld.u32 %r1, [%rd1];
736; CHECK-NEXT:    st.u32 [%rd2], %r1;
737; CHECK-NEXT:    ret;
738  %t1 = load <4 x i8>, ptr %a
739  store <4 x i8> %t1, ptr %b, align 16
740  ret void
741}
742
; <3 x i8> load/store: the checks expect one 32-bit load, followed by a split
; store: st.u16 of the loaded register to [%rd2], then lane 2 (bfe.u32 at bit
; offset 16, width 8) written with st.u8 to [%rd2+2].
743define void @test_ldst_v3i8(ptr %a, ptr %b) {
744; CHECK-LABEL: test_ldst_v3i8(
745; CHECK:       {
746; CHECK-NEXT:    .reg .b32 %r<3>;
747; CHECK-NEXT:    .reg .b64 %rd<3>;
748; CHECK-EMPTY:
749; CHECK-NEXT:  // %bb.0:
750; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v3i8_param_1];
751; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v3i8_param_0];
752; CHECK-NEXT:    ld.u32 %r1, [%rd1];
753; CHECK-NEXT:    st.u16 [%rd2], %r1;
754; CHECK-NEXT:    bfe.u32 %r2, %r1, 16, 8;
755; CHECK-NEXT:    st.u8 [%rd2+2], %r2;
756; CHECK-NEXT:    ret;
757  %t1 = load <3 x i8>, ptr %a
758  store <3 x i8> %t1, ptr %b, align 16
759  ret void
760}
761
; <4 x i8> load/store with default load alignment and a 16-byte-aligned store:
; the checks expect the whole vector to move as one ld.u32 / st.u32 pair.
762define void @test_ldst_v4i8(ptr %a, ptr %b) {
763; CHECK-LABEL: test_ldst_v4i8(
764; CHECK:       {
765; CHECK-NEXT:    .reg .b32 %r<2>;
766; CHECK-NEXT:    .reg .b64 %rd<3>;
767; CHECK-EMPTY:
768; CHECK-NEXT:  // %bb.0:
769; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v4i8_param_1];
770; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v4i8_param_0];
771; CHECK-NEXT:    ld.u32 %r1, [%rd1];
772; CHECK-NEXT:    st.u32 [%rd2], %r1;
773; CHECK-NEXT:    ret;
774  %t1 = load <4 x i8>, ptr %a
775  store <4 x i8> %t1, ptr %b, align 16
776  ret void
777}
778
; <4 x i8> load/store with align 1 on both sides: the checks expect the access
; to be split into four ld.u8 (offsets 0..3) and four st.u8 (emitted from
; offset 3 down to 0) instead of a single 32-bit access.
779define void @test_ldst_v4i8_unaligned(ptr %a, ptr %b) {
780; CHECK-LABEL: test_ldst_v4i8_unaligned(
781; CHECK:       {
782; CHECK-NEXT:    .reg .b32 %r<5>;
783; CHECK-NEXT:    .reg .b64 %rd<3>;
784; CHECK-EMPTY:
785; CHECK-NEXT:  // %bb.0:
786; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v4i8_unaligned_param_1];
787; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v4i8_unaligned_param_0];
788; CHECK-NEXT:    ld.u8 %r1, [%rd1];
789; CHECK-NEXT:    ld.u8 %r2, [%rd1+1];
790; CHECK-NEXT:    ld.u8 %r3, [%rd1+2];
791; CHECK-NEXT:    ld.u8 %r4, [%rd1+3];
792; CHECK-NEXT:    st.u8 [%rd2+3], %r4;
793; CHECK-NEXT:    st.u8 [%rd2+2], %r3;
794; CHECK-NEXT:    st.u8 [%rd2+1], %r2;
795; CHECK-NEXT:    st.u8 [%rd2], %r1;
796; CHECK-NEXT:    ret;
797  %t1 = load <4 x i8>, ptr %a, align 1
798  store <4 x i8> %t1, ptr %b, align 1
799  ret void
800}
801
802
; <8 x i8> load/store: the checks expect the vector to move as two 32-bit
; registers through one ld.v2.b32 / st.v2.b32 pair.
803define void @test_ldst_v8i8(ptr %a, ptr %b) {
804; CHECK-LABEL: test_ldst_v8i8(
805; CHECK:       {
806; CHECK-NEXT:    .reg .b32 %r<3>;
807; CHECK-NEXT:    .reg .b64 %rd<3>;
808; CHECK-EMPTY:
809; CHECK-NEXT:  // %bb.0:
810; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v8i8_param_1];
811; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v8i8_param_0];
812; CHECK-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
813; CHECK-NEXT:    st.v2.b32 [%rd2], {%r1, %r2};
814; CHECK-NEXT:    ret;
815  %t1 = load <8 x i8>, ptr %a
816  store <8 x i8> %t1, ptr %b, align 16
817  ret void
818}
819
; External function taking and returning <4 x i8>; it is only declared here and
; serves as the call target for the call tests in this file.
820declare <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b) #0
821
; Call with <4 x i8> arguments forwarded in order. The checks expect each
; vector to be passed as a 4-byte, 4-byte-aligned .param written with one
; st.param.b32 (param0 <- %a, param1 <- %b) and the <4 x i8> result to be read
; back from retval0 with one ld.param.b32.
822define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 {
823; CHECK-LABEL: test_call(
824; CHECK:       {
825; CHECK-NEXT:    .reg .b32 %r<5>;
826; CHECK-EMPTY:
827; CHECK-NEXT:  // %bb.0:
828; CHECK-NEXT:    ld.param.u32 %r2, [test_call_param_1];
829; CHECK-NEXT:    ld.param.u32 %r1, [test_call_param_0];
830; CHECK-NEXT:    { // callseq 0, 0
831; CHECK-NEXT:    .param .align 4 .b8 param0[4];
832; CHECK-NEXT:    st.param.b32 [param0], %r1;
833; CHECK-NEXT:    .param .align 4 .b8 param1[4];
834; CHECK-NEXT:    st.param.b32 [param1], %r2;
835; CHECK-NEXT:    .param .align 4 .b8 retval0[4];
836; CHECK-NEXT:    call.uni (retval0),
837; CHECK-NEXT:    test_callee,
838; CHECK-NEXT:    (
839; CHECK-NEXT:    param0,
840; CHECK-NEXT:    param1
841; CHECK-NEXT:    );
842; CHECK-NEXT:    ld.param.b32 %r3, [retval0];
843; CHECK-NEXT:    } // callseq 0
844; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
845; CHECK-NEXT:    ret;
846  %r = call <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b)
847  ret <4 x i8> %r
848}
849
;; Call @test_callee with the arguments swapped, i.e. (%b, %a).
;; The checks expect %r2 (loaded from param_1) to be stored to param0 and %r1
;; (loaded from param_0) to be stored to param1.
850define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
851; CHECK-LABEL: test_call_flipped(
852; CHECK:       {
853; CHECK-NEXT:    .reg .b32 %r<5>;
854; CHECK-EMPTY:
855; CHECK-NEXT:  // %bb.0:
856; CHECK-NEXT:    ld.param.u32 %r2, [test_call_flipped_param_1];
857; CHECK-NEXT:    ld.param.u32 %r1, [test_call_flipped_param_0];
858; CHECK-NEXT:    { // callseq 1, 0
859; CHECK-NEXT:    .param .align 4 .b8 param0[4];
860; CHECK-NEXT:    st.param.b32 [param0], %r2;
861; CHECK-NEXT:    .param .align 4 .b8 param1[4];
862; CHECK-NEXT:    st.param.b32 [param1], %r1;
863; CHECK-NEXT:    .param .align 4 .b8 retval0[4];
864; CHECK-NEXT:    call.uni (retval0),
865; CHECK-NEXT:    test_callee,
866; CHECK-NEXT:    (
867; CHECK-NEXT:    param0,
868; CHECK-NEXT:    param1
869; CHECK-NEXT:    );
870; CHECK-NEXT:    ld.param.b32 %r3, [retval0];
871; CHECK-NEXT:    } // callseq 1
872; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
873; CHECK-NEXT:    ret;
874  %r = call <4 x i8> @test_callee(<4 x i8> %b, <4 x i8> %a)
875  ret <4 x i8> %r
876}
877
;; Same as test_call_flipped, but the IR call is marked `tail call`.
;; The checks still expect a full call.uni sequence followed by a store of the
;; result to func_retval0 and a ret.
878define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
879; CHECK-LABEL: test_tailcall_flipped(
880; CHECK:       {
881; CHECK-NEXT:    .reg .b32 %r<5>;
882; CHECK-EMPTY:
883; CHECK-NEXT:  // %bb.0:
884; CHECK-NEXT:    ld.param.u32 %r2, [test_tailcall_flipped_param_1];
885; CHECK-NEXT:    ld.param.u32 %r1, [test_tailcall_flipped_param_0];
886; CHECK-NEXT:    { // callseq 2, 0
887; CHECK-NEXT:    .param .align 4 .b8 param0[4];
888; CHECK-NEXT:    st.param.b32 [param0], %r2;
889; CHECK-NEXT:    .param .align 4 .b8 param1[4];
890; CHECK-NEXT:    st.param.b32 [param1], %r1;
891; CHECK-NEXT:    .param .align 4 .b8 retval0[4];
892; CHECK-NEXT:    call.uni (retval0),
893; CHECK-NEXT:    test_callee,
894; CHECK-NEXT:    (
895; CHECK-NEXT:    param0,
896; CHECK-NEXT:    param1
897; CHECK-NEXT:    );
898; CHECK-NEXT:    ld.param.b32 %r3, [retval0];
899; CHECK-NEXT:    } // callseq 2
900; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
901; CHECK-NEXT:    ret;
902  %r = tail call <4 x i8> @test_callee(<4 x i8> %b, <4 x i8> %a)
903  ret <4 x i8> %r
904}
905
;; Select between whole <4 x i8> values %a and %b on a scalar i1 condition.
;; The checks expect the condition to be loaded as a u8, masked with 1 and
;; compared against 1, followed by a single selp.b32 on the two 32-bit
;; registers holding %a and %b.
906define <4 x i8> @test_select(<4 x i8> %a, <4 x i8> %b, i1 zeroext %c) #0 {
907; CHECK-LABEL: test_select(
908; CHECK:       {
909; CHECK-NEXT:    .reg .pred %p<2>;
910; CHECK-NEXT:    .reg .b16 %rs<3>;
911; CHECK-NEXT:    .reg .b32 %r<4>;
912; CHECK-EMPTY:
913; CHECK-NEXT:  // %bb.0:
914; CHECK-NEXT:    ld.param.u8 %rs1, [test_select_param_2];
915; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
916; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 1;
917; CHECK-NEXT:    ld.param.u32 %r2, [test_select_param_1];
918; CHECK-NEXT:    ld.param.u32 %r1, [test_select_param_0];
919; CHECK-NEXT:    selp.b32 %r3, %r1, %r2, %p1;
920; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
921; CHECK-NEXT:    ret;
922  %r = select i1 %c, <4 x i8> %a, <4 x i8> %b
923  ret <4 x i8> %r
924}
925
;; Element-wise select: compare %c and %d with `icmp ne`, then pick lanes from
;; %a or %b. The checks expect each byte of %c/%d to be extracted with bfe.u32
;; and compared with setp.ne.u32, each byte of %a/%b to be extracted and
;; selected with selp.b32, and the four results to be combined with three
;; prmt.b32 instructions.
926define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) #0 {
927; CHECK-LABEL: test_select_cc(
928; CHECK:       {
929; CHECK-NEXT:    .reg .pred %p<5>;
930; CHECK-NEXT:    .reg .b32 %r<28>;
931; CHECK-EMPTY:
932; CHECK-NEXT:  // %bb.0:
933; CHECK-NEXT:    ld.param.u32 %r4, [test_select_cc_param_3];
934; CHECK-NEXT:    ld.param.u32 %r3, [test_select_cc_param_2];
935; CHECK-NEXT:    ld.param.u32 %r2, [test_select_cc_param_1];
936; CHECK-NEXT:    ld.param.u32 %r1, [test_select_cc_param_0];
937; CHECK-NEXT:    bfe.u32 %r5, %r4, 0, 8;
938; CHECK-NEXT:    bfe.u32 %r6, %r3, 0, 8;
939; CHECK-NEXT:    setp.ne.u32 %p1, %r6, %r5;
940; CHECK-NEXT:    bfe.u32 %r7, %r4, 8, 8;
941; CHECK-NEXT:    bfe.u32 %r8, %r3, 8, 8;
942; CHECK-NEXT:    setp.ne.u32 %p2, %r8, %r7;
943; CHECK-NEXT:    bfe.u32 %r9, %r4, 16, 8;
944; CHECK-NEXT:    bfe.u32 %r10, %r3, 16, 8;
945; CHECK-NEXT:    setp.ne.u32 %p3, %r10, %r9;
946; CHECK-NEXT:    bfe.u32 %r11, %r4, 24, 8;
947; CHECK-NEXT:    bfe.u32 %r12, %r3, 24, 8;
948; CHECK-NEXT:    setp.ne.u32 %p4, %r12, %r11;
949; CHECK-NEXT:    bfe.u32 %r13, %r2, 24, 8;
950; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
951; CHECK-NEXT:    selp.b32 %r15, %r14, %r13, %p4;
952; CHECK-NEXT:    bfe.u32 %r16, %r2, 16, 8;
953; CHECK-NEXT:    bfe.u32 %r17, %r1, 16, 8;
954; CHECK-NEXT:    selp.b32 %r18, %r17, %r16, %p3;
955; CHECK-NEXT:    prmt.b32 %r19, %r18, %r15, 0x3340U;
956; CHECK-NEXT:    bfe.u32 %r20, %r2, 8, 8;
957; CHECK-NEXT:    bfe.u32 %r21, %r1, 8, 8;
958; CHECK-NEXT:    selp.b32 %r22, %r21, %r20, %p2;
959; CHECK-NEXT:    bfe.u32 %r23, %r2, 0, 8;
960; CHECK-NEXT:    bfe.u32 %r24, %r1, 0, 8;
961; CHECK-NEXT:    selp.b32 %r25, %r24, %r23, %p1;
962; CHECK-NEXT:    prmt.b32 %r26, %r25, %r22, 0x3340U;
963; CHECK-NEXT:    prmt.b32 %r27, %r26, %r19, 0x5410U;
964; CHECK-NEXT:    st.param.b32 [func_retval0], %r27;
965; CHECK-NEXT:    ret;
966  %cc = icmp ne <4 x i8> %c, %d
967  %r = select <4 x i1> %cc, <4 x i8> %a, <4 x i8> %b
968  ret <4 x i8> %r
969}
970
;; Element-wise select of <4 x i32> values driven by an `icmp ne` on <4 x i8>
;; operands. The checks expect the i8 lanes of %c/%d to be extracted with
;; bfe.u32 and compared with setp.ne.u32, then four selp.b32 on the i32 lanes
;; and a single st.param.v4.b32 of the result.
;; Note: the autogenerated CHECK lines sit between the two lines of the
;; function signature.
971define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b,
972; CHECK-LABEL: test_select_cc_i32_i8(
973; CHECK:       {
974; CHECK-NEXT:    .reg .pred %p<5>;
975; CHECK-NEXT:    .reg .b32 %r<23>;
976; CHECK-EMPTY:
977; CHECK-NEXT:  // %bb.0:
978; CHECK-NEXT:    ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [test_select_cc_i32_i8_param_1];
979; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0];
980; CHECK-NEXT:    ld.param.u32 %r10, [test_select_cc_i32_i8_param_3];
981; CHECK-NEXT:    ld.param.u32 %r9, [test_select_cc_i32_i8_param_2];
982; CHECK-NEXT:    bfe.u32 %r11, %r10, 0, 8;
983; CHECK-NEXT:    bfe.u32 %r12, %r9, 0, 8;
984; CHECK-NEXT:    setp.ne.u32 %p1, %r12, %r11;
985; CHECK-NEXT:    bfe.u32 %r13, %r10, 8, 8;
986; CHECK-NEXT:    bfe.u32 %r14, %r9, 8, 8;
987; CHECK-NEXT:    setp.ne.u32 %p2, %r14, %r13;
988; CHECK-NEXT:    bfe.u32 %r15, %r10, 16, 8;
989; CHECK-NEXT:    bfe.u32 %r16, %r9, 16, 8;
990; CHECK-NEXT:    setp.ne.u32 %p3, %r16, %r15;
991; CHECK-NEXT:    bfe.u32 %r17, %r10, 24, 8;
992; CHECK-NEXT:    bfe.u32 %r18, %r9, 24, 8;
993; CHECK-NEXT:    setp.ne.u32 %p4, %r18, %r17;
994; CHECK-NEXT:    selp.b32 %r19, %r4, %r8, %p4;
995; CHECK-NEXT:    selp.b32 %r20, %r3, %r7, %p3;
996; CHECK-NEXT:    selp.b32 %r21, %r2, %r6, %p2;
997; CHECK-NEXT:    selp.b32 %r22, %r1, %r5, %p1;
998; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r22, %r21, %r20, %r19};
999; CHECK-NEXT:    ret;
1000                                           <4 x i8> %c, <4 x i8> %d) #0 {
1001  %cc = icmp ne <4 x i8> %c, %d
1002  %r = select <4 x i1> %cc, <4 x i32> %a, <4 x i32> %b
1003  ret <4 x i32> %r
1004}
1005
;; Element-wise select of <4 x i8> values driven by an `icmp ne` on <4 x i32>
;; operands. The checks expect four setp.ne.s32 directly on the i32 lanes,
;; then per-byte bfe.u32 extraction of %a/%b, four selp.b32, and three
;; prmt.b32 instructions to combine the result.
;; Note: the autogenerated CHECK lines sit between the two lines of the
;; function signature.
1006define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b,
1007; CHECK-LABEL: test_select_cc_i8_i32(
1008; CHECK:       {
1009; CHECK-NEXT:    .reg .pred %p<5>;
1010; CHECK-NEXT:    .reg .b32 %r<26>;
1011; CHECK-EMPTY:
1012; CHECK-NEXT:  // %bb.0:
1013; CHECK-NEXT:    ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3];
1014; CHECK-NEXT:    ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2];
1015; CHECK-NEXT:    ld.param.u32 %r2, [test_select_cc_i8_i32_param_1];
1016; CHECK-NEXT:    ld.param.u32 %r1, [test_select_cc_i8_i32_param_0];
1017; CHECK-NEXT:    setp.ne.s32 %p1, %r3, %r7;
1018; CHECK-NEXT:    setp.ne.s32 %p2, %r4, %r8;
1019; CHECK-NEXT:    setp.ne.s32 %p3, %r5, %r9;
1020; CHECK-NEXT:    setp.ne.s32 %p4, %r6, %r10;
1021; CHECK-NEXT:    bfe.u32 %r11, %r2, 24, 8;
1022; CHECK-NEXT:    bfe.u32 %r12, %r1, 24, 8;
1023; CHECK-NEXT:    selp.b32 %r13, %r12, %r11, %p4;
1024; CHECK-NEXT:    bfe.u32 %r14, %r2, 16, 8;
1025; CHECK-NEXT:    bfe.u32 %r15, %r1, 16, 8;
1026; CHECK-NEXT:    selp.b32 %r16, %r15, %r14, %p3;
1027; CHECK-NEXT:    prmt.b32 %r17, %r16, %r13, 0x3340U;
1028; CHECK-NEXT:    bfe.u32 %r18, %r2, 8, 8;
1029; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
1030; CHECK-NEXT:    selp.b32 %r20, %r19, %r18, %p2;
1031; CHECK-NEXT:    bfe.u32 %r21, %r2, 0, 8;
1032; CHECK-NEXT:    bfe.u32 %r22, %r1, 0, 8;
1033; CHECK-NEXT:    selp.b32 %r23, %r22, %r21, %p1;
1034; CHECK-NEXT:    prmt.b32 %r24, %r23, %r20, 0x3340U;
1035; CHECK-NEXT:    prmt.b32 %r25, %r24, %r17, 0x5410U;
1036; CHECK-NEXT:    st.param.b32 [func_retval0], %r25;
1037; CHECK-NEXT:    ret;
1038                                          <4 x i32> %c, <4 x i32> %d) #0 {
1039  %cc = icmp ne <4 x i32> %c, %d
1040  %r = select <4 x i1> %cc, <4 x i8> %a, <4 x i8> %b
1041  ret <4 x i8> %r
1042}
1043
1044
;; Truncate <4 x i32> to <4 x i8>. The checks expect the four 32-bit lanes to
;; be combined with three prmt.b32 instructions.
;; NOTE(review): the name says "2xi32" but the operand is <4 x i32>; the name
;; is presumably inherited from a two-element variant of this test -- confirm.
1045define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 {
1046; CHECK-LABEL: test_trunc_2xi32(
1047; CHECK:       {
1048; CHECK-NEXT:    .reg .b32 %r<8>;
1049; CHECK-EMPTY:
1050; CHECK-NEXT:  // %bb.0:
1051; CHECK-NEXT:    ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0];
1052; CHECK-NEXT:    prmt.b32 %r5, %r3, %r4, 0x3340U;
1053; CHECK-NEXT:    prmt.b32 %r6, %r1, %r2, 0x3340U;
1054; CHECK-NEXT:    prmt.b32 %r7, %r6, %r5, 0x5410U;
1055; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
1056; CHECK-NEXT:    ret;
1057  %r = trunc <4 x i32> %a to <4 x i8>
1058  ret <4 x i8> %r
1059}
1060
;; Truncate <4 x i64> to <4 x i8>. The checks expect the argument to be loaded
;; as two v2.u64 halves, each lane narrowed with cvt.u32.u64, and the results
;; combined with three prmt.b32 instructions.
;; NOTE(review): the name says "2xi64" but the operand is <4 x i64> -- confirm
;; whether the name is intentional.
1061define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 {
1062; CHECK-LABEL: test_trunc_2xi64(
1063; CHECK:       {
1064; CHECK-NEXT:    .reg .b32 %r<8>;
1065; CHECK-NEXT:    .reg .b64 %rd<5>;
1066; CHECK-EMPTY:
1067; CHECK-NEXT:  // %bb.0:
1068; CHECK-NEXT:    ld.param.v2.u64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16];
1069; CHECK-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0];
1070; CHECK-NEXT:    cvt.u32.u64 %r1, %rd4;
1071; CHECK-NEXT:    cvt.u32.u64 %r2, %rd3;
1072; CHECK-NEXT:    prmt.b32 %r3, %r2, %r1, 0x3340U;
1073; CHECK-NEXT:    cvt.u32.u64 %r4, %rd2;
1074; CHECK-NEXT:    cvt.u32.u64 %r5, %rd1;
1075; CHECK-NEXT:    prmt.b32 %r6, %r5, %r4, 0x3340U;
1076; CHECK-NEXT:    prmt.b32 %r7, %r6, %r3, 0x5410U;
1077; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
1078; CHECK-NEXT:    ret;
1079  %r = trunc <4 x i64> %a to <4 x i8>
1080  ret <4 x i8> %r
1081}
1082
;; Zero-extend <4 x i8> to <4 x i32>. The checks expect one bfe.u32 per byte
;; and a single st.param.v4.b32 of the four results.
;; NOTE(review): the name says "2xi32" but the result is <4 x i32> -- confirm
;; whether the name is intentional.
1083define <4 x i32> @test_zext_2xi32(<4 x i8> %a) #0 {
1084; CHECK-LABEL: test_zext_2xi32(
1085; CHECK:       {
1086; CHECK-NEXT:    .reg .b32 %r<6>;
1087; CHECK-EMPTY:
1088; CHECK-NEXT:  // %bb.0:
1089; CHECK-NEXT:    ld.param.u32 %r1, [test_zext_2xi32_param_0];
1090; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
1091; CHECK-NEXT:    bfe.u32 %r3, %r1, 16, 8;
1092; CHECK-NEXT:    bfe.u32 %r4, %r1, 8, 8;
1093; CHECK-NEXT:    bfe.u32 %r5, %r1, 0, 8;
1094; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r5, %r4, %r3, %r2};
1095; CHECK-NEXT:    ret;
1096  %r = zext <4 x i8> %a to <4 x i32>
1097  ret <4 x i32> %r
1098}
1099
;; Zero-extend <4 x i8> to <4 x i64>. The checks expect, per byte, a bfe.u32,
;; a cvt.u64.u32 and an and.b64 with 255, with the result stored as two
;; st.param.v2.b64 halves.
;; NOTE(review): the name says "2xi64" but the result is <4 x i64> -- confirm
;; whether the name is intentional.
1100define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 {
1101; CHECK-LABEL: test_zext_2xi64(
1102; CHECK:       {
1103; CHECK-NEXT:    .reg .b32 %r<6>;
1104; CHECK-NEXT:    .reg .b64 %rd<9>;
1105; CHECK-EMPTY:
1106; CHECK-NEXT:  // %bb.0:
1107; CHECK-NEXT:    ld.param.u32 %r1, [test_zext_2xi64_param_0];
1108; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
1109; CHECK-NEXT:    cvt.u64.u32 %rd1, %r2;
1110; CHECK-NEXT:    and.b64 %rd2, %rd1, 255;
1111; CHECK-NEXT:    bfe.u32 %r3, %r1, 16, 8;
1112; CHECK-NEXT:    cvt.u64.u32 %rd3, %r3;
1113; CHECK-NEXT:    and.b64 %rd4, %rd3, 255;
1114; CHECK-NEXT:    bfe.u32 %r4, %r1, 8, 8;
1115; CHECK-NEXT:    cvt.u64.u32 %rd5, %r4;
1116; CHECK-NEXT:    and.b64 %rd6, %rd5, 255;
1117; CHECK-NEXT:    bfe.u32 %r5, %r1, 0, 8;
1118; CHECK-NEXT:    cvt.u64.u32 %rd7, %r5;
1119; CHECK-NEXT:    and.b64 %rd8, %rd7, 255;
1120; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd8, %rd6};
1121; CHECK-NEXT:    st.param.v2.b64 [func_retval0+16], {%rd4, %rd2};
1122; CHECK-NEXT:    ret;
1123  %r = zext <4 x i8> %a to <4 x i64>
1124  ret <4 x i64> %r
1125}
1126
;; Bitcast i32 to <4 x i8>. The checks expect no conversion instruction: the
;; loaded 32-bit register is stored straight to the return slot.
1127define <4 x i8> @test_bitcast_i32_to_4xi8(i32 %a) #0 {
1128; CHECK-LABEL: test_bitcast_i32_to_4xi8(
1129; CHECK:       {
1130; CHECK-NEXT:    .reg .b32 %r<2>;
1131; CHECK-EMPTY:
1132; CHECK-NEXT:  // %bb.0:
1133; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_i32_to_4xi8_param_0];
1134; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
1135; CHECK-NEXT:    ret;
1136  %r = bitcast i32 %a to <4 x i8>
1137  ret <4 x i8> %r
1138}
1139
;; Bitcast float to <4 x i8>. The checks expect a single mov.b32 from the
;; .f32 register to a .b32 register.
1140define <4 x i8> @test_bitcast_float_to_4xi8(float %a) #0 {
1141; CHECK-LABEL: test_bitcast_float_to_4xi8(
1142; CHECK:       {
1143; CHECK-NEXT:    .reg .b32 %r<2>;
1144; CHECK-NEXT:    .reg .f32 %f<2>;
1145; CHECK-EMPTY:
1146; CHECK-NEXT:  // %bb.0:
1147; CHECK-NEXT:    ld.param.f32 %f1, [test_bitcast_float_to_4xi8_param_0];
1148; CHECK-NEXT:    mov.b32 %r1, %f1;
1149; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
1150; CHECK-NEXT:    ret;
1151  %r = bitcast float %a to <4 x i8>
1152  ret <4 x i8> %r
1153}
1154
;; Bitcast <4 x i8> to i32. The checks expect no conversion instruction: the
;; loaded 32-bit register is stored straight to the return slot.
1155define i32 @test_bitcast_4xi8_to_i32(<4 x i8> %a) #0 {
1156; CHECK-LABEL: test_bitcast_4xi8_to_i32(
1157; CHECK:       {
1158; CHECK-NEXT:    .reg .b32 %r<2>;
1159; CHECK-EMPTY:
1160; CHECK-NEXT:  // %bb.0:
1161; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_4xi8_to_i32_param_0];
1162; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
1163; CHECK-NEXT:    ret;
1164  %r = bitcast <4 x i8> %a to i32
1165  ret i32 %r
1166}
1167
;; Bitcast <4 x i8> to float. The checks expect a single mov.b32 from the
;; .b32 register to an .f32 register.
1168define float @test_bitcast_4xi8_to_float(<4 x i8> %a) #0 {
1169; CHECK-LABEL: test_bitcast_4xi8_to_float(
1170; CHECK:       {
1171; CHECK-NEXT:    .reg .b32 %r<2>;
1172; CHECK-NEXT:    .reg .f32 %f<2>;
1173; CHECK-EMPTY:
1174; CHECK-NEXT:  // %bb.0:
1175; CHECK-NEXT:    ld.param.u32 %r1, [test_bitcast_4xi8_to_float_param_0];
1176; CHECK-NEXT:    mov.b32 %f1, %r1;
1177; CHECK-NEXT:    st.param.f32 [func_retval0], %f1;
1178; CHECK-NEXT:    ret;
1179  %r = bitcast <4 x i8> %a to float
1180  ret float %r
1181}
1182
1183
;; Build a <4 x i8> as <%a, 5, 6, 7> with four insertelement instructions and
;; bitcast it to <2 x half>. Unlike the other bitcast tests, the function
;; argument is a scalar i8, not a <4 x i8>.
;; The checks expect the vector to be assembled with three prmt.b32
;; instructions and stored as a b32 with no further conversion.
1184define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 {
1185; CHECK-LABEL: test_bitcast_4xi8_to_2xhalf(
1186; CHECK:       {
1187; CHECK-NEXT:    .reg .b16 %rs<2>;
1188; CHECK-NEXT:    .reg .b32 %r<6>;
1189; CHECK-EMPTY:
1190; CHECK-NEXT:  // %bb.0:
1191; CHECK-NEXT:    ld.param.u8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0];
1192; CHECK-NEXT:    mov.b32 %r1, 6;
1193; CHECK-NEXT:    prmt.b32 %r2, %r1, 7, 0x3340U;
1194; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
1195; CHECK-NEXT:    prmt.b32 %r4, %r3, 5, 0x3340U;
1196; CHECK-NEXT:    prmt.b32 %r5, %r4, %r2, 0x5410U;
1197; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
1198; CHECK-NEXT:    ret;
1199  %ins.0 = insertelement <4 x i8> undef, i8 %a, i32 0
1200  %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1
1201  %ins.2 = insertelement <4 x i8> %ins.1, i8 6, i32 2
1202  %ins.3 = insertelement <4 x i8> %ins.2, i8 7, i32 3
1203  %r = bitcast <4 x i8> %ins.3 to <2 x half>
1204  ret <2 x half> %r
1205}
1206
1207
;; Reverse the elements of %a with a single-input shufflevector (mask
;; <3, 2, 1, 0>, second operand undef). The checks expect one prmt.b32 with
;; selector 0x123U whose second source register is an implicit-def.
1208define <4 x i8> @test_shufflevector(<4 x i8> %a) #0 {
1209; CHECK-LABEL: test_shufflevector(
1210; CHECK:       {
1211; CHECK-NEXT:    .reg .b32 %r<4>;
1212; CHECK-EMPTY:
1213; CHECK-NEXT:  // %bb.0:
1214; CHECK-NEXT:    ld.param.u32 %r1, [test_shufflevector_param_0];
1215; CHECK-NEXT:    // implicit-def: %r3
1216; CHECK-NEXT:    prmt.b32 %r2, %r1, %r3, 0x123U;
1217; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
1218; CHECK-NEXT:    ret;
1219  %s = shufflevector <4 x i8> %a, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1220  ret <4 x i8> %s
1221}
1222
;; Two-input shufflevector of %a and %b with mask <7, 3, 5, 2>. The checks
;; expect one prmt.b32 on the two source registers with selector 0x2537U.
1223define <4 x i8> @test_shufflevector_2(<4 x i8> %a, <4 x i8> %b) #0 {
1224; CHECK-LABEL: test_shufflevector_2(
1225; CHECK:       {
1226; CHECK-NEXT:    .reg .b32 %r<4>;
1227; CHECK-EMPTY:
1228; CHECK-NEXT:  // %bb.0:
1229; CHECK-NEXT:    ld.param.u32 %r2, [test_shufflevector_2_param_1];
1230; CHECK-NEXT:    ld.param.u32 %r1, [test_shufflevector_2_param_0];
1231; CHECK-NEXT:    prmt.b32 %r3, %r1, %r2, 0x2537U;
1232; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
1233; CHECK-NEXT:    ret;
1234  %s = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 7, i32 3, i32 5, i32 2>
1235  ret <4 x i8> %s
1236}
1237
1238
;; Insert the scalar %x into element 1 of %a. The checks expect %x to be
;; loaded as a u8, widened with cvt.u32.u16, and inserted with a single
;; bfi.b32 using position 8 and length 8.
1239define <4 x i8> @test_insertelement(<4 x i8> %a, i8 %x) #0 {
1240; CHECK-LABEL: test_insertelement(
1241; CHECK:       {
1242; CHECK-NEXT:    .reg .b16 %rs<2>;
1243; CHECK-NEXT:    .reg .b32 %r<4>;
1244; CHECK-EMPTY:
1245; CHECK-NEXT:  // %bb.0:
1246; CHECK-NEXT:    ld.param.u8 %rs1, [test_insertelement_param_1];
1247; CHECK-NEXT:    ld.param.u32 %r1, [test_insertelement_param_0];
1248; CHECK-NEXT:    cvt.u32.u16 %r2, %rs1;
1249; CHECK-NEXT:    bfi.b32 %r3, %r2, %r1, 8, 8;
1250; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
1251; CHECK-NEXT:    ret;
1252  %i = insertelement <4 x i8> %a, i8 %x, i64 1
1253  ret <4 x i8> %i
1254}
1255
;; Convert <4 x half> to <4 x i8> with fptosi. The checks expect the argument
;; to be loaded as two 32-bit registers, each split into two 16-bit halves and
;; converted with cvt.rzi.s16.f16, then the four results widened with
;; cvt.u32.u16 and combined with three prmt.b32 instructions.
1256define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 {
1257; CHECK-LABEL: test_fptosi_4xhalf_to_4xi8(
1258; CHECK:       {
1259; CHECK-NEXT:    .reg .b16 %rs<13>;
1260; CHECK-NEXT:    .reg .b32 %r<12>;
1261; CHECK-EMPTY:
1262; CHECK-NEXT:  // %bb.0:
1263; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_fptosi_4xhalf_to_4xi8_param_0];
1264; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
1265; CHECK-NEXT:    cvt.rzi.s16.f16 %rs3, %rs2;
1266; CHECK-NEXT:    cvt.rzi.s16.f16 %rs4, %rs1;
1267; CHECK-NEXT:    mov.b32 %r3, {%rs4, %rs3};
1268; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
1269; CHECK-NEXT:    cvt.u32.u16 %r4, %rs6;
1270; CHECK-NEXT:    cvt.u32.u16 %r5, %rs5;
1271; CHECK-NEXT:    prmt.b32 %r6, %r5, %r4, 0x3340U;
1272; CHECK-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
1273; CHECK-NEXT:    cvt.rzi.s16.f16 %rs9, %rs8;
1274; CHECK-NEXT:    cvt.rzi.s16.f16 %rs10, %rs7;
1275; CHECK-NEXT:    mov.b32 %r7, {%rs10, %rs9};
1276; CHECK-NEXT:    mov.b32 {%rs11, %rs12}, %r7;
1277; CHECK-NEXT:    cvt.u32.u16 %r8, %rs12;
1278; CHECK-NEXT:    cvt.u32.u16 %r9, %rs11;
1279; CHECK-NEXT:    prmt.b32 %r10, %r9, %r8, 0x3340U;
1280; CHECK-NEXT:    prmt.b32 %r11, %r10, %r6, 0x5410U;
1281; CHECK-NEXT:    st.param.b32 [func_retval0], %r11;
1282; CHECK-NEXT:    ret;
1283  %r = fptosi <4 x half> %a to <4 x i8>
1284  ret <4 x i8> %r
1285}
1286
;; Convert <4 x half> to <4 x i8> with fptoui. The expected sequence has the
;; same shape as in test_fptosi_4xhalf_to_4xi8, except that the conversions
;; are cvt.rzi.u16.f16 instead of cvt.rzi.s16.f16.
1287define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 {
1288; CHECK-LABEL: test_fptoui_4xhalf_to_4xi8(
1289; CHECK:       {
1290; CHECK-NEXT:    .reg .b16 %rs<13>;
1291; CHECK-NEXT:    .reg .b32 %r<12>;
1292; CHECK-EMPTY:
1293; CHECK-NEXT:  // %bb.0:
1294; CHECK-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_fptoui_4xhalf_to_4xi8_param_0];
1295; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
1296; CHECK-NEXT:    cvt.rzi.u16.f16 %rs3, %rs2;
1297; CHECK-NEXT:    cvt.rzi.u16.f16 %rs4, %rs1;
1298; CHECK-NEXT:    mov.b32 %r3, {%rs4, %rs3};
1299; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
1300; CHECK-NEXT:    cvt.u32.u16 %r4, %rs6;
1301; CHECK-NEXT:    cvt.u32.u16 %r5, %rs5;
1302; CHECK-NEXT:    prmt.b32 %r6, %r5, %r4, 0x3340U;
1303; CHECK-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
1304; CHECK-NEXT:    cvt.rzi.u16.f16 %rs9, %rs8;
1305; CHECK-NEXT:    cvt.rzi.u16.f16 %rs10, %rs7;
1306; CHECK-NEXT:    mov.b32 %r7, {%rs10, %rs9};
1307; CHECK-NEXT:    mov.b32 {%rs11, %rs12}, %r7;
1308; CHECK-NEXT:    cvt.u32.u16 %r8, %rs12;
1309; CHECK-NEXT:    cvt.u32.u16 %r9, %rs11;
1310; CHECK-NEXT:    prmt.b32 %r10, %r9, %r8, 0x3340U;
1311; CHECK-NEXT:    prmt.b32 %r11, %r10, %r6, 0x5410U;
1312; CHECK-NEXT:    st.param.b32 [func_retval0], %r11;
1313; CHECK-NEXT:    ret;
1314  %r = fptoui <4 x half> %a to <4 x i8>
1315  ret <4 x i8> %r
1316}
1317
;; Load two <4 x i8> vectors from %a and %b (align 4), compute their signed
;; remainder, and store the result to %c (align 4).
;; The checks expect one ld.u32 per operand; per element a bfe.s32 plus
;; cvt.s8.s32 on each operand followed by rem.s16; then the four results
;; combined with prmt.b32 and written with a single st.u32.
1318define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) {
1319; CHECK-LABEL: test_srem_v4i8(
1320; CHECK:       {
1321; CHECK-NEXT:    .reg .b16 %rs<13>;
1322; CHECK-NEXT:    .reg .b32 %r<18>;
1323; CHECK-NEXT:    .reg .b64 %rd<4>;
1324; CHECK-EMPTY:
1325; CHECK-NEXT:  // %bb.0: // %entry
1326; CHECK-NEXT:    ld.param.u64 %rd3, [test_srem_v4i8_param_2];
1327; CHECK-NEXT:    ld.param.u64 %rd2, [test_srem_v4i8_param_1];
1328; CHECK-NEXT:    ld.param.u64 %rd1, [test_srem_v4i8_param_0];
1329; CHECK-NEXT:    ld.u32 %r1, [%rd1];
1330; CHECK-NEXT:    ld.u32 %r2, [%rd2];
1331; CHECK-NEXT:    bfe.s32 %r3, %r2, 24, 8;
1332; CHECK-NEXT:    cvt.s8.s32 %rs1, %r3;
1333; CHECK-NEXT:    bfe.s32 %r4, %r1, 24, 8;
1334; CHECK-NEXT:    cvt.s8.s32 %rs2, %r4;
1335; CHECK-NEXT:    rem.s16 %rs3, %rs2, %rs1;
1336; CHECK-NEXT:    cvt.u32.u16 %r5, %rs3;
1337; CHECK-NEXT:    bfe.s32 %r6, %r2, 16, 8;
1338; CHECK-NEXT:    cvt.s8.s32 %rs4, %r6;
1339; CHECK-NEXT:    bfe.s32 %r7, %r1, 16, 8;
1340; CHECK-NEXT:    cvt.s8.s32 %rs5, %r7;
1341; CHECK-NEXT:    rem.s16 %rs6, %rs5, %rs4;
1342; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
1343; CHECK-NEXT:    prmt.b32 %r9, %r8, %r5, 0x3340U;
1344; CHECK-NEXT:    bfe.s32 %r10, %r2, 8, 8;
1345; CHECK-NEXT:    cvt.s8.s32 %rs7, %r10;
1346; CHECK-NEXT:    bfe.s32 %r11, %r1, 8, 8;
1347; CHECK-NEXT:    cvt.s8.s32 %rs8, %r11;
1348; CHECK-NEXT:    rem.s16 %rs9, %rs8, %rs7;
1349; CHECK-NEXT:    cvt.u32.u16 %r12, %rs9;
1350; CHECK-NEXT:    bfe.s32 %r13, %r2, 0, 8;
1351; CHECK-NEXT:    cvt.s8.s32 %rs10, %r13;
1352; CHECK-NEXT:    bfe.s32 %r14, %r1, 0, 8;
1353; CHECK-NEXT:    cvt.s8.s32 %rs11, %r14;
1354; CHECK-NEXT:    rem.s16 %rs12, %rs11, %rs10;
1355; CHECK-NEXT:    cvt.u32.u16 %r15, %rs12;
1356; CHECK-NEXT:    prmt.b32 %r16, %r15, %r12, 0x3340U;
1357; CHECK-NEXT:    prmt.b32 %r17, %r16, %r9, 0x5410U;
1358; CHECK-NEXT:    st.u32 [%rd3], %r17;
1359; CHECK-NEXT:    ret;
1360entry:
1361  %t57 = load <4 x i8>, ptr %a, align 4
1362  %t59 = load <4 x i8>, ptr %b, align 4
1363  %x = srem <4 x i8> %t57, %t59
1364  store <4 x i8> %x, ptr %c, align 4
1365  ret void
1366}
1367
1368;; v3i8 lowering, especially for unaligned loads, is terrible. We end up doing
1369;; tons of pointless scalar_to_vector/bitcast/extract_elt on v2i16/v4i8, which
1370;; is further complicated by LLVM trying to use i16 as an intermediate type,
1371;; because we don't have i8 registers. It's a mess.
1372;; Ideally we want to split it into element-wise ops, but the legalizer can't
1373;; handle odd-sized vectors. TL;DR: don't use odd-sized vectors of i8.
;; Load two <3 x i8> vectors from %a and %b (align 1), compute their signed
;; remainder, and store the result to %c (align 1).
;; The checks expect byte-wise loads (ld.u8 for bytes 0 and 1, ld.s8 for
;; byte 2). Bytes 0 and 1 are merged into a 16-bit value and go through the
;; bfe.s32 / cvt.s8.s32 / rem.s16 / prmt.b32 path, while byte 2 is handled by
;; a rem.s16 directly on the ld.s8 results. The result is written with three
;; st.u8 stores.
1374define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) {
1375; CHECK-LABEL: test_srem_v3i8(
1376; CHECK:       {
1377; CHECK-NEXT:    .reg .b16 %rs<20>;
1378; CHECK-NEXT:    .reg .b32 %r<14>;
1379; CHECK-NEXT:    .reg .b64 %rd<4>;
1380; CHECK-EMPTY:
1381; CHECK-NEXT:  // %bb.0: // %entry
1382; CHECK-NEXT:    ld.param.u64 %rd3, [test_srem_v3i8_param_2];
1383; CHECK-NEXT:    ld.param.u64 %rd2, [test_srem_v3i8_param_1];
1384; CHECK-NEXT:    ld.param.u64 %rd1, [test_srem_v3i8_param_0];
1385; CHECK-NEXT:    ld.u8 %rs1, [%rd1];
1386; CHECK-NEXT:    ld.u8 %rs2, [%rd1+1];
1387; CHECK-NEXT:    shl.b16 %rs3, %rs2, 8;
1388; CHECK-NEXT:    or.b16 %rs4, %rs3, %rs1;
1389; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
1390; CHECK-NEXT:    ld.s8 %rs5, [%rd1+2];
1391; CHECK-NEXT:    ld.u8 %rs6, [%rd2];
1392; CHECK-NEXT:    ld.u8 %rs7, [%rd2+1];
1393; CHECK-NEXT:    shl.b16 %rs8, %rs7, 8;
1394; CHECK-NEXT:    or.b16 %rs9, %rs8, %rs6;
1395; CHECK-NEXT:    cvt.u32.u16 %r2, %rs9;
1396; CHECK-NEXT:    ld.s8 %rs10, [%rd2+2];
1397; CHECK-NEXT:    bfe.s32 %r3, %r2, 8, 8;
1398; CHECK-NEXT:    cvt.s8.s32 %rs11, %r3;
1399; CHECK-NEXT:    bfe.s32 %r4, %r1, 8, 8;
1400; CHECK-NEXT:    cvt.s8.s32 %rs12, %r4;
1401; CHECK-NEXT:    rem.s16 %rs13, %rs12, %rs11;
1402; CHECK-NEXT:    cvt.u32.u16 %r5, %rs13;
1403; CHECK-NEXT:    bfe.s32 %r6, %r2, 0, 8;
1404; CHECK-NEXT:    cvt.s8.s32 %rs14, %r6;
1405; CHECK-NEXT:    bfe.s32 %r7, %r1, 0, 8;
1406; CHECK-NEXT:    cvt.s8.s32 %rs15, %r7;
1407; CHECK-NEXT:    rem.s16 %rs16, %rs15, %rs14;
1408; CHECK-NEXT:    cvt.u32.u16 %r8, %rs16;
1409; CHECK-NEXT:    prmt.b32 %r9, %r8, %r5, 0x3340U;
1410; CHECK-NEXT:    // implicit-def: %r11
1411; CHECK-NEXT:    // implicit-def: %r12
1412; CHECK-NEXT:    prmt.b32 %r10, %r11, %r12, 0x3340U;
1413; CHECK-NEXT:    prmt.b32 %r13, %r9, %r10, 0x5410U;
1414; CHECK-NEXT:    rem.s16 %rs17, %rs5, %rs10;
1415; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs18, tmp}, %r13; }
1416; CHECK-NEXT:    st.u8 [%rd3], %rs18;
1417; CHECK-NEXT:    shr.u16 %rs19, %rs18, 8;
1418; CHECK-NEXT:    st.u8 [%rd3+1], %rs19;
1419; CHECK-NEXT:    st.u8 [%rd3+2], %rs17;
1420; CHECK-NEXT:    ret;
1421entry:
1422  %t57 = load <3 x i8>, ptr %a, align 1
1423  %t59 = load <3 x i8>, ptr %b, align 1
1424  %x = srem <3 x i8> %t57, %t59
1425  store <3 x i8> %x, ptr %c, align 1
1426  ret void
1427}
1428
;; Load two <4 x i8> vectors from %a and %b (align 4), compare them with
;; `icmp ugt`, sign-extend the <4 x i1> result to <4 x i8>, and store it to %c.
;; The checks expect per-byte bfe.u32 extraction and setp.hi.u32 compares,
;; each predicate materialised with `selp.s32 ..., -1, 0`, and the four values
;; combined with prmt.b32 before a single st.u32.
1429define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) {
1430; CHECK-LABEL: test_sext_v4i1_to_v4i8(
1431; CHECK:       {
1432; CHECK-NEXT:    .reg .pred %p<5>;
1433; CHECK-NEXT:    .reg .b32 %r<18>;
1434; CHECK-NEXT:    .reg .b64 %rd<4>;
1435; CHECK-EMPTY:
1436; CHECK-NEXT:  // %bb.0: // %entry
1437; CHECK-NEXT:    ld.param.u64 %rd3, [test_sext_v4i1_to_v4i8_param_2];
1438; CHECK-NEXT:    ld.param.u64 %rd2, [test_sext_v4i1_to_v4i8_param_1];
1439; CHECK-NEXT:    ld.param.u64 %rd1, [test_sext_v4i1_to_v4i8_param_0];
1440; CHECK-NEXT:    ld.u32 %r1, [%rd1];
1441; CHECK-NEXT:    ld.u32 %r2, [%rd2];
1442; CHECK-NEXT:    bfe.u32 %r3, %r2, 0, 8;
1443; CHECK-NEXT:    bfe.u32 %r4, %r1, 0, 8;
1444; CHECK-NEXT:    setp.hi.u32 %p1, %r4, %r3;
1445; CHECK-NEXT:    bfe.u32 %r5, %r2, 8, 8;
1446; CHECK-NEXT:    bfe.u32 %r6, %r1, 8, 8;
1447; CHECK-NEXT:    setp.hi.u32 %p2, %r6, %r5;
1448; CHECK-NEXT:    bfe.u32 %r7, %r2, 16, 8;
1449; CHECK-NEXT:    bfe.u32 %r8, %r1, 16, 8;
1450; CHECK-NEXT:    setp.hi.u32 %p3, %r8, %r7;
1451; CHECK-NEXT:    bfe.u32 %r9, %r2, 24, 8;
1452; CHECK-NEXT:    bfe.u32 %r10, %r1, 24, 8;
1453; CHECK-NEXT:    setp.hi.u32 %p4, %r10, %r9;
1454; CHECK-NEXT:    selp.s32 %r11, -1, 0, %p4;
1455; CHECK-NEXT:    selp.s32 %r12, -1, 0, %p3;
1456; CHECK-NEXT:    prmt.b32 %r13, %r12, %r11, 0x3340U;
1457; CHECK-NEXT:    selp.s32 %r14, -1, 0, %p2;
1458; CHECK-NEXT:    selp.s32 %r15, -1, 0, %p1;
1459; CHECK-NEXT:    prmt.b32 %r16, %r15, %r14, 0x3340U;
1460; CHECK-NEXT:    prmt.b32 %r17, %r16, %r13, 0x5410U;
1461; CHECK-NEXT:    st.u32 [%rd3], %r17;
1462; CHECK-NEXT:    ret;
1463entry:
1464  %t1 = load <4 x i8>, ptr %a, align 4
1465  %t2 = load <4 x i8>, ptr %b, align 4
1466  %t5 = icmp ugt <4 x i8> %t1, %t2
1467  %t6 = sext <4 x i1> %t5 to <4 x i8>
1468  store <4 x i8> %t6, ptr %c, align 4
1469  ret void
1470}
1471
;; Attribute group referenced as `#0` by the functions in this file: nounwind.
1472attributes #0 = { nounwind }
1473