xref: /llvm-project/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll (revision 310e79875752886a7713911e2a1ec14bc75bd4b3)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; ## Support i16x2 instructions
3; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80        \
4; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
5; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes COMMON,I16x2 %s
6; RUN: %if ptxas %{                                                           \
7; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90                    \
8; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
9; RUN:   | %ptxas-verify -arch=sm_90                                          \
10; RUN: %}
11; ## No support for i16x2 instructions
12; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53                      \
13; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
14; RUN: | FileCheck -allow-deprecated-dag-overlap -check-prefixes COMMON,NO-I16x2 %s
15; RUN: %if ptxas %{                                                           \
16; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53                    \
17; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
18; RUN:   | %ptxas-verify -arch=sm_53                                          \
19; RUN: %}
20
21target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
22
; Returns the constant vector <1, 2>. Both llc configurations are expected to
; materialize it as one packed 32-bit immediate, 131073 (0x00020001, i.e.
; element 1 in the upper 16 bits and element 0 in the lower 16 bits).
23define <2 x i16> @test_ret_const() #0 {
24; COMMON-LABEL: test_ret_const(
25; COMMON:       {
26; COMMON-NEXT:    .reg .b32 %r<2>;
27; COMMON-EMPTY:
28; COMMON-NEXT:  // %bb.0:
29; COMMON-NEXT:    mov.b32 %r1, 131073;
30; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
31; COMMON-NEXT:    ret;
32  ret <2 x i16> <i16 1, i16 2>
33}
34
; Extracts element 0 with a constant index. The expected PTX unpacks the 32-bit
; register keeping only the first half (the second goes to a scratch "tmp"
; register) and zero-extends it to 32 bits for the return value.
35define i16 @test_extract_0(<2 x i16> %a) #0 {
36; COMMON-LABEL: test_extract_0(
37; COMMON:       {
38; COMMON-NEXT:    .reg .b16 %rs<2>;
39; COMMON-NEXT:    .reg .b32 %r<3>;
40; COMMON-EMPTY:
41; COMMON-NEXT:  // %bb.0:
42; COMMON-NEXT:    ld.param.u32 %r1, [test_extract_0_param_0];
43; COMMON-NEXT:    { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
44; COMMON-NEXT:    cvt.u32.u16 %r2, %rs1;
45; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
46; COMMON-NEXT:    ret;
47  %e = extractelement <2 x i16> %a, i32 0
48  ret i16 %e
49}
50
; Extracts element 1 with a constant index. Mirror image of test_extract_0:
; the expected unpack keeps the second half and discards the first into "tmp".
51define i16 @test_extract_1(<2 x i16> %a) #0 {
52; COMMON-LABEL: test_extract_1(
53; COMMON:       {
54; COMMON-NEXT:    .reg .b16 %rs<2>;
55; COMMON-NEXT:    .reg .b32 %r<3>;
56; COMMON-EMPTY:
57; COMMON-NEXT:  // %bb.0:
58; COMMON-NEXT:    ld.param.u32 %r1, [test_extract_1_param_0];
59; COMMON-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; }
60; COMMON-NEXT:    cvt.u32.u16 %r2, %rs1;
61; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
62; COMMON-NEXT:    ret;
63  %e = extractelement <2 x i16> %a, i32 1
64  ret i16 %e
65}
66
; Extracts an element with a run-time i64 index. The expected PTX unpacks both
; halves, compares the index against 0, and uses selp.b16 to pick the first
; half when the index is 0 and the second half otherwise.
67define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 {
68; COMMON-LABEL: test_extract_i(
69; COMMON:       {
70; COMMON-NEXT:    .reg .pred %p<2>;
71; COMMON-NEXT:    .reg .b16 %rs<4>;
72; COMMON-NEXT:    .reg .b32 %r<3>;
73; COMMON-NEXT:    .reg .b64 %rd<2>;
74; COMMON-EMPTY:
75; COMMON-NEXT:  // %bb.0:
76; COMMON-NEXT:    ld.param.u64 %rd1, [test_extract_i_param_1];
77; COMMON-NEXT:    ld.param.u32 %r1, [test_extract_i_param_0];
78; COMMON-NEXT:    setp.eq.s64 %p1, %rd1, 0;
79; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
80; COMMON-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
81; COMMON-NEXT:    cvt.u32.u16 %r2, %rs3;
82; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
83; COMMON-NEXT:    ret;
84  %e = extractelement <2 x i16> %a, i64 %idx
85  ret i16 %e
86}
87
; Vector add of two <2 x i16> parameters. The sm_90/ptx80 configuration is
; expected to emit a single packed add.s16x2; the sm_53 configuration is
; expected to unpack both operands, add each half with add.s16, and repack.
88define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 {
89; I16x2-LABEL: test_add(
90; I16x2:       {
91; I16x2-NEXT:    .reg .b32 %r<4>;
92; I16x2-EMPTY:
93; I16x2-NEXT:  // %bb.0:
94; I16x2-NEXT:    ld.param.u32 %r2, [test_add_param_1];
95; I16x2-NEXT:    ld.param.u32 %r1, [test_add_param_0];
96; I16x2-NEXT:    add.s16x2 %r3, %r1, %r2;
97; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
98; I16x2-NEXT:    ret;
99;
100; NO-I16x2-LABEL: test_add(
101; NO-I16x2:       {
102; NO-I16x2-NEXT:    .reg .b16 %rs<7>;
103; NO-I16x2-NEXT:    .reg .b32 %r<4>;
104; NO-I16x2-EMPTY:
105; NO-I16x2-NEXT:  // %bb.0:
106; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_add_param_1];
107; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_add_param_0];
108; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
109; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
110; NO-I16x2-NEXT:    add.s16 %rs5, %rs4, %rs2;
111; NO-I16x2-NEXT:    add.s16 %rs6, %rs3, %rs1;
112; NO-I16x2-NEXT:    mov.b32 %r3, {%rs6, %rs5};
113; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
114; NO-I16x2-NEXT:    ret;
115  %r = add <2 x i16> %a, %b
116  ret <2 x i16> %r
117}
118
119; Check that we can lower add with immediate arguments.
; Add with the constant <1, 2> as the left-hand operand. The sm_90 expectation
; first moves the packed constant 131073 into a register and then uses
; add.s16x2; the sm_53 expectation adds the immediates 1 and 2 to the
; individual halves with add.s16.
120define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 {
121; I16x2-LABEL: test_add_imm_0(
122; I16x2:       {
123; I16x2-NEXT:    .reg .b32 %r<4>;
124; I16x2-EMPTY:
125; I16x2-NEXT:  // %bb.0:
126; I16x2-NEXT:    ld.param.u32 %r1, [test_add_imm_0_param_0];
127; I16x2-NEXT:    mov.b32 %r2, 131073;
128; I16x2-NEXT:    add.s16x2 %r3, %r1, %r2;
129; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
130; I16x2-NEXT:    ret;
131;
132; NO-I16x2-LABEL: test_add_imm_0(
133; NO-I16x2:       {
134; NO-I16x2-NEXT:    .reg .b16 %rs<5>;
135; NO-I16x2-NEXT:    .reg .b32 %r<3>;
136; NO-I16x2-EMPTY:
137; NO-I16x2-NEXT:  // %bb.0:
138; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_add_imm_0_param_0];
139; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
140; NO-I16x2-NEXT:    add.s16 %rs3, %rs2, 2;
141; NO-I16x2-NEXT:    add.s16 %rs4, %rs1, 1;
142; NO-I16x2-NEXT:    mov.b32 %r2, {%rs4, %rs3};
143; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r2;
144; NO-I16x2-NEXT:    ret;
145  %r = add <2 x i16> <i16 1, i16 2>, %a
146  ret <2 x i16> %r
147}
148
; Same as test_add_imm_0 but with the constant <1, 2> as the right-hand
; operand; the expected PTX has the same shape in both configurations.
149define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 {
150; I16x2-LABEL: test_add_imm_1(
151; I16x2:       {
152; I16x2-NEXT:    .reg .b32 %r<4>;
153; I16x2-EMPTY:
154; I16x2-NEXT:  // %bb.0:
155; I16x2-NEXT:    ld.param.u32 %r1, [test_add_imm_1_param_0];
156; I16x2-NEXT:    mov.b32 %r2, 131073;
157; I16x2-NEXT:    add.s16x2 %r3, %r1, %r2;
158; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
159; I16x2-NEXT:    ret;
160;
161; NO-I16x2-LABEL: test_add_imm_1(
162; NO-I16x2:       {
163; NO-I16x2-NEXT:    .reg .b16 %rs<5>;
164; NO-I16x2-NEXT:    .reg .b32 %r<3>;
165; NO-I16x2-EMPTY:
166; NO-I16x2-NEXT:  // %bb.0:
167; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_add_imm_1_param_0];
168; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
169; NO-I16x2-NEXT:    add.s16 %rs3, %rs2, 2;
170; NO-I16x2-NEXT:    add.s16 %rs4, %rs1, 1;
171; NO-I16x2-NEXT:    mov.b32 %r2, {%rs4, %rs3};
172; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r2;
173; NO-I16x2-NEXT:    ret;
174  %r = add <2 x i16> %a, <i16 1, i16 2>
175  ret <2 x i16> %r
176}
177
; Vector subtract. Unlike test_add, the expectation is shared by both
; configurations: unpack, two scalar sub.s16 instructions, repack. No packed
; subtract is expected even on sm_90.
178define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 {
179; COMMON-LABEL: test_sub(
180; COMMON:       {
181; COMMON-NEXT:    .reg .b16 %rs<7>;
182; COMMON-NEXT:    .reg .b32 %r<4>;
183; COMMON-EMPTY:
184; COMMON-NEXT:  // %bb.0:
185; COMMON-NEXT:    ld.param.u32 %r2, [test_sub_param_1];
186; COMMON-NEXT:    ld.param.u32 %r1, [test_sub_param_0];
187; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
188; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
189; COMMON-NEXT:    sub.s16 %rs5, %rs4, %rs2;
190; COMMON-NEXT:    sub.s16 %rs6, %rs3, %rs1;
191; COMMON-NEXT:    mov.b32 %r3, {%rs6, %rs5};
192; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
193; COMMON-NEXT:    ret;
194  %r = sub <2 x i16> %a, %b
195  ret <2 x i16> %r
196}
197
; Signed maximum written as icmp sgt + select. The sm_90 expectation is a
; single max.s16x2; the sm_53 expectation is two scalar max.s16 instructions
; between an unpack and a repack.
198define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 {
199; I16x2-LABEL: test_smax(
200; I16x2:       {
201; I16x2-NEXT:    .reg .b32 %r<4>;
202; I16x2-EMPTY:
203; I16x2-NEXT:  // %bb.0:
204; I16x2-NEXT:    ld.param.u32 %r2, [test_smax_param_1];
205; I16x2-NEXT:    ld.param.u32 %r1, [test_smax_param_0];
206; I16x2-NEXT:    max.s16x2 %r3, %r1, %r2;
207; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
208; I16x2-NEXT:    ret;
209;
210; NO-I16x2-LABEL: test_smax(
211; NO-I16x2:       {
212; NO-I16x2-NEXT:    .reg .b16 %rs<7>;
213; NO-I16x2-NEXT:    .reg .b32 %r<4>;
214; NO-I16x2-EMPTY:
215; NO-I16x2-NEXT:  // %bb.0:
216; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_smax_param_1];
217; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_smax_param_0];
218; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
219; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
220; NO-I16x2-NEXT:    max.s16 %rs5, %rs4, %rs2;
221; NO-I16x2-NEXT:    max.s16 %rs6, %rs3, %rs1;
222; NO-I16x2-NEXT:    mov.b32 %r3, {%rs6, %rs5};
223; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
224; NO-I16x2-NEXT:    ret;
225  %cmp = icmp sgt <2 x i16> %a, %b
226  %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
227  ret <2 x i16> %r
228}
229
; Unsigned maximum written as icmp ugt + select. The sm_90 expectation is a
; single max.u16x2; the sm_53 expectation is two scalar max.u16 instructions
; between an unpack and a repack.
230define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 {
231; I16x2-LABEL: test_umax(
232; I16x2:       {
233; I16x2-NEXT:    .reg .b32 %r<4>;
234; I16x2-EMPTY:
235; I16x2-NEXT:  // %bb.0:
236; I16x2-NEXT:    ld.param.u32 %r2, [test_umax_param_1];
237; I16x2-NEXT:    ld.param.u32 %r1, [test_umax_param_0];
238; I16x2-NEXT:    max.u16x2 %r3, %r1, %r2;
239; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
240; I16x2-NEXT:    ret;
241;
242; NO-I16x2-LABEL: test_umax(
243; NO-I16x2:       {
244; NO-I16x2-NEXT:    .reg .b16 %rs<7>;
245; NO-I16x2-NEXT:    .reg .b32 %r<4>;
246; NO-I16x2-EMPTY:
247; NO-I16x2-NEXT:  // %bb.0:
248; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_umax_param_1];
249; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_umax_param_0];
250; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
251; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
252; NO-I16x2-NEXT:    max.u16 %rs5, %rs4, %rs2;
253; NO-I16x2-NEXT:    max.u16 %rs6, %rs3, %rs1;
254; NO-I16x2-NEXT:    mov.b32 %r3, {%rs6, %rs5};
255; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
256; NO-I16x2-NEXT:    ret;
257  %cmp = icmp ugt <2 x i16> %a, %b
258  %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
259  ret <2 x i16> %r
260}
261
; Signed minimum written as icmp sle + select. The sm_90 expectation is a
; single min.s16x2; the sm_53 expectation is two scalar min.s16 instructions
; between an unpack and a repack.
262define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 {
263; I16x2-LABEL: test_smin(
264; I16x2:       {
265; I16x2-NEXT:    .reg .b32 %r<4>;
266; I16x2-EMPTY:
267; I16x2-NEXT:  // %bb.0:
268; I16x2-NEXT:    ld.param.u32 %r2, [test_smin_param_1];
269; I16x2-NEXT:    ld.param.u32 %r1, [test_smin_param_0];
270; I16x2-NEXT:    min.s16x2 %r3, %r1, %r2;
271; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
272; I16x2-NEXT:    ret;
273;
274; NO-I16x2-LABEL: test_smin(
275; NO-I16x2:       {
276; NO-I16x2-NEXT:    .reg .b16 %rs<7>;
277; NO-I16x2-NEXT:    .reg .b32 %r<4>;
278; NO-I16x2-EMPTY:
279; NO-I16x2-NEXT:  // %bb.0:
280; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_smin_param_1];
281; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_smin_param_0];
282; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
283; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
284; NO-I16x2-NEXT:    min.s16 %rs5, %rs4, %rs2;
285; NO-I16x2-NEXT:    min.s16 %rs6, %rs3, %rs1;
286; NO-I16x2-NEXT:    mov.b32 %r3, {%rs6, %rs5};
287; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
288; NO-I16x2-NEXT:    ret;
289  %cmp = icmp sle <2 x i16> %a, %b
290  %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
291  ret <2 x i16> %r
292}
293
; Unsigned minimum written as icmp ule + select. The sm_90 expectation is a
; single min.u16x2; the sm_53 expectation is two scalar min.u16 instructions
; between an unpack and a repack.
294define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 {
295; I16x2-LABEL: test_umin(
296; I16x2:       {
297; I16x2-NEXT:    .reg .b32 %r<4>;
298; I16x2-EMPTY:
299; I16x2-NEXT:  // %bb.0:
300; I16x2-NEXT:    ld.param.u32 %r2, [test_umin_param_1];
301; I16x2-NEXT:    ld.param.u32 %r1, [test_umin_param_0];
302; I16x2-NEXT:    min.u16x2 %r3, %r1, %r2;
303; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
304; I16x2-NEXT:    ret;
305;
306; NO-I16x2-LABEL: test_umin(
307; NO-I16x2:       {
308; NO-I16x2-NEXT:    .reg .b16 %rs<7>;
309; NO-I16x2-NEXT:    .reg .b32 %r<4>;
310; NO-I16x2-EMPTY:
311; NO-I16x2-NEXT:  // %bb.0:
312; NO-I16x2-NEXT:    ld.param.u32 %r2, [test_umin_param_1];
313; NO-I16x2-NEXT:    ld.param.u32 %r1, [test_umin_param_0];
314; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
315; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
316; NO-I16x2-NEXT:    min.u16 %rs5, %rs4, %rs2;
317; NO-I16x2-NEXT:    min.u16 %rs6, %rs3, %rs1;
318; NO-I16x2-NEXT:    mov.b32 %r3, {%rs6, %rs5};
319; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
320; NO-I16x2-NEXT:    ret;
321  %cmp = icmp ule <2 x i16> %a, %b
322  %r = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
323  ret <2 x i16> %r
324}
325
; Vector multiply. The expectation is shared by both configurations: unpack,
; two scalar mul.lo.s16 instructions, repack.
326define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 {
327; COMMON-LABEL: test_mul(
328; COMMON:       {
329; COMMON-NEXT:    .reg .b16 %rs<7>;
330; COMMON-NEXT:    .reg .b32 %r<4>;
331; COMMON-EMPTY:
332; COMMON-NEXT:  // %bb.0:
333; COMMON-NEXT:    ld.param.u32 %r2, [test_mul_param_1];
334; COMMON-NEXT:    ld.param.u32 %r1, [test_mul_param_0];
335; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
336; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
337; COMMON-NEXT:    mul.lo.s16 %rs5, %rs4, %rs2;
338; COMMON-NEXT:    mul.lo.s16 %rs6, %rs3, %rs1;
339; COMMON-NEXT:    mov.b32 %r3, {%rs6, %rs5};
340; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
341; COMMON-NEXT:    ret;
342  %r = mul <2 x i16> %a, %b
343  ret <2 x i16> %r
344}
345
346;; Logical ops are available on all GPUs as regular 32-bit logical ops
; Bitwise OR of two <2 x i16> parameters; both configurations are expected to
; emit one or.b32 on the packed registers with no unpacking.
347define <2 x i16> @test_or(<2 x i16> %a, <2 x i16> %b) #0 {
348; COMMON-LABEL: test_or(
349; COMMON:       {
350; COMMON-NEXT:    .reg .b32 %r<4>;
351; COMMON-EMPTY:
352; COMMON-NEXT:  // %bb.0:
353; COMMON-NEXT:    ld.param.u32 %r2, [test_or_param_1];
354; COMMON-NEXT:    ld.param.u32 %r1, [test_or_param_0];
355; COMMON-NEXT:    or.b32 %r3, %r1, %r2;
356; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
357; COMMON-NEXT:    ret;
358  %r = or <2 x i16> %a, %b
359  ret <2 x i16> %r
360}
361
362; Ops that operate on computed arguments go through a different lowering path
363; compared to the ones that operate on loaded data, so we test them separately.
; OR of two vectors built in-function with insertelement from the scalar %a:
; <%a, 0> and <%a, 5>. The expected PTX packs each vector from 16-bit registers
; with mov.b32 and then combines them with a single or.b32.
364define <2 x i16> @test_or_computed(i16 %a) {
365; COMMON-LABEL: test_or_computed(
366; COMMON:       {
367; COMMON-NEXT:    .reg .b16 %rs<4>;
368; COMMON-NEXT:    .reg .b32 %r<4>;
369; COMMON-EMPTY:
370; COMMON-NEXT:  // %bb.0:
371; COMMON-NEXT:    ld.param.u16 %rs1, [test_or_computed_param_0];
372; COMMON-NEXT:    mov.b16 %rs2, 0;
373; COMMON-NEXT:    mov.b32 %r1, {%rs1, %rs2};
374; COMMON-NEXT:    mov.b16 %rs3, 5;
375; COMMON-NEXT:    mov.b32 %r2, {%rs1, %rs3};
376; COMMON-NEXT:    or.b32 %r3, %r2, %r1;
377; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
378; COMMON-NEXT:    ret;
379  %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
380  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
381  %r = or <2 x i16> %ins.1, %ins.0
382  ret <2 x i16> %r
383}
384
385; Check that we can lower or with immediate arguments.
; OR with the constant <1, 2> as the left-hand operand; the expected or.b32
; takes the packed constant 131073 (0x00020001) directly as an immediate.
386define <2 x i16> @test_or_imm_0(<2 x i16> %a) #0 {
387; COMMON-LABEL: test_or_imm_0(
388; COMMON:       {
389; COMMON-NEXT:    .reg .b32 %r<3>;
390; COMMON-EMPTY:
391; COMMON-NEXT:  // %bb.0:
392; COMMON-NEXT:    ld.param.u32 %r1, [test_or_imm_0_param_0];
393; COMMON-NEXT:    or.b32 %r2, %r1, 131073;
394; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
395; COMMON-NEXT:    ret;
396  %r = or <2 x i16> <i16 1, i16 2>, %a
397  ret <2 x i16> %r
398}
399
; OR with the constant <1, 2> as the right-hand operand; the expected PTX is
; the same or.b32 with immediate 131073 as in test_or_imm_0.
400define <2 x i16> @test_or_imm_1(<2 x i16> %a) #0 {
401; COMMON-LABEL: test_or_imm_1(
402; COMMON:       {
403; COMMON-NEXT:    .reg .b32 %r<3>;
404; COMMON-EMPTY:
405; COMMON-NEXT:  // %bb.0:
406; COMMON-NEXT:    ld.param.u32 %r1, [test_or_imm_1_param_0];
407; COMMON-NEXT:    or.b32 %r2, %r1, 131073;
408; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
409; COMMON-NEXT:    ret;
410  %r = or <2 x i16> %a, <i16 1, i16 2>
411  ret <2 x i16> %r
412}
413
; Bitwise XOR of two <2 x i16> parameters; both configurations are expected to
; emit one xor.b32 on the packed registers with no unpacking.
414define <2 x i16> @test_xor(<2 x i16> %a, <2 x i16> %b) #0 {
415; COMMON-LABEL: test_xor(
416; COMMON:       {
417; COMMON-NEXT:    .reg .b32 %r<4>;
418; COMMON-EMPTY:
419; COMMON-NEXT:  // %bb.0:
420; COMMON-NEXT:    ld.param.u32 %r2, [test_xor_param_1];
421; COMMON-NEXT:    ld.param.u32 %r1, [test_xor_param_0];
422; COMMON-NEXT:    xor.b32 %r3, %r1, %r2;
423; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
424; COMMON-NEXT:    ret;
425  %r = xor <2 x i16> %a, %b
426  ret <2 x i16> %r
427}
428
; XOR of two vectors built in-function with insertelement from the scalar %a:
; <%a, 0> and <%a, 5>. The expected PTX packs each vector from 16-bit registers
; with mov.b32 and then combines them with a single xor.b32.
429define <2 x i16> @test_xor_computed(i16 %a) {
430; COMMON-LABEL: test_xor_computed(
431; COMMON:       {
432; COMMON-NEXT:    .reg .b16 %rs<4>;
433; COMMON-NEXT:    .reg .b32 %r<4>;
434; COMMON-EMPTY:
435; COMMON-NEXT:  // %bb.0:
436; COMMON-NEXT:    ld.param.u16 %rs1, [test_xor_computed_param_0];
437; COMMON-NEXT:    mov.b16 %rs2, 0;
438; COMMON-NEXT:    mov.b32 %r1, {%rs1, %rs2};
439; COMMON-NEXT:    mov.b16 %rs3, 5;
440; COMMON-NEXT:    mov.b32 %r2, {%rs1, %rs3};
441; COMMON-NEXT:    xor.b32 %r3, %r2, %r1;
442; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
443; COMMON-NEXT:    ret;
444  %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
445  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
446  %r = xor <2 x i16> %ins.1, %ins.0
447  ret <2 x i16> %r
448}
449
450; Check that we can lower xor with immediate arguments.
; XOR with the constant <1, 2> as the left-hand operand; the expected xor.b32
; takes the packed constant 131073 (0x00020001) directly as an immediate.
451define <2 x i16> @test_xor_imm_0(<2 x i16> %a) #0 {
452; COMMON-LABEL: test_xor_imm_0(
453; COMMON:       {
454; COMMON-NEXT:    .reg .b32 %r<3>;
455; COMMON-EMPTY:
456; COMMON-NEXT:  // %bb.0:
457; COMMON-NEXT:    ld.param.u32 %r1, [test_xor_imm_0_param_0];
458; COMMON-NEXT:    xor.b32 %r2, %r1, 131073;
459; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
460; COMMON-NEXT:    ret;
461  %r = xor <2 x i16> <i16 1, i16 2>, %a
462  ret <2 x i16> %r
463}
464
; XOR with the constant <1, 2> as the right-hand operand; the expected PTX is
; the same xor.b32 with immediate 131073 as in test_xor_imm_0.
465define <2 x i16> @test_xor_imm_1(<2 x i16> %a) #0 {
466; COMMON-LABEL: test_xor_imm_1(
467; COMMON:       {
468; COMMON-NEXT:    .reg .b32 %r<3>;
469; COMMON-EMPTY:
470; COMMON-NEXT:  // %bb.0:
471; COMMON-NEXT:    ld.param.u32 %r1, [test_xor_imm_1_param_0];
472; COMMON-NEXT:    xor.b32 %r2, %r1, 131073;
473; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
474; COMMON-NEXT:    ret;
475  %r = xor <2 x i16> %a, <i16 1, i16 2>
476  ret <2 x i16> %r
477}
478
; Bitwise AND of two <2 x i16> parameters; both configurations are expected to
; emit one and.b32 on the packed registers with no unpacking.
479define <2 x i16> @test_and(<2 x i16> %a, <2 x i16> %b) #0 {
480; COMMON-LABEL: test_and(
481; COMMON:       {
482; COMMON-NEXT:    .reg .b32 %r<4>;
483; COMMON-EMPTY:
484; COMMON-NEXT:  // %bb.0:
485; COMMON-NEXT:    ld.param.u32 %r2, [test_and_param_1];
486; COMMON-NEXT:    ld.param.u32 %r1, [test_and_param_0];
487; COMMON-NEXT:    and.b32 %r3, %r1, %r2;
488; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
489; COMMON-NEXT:    ret;
490  %r = and <2 x i16> %a, %b
491  ret <2 x i16> %r
492}
493
494; Ops that operate on computed arguments go through a different lowering path
495; compared to the ones that operate on loaded data, so we test them separately.
; AND of two vectors built in-function with insertelement from the scalar %a:
; <%a, 0> and <%a, 5>. The expected PTX packs each vector from 16-bit registers
; with mov.b32 and then combines them with a single and.b32.
496define <2 x i16> @test_and_computed(i16 %a) {
497; COMMON-LABEL: test_and_computed(
498; COMMON:       {
499; COMMON-NEXT:    .reg .b16 %rs<4>;
500; COMMON-NEXT:    .reg .b32 %r<4>;
501; COMMON-EMPTY:
502; COMMON-NEXT:  // %bb.0:
503; COMMON-NEXT:    ld.param.u16 %rs1, [test_and_computed_param_0];
504; COMMON-NEXT:    mov.b16 %rs2, 0;
505; COMMON-NEXT:    mov.b32 %r1, {%rs1, %rs2};
506; COMMON-NEXT:    mov.b16 %rs3, 5;
507; COMMON-NEXT:    mov.b32 %r2, {%rs1, %rs3};
508; COMMON-NEXT:    and.b32 %r3, %r2, %r1;
509; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
510; COMMON-NEXT:    ret;
511  %ins.0 = insertelement <2 x i16> zeroinitializer, i16 %a, i32 0
512  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
513  %r = and <2 x i16> %ins.1, %ins.0
514  ret <2 x i16> %r
515}
516
517; Check that we can lower and with immediate arguments.
; AND with the constant <1, 2> as the left-hand operand; the expected and.b32
; takes the packed constant 131073 (0x00020001) directly as an immediate.
518define <2 x i16> @test_and_imm_0(<2 x i16> %a) #0 {
519; COMMON-LABEL: test_and_imm_0(
520; COMMON:       {
521; COMMON-NEXT:    .reg .b32 %r<3>;
522; COMMON-EMPTY:
523; COMMON-NEXT:  // %bb.0:
524; COMMON-NEXT:    ld.param.u32 %r1, [test_and_imm_0_param_0];
525; COMMON-NEXT:    and.b32 %r2, %r1, 131073;
526; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
527; COMMON-NEXT:    ret;
528  %r = and <2 x i16> <i16 1, i16 2>, %a
529  ret <2 x i16> %r
530}
531
; AND with the constant <1, 2> as the right-hand operand; the expected PTX is
; the same and.b32 with immediate 131073 as in test_and_imm_0.
532define <2 x i16> @test_and_imm_1(<2 x i16> %a) #0 {
533; COMMON-LABEL: test_and_imm_1(
534; COMMON:       {
535; COMMON-NEXT:    .reg .b32 %r<3>;
536; COMMON-EMPTY:
537; COMMON-NEXT:  // %bb.0:
538; COMMON-NEXT:    ld.param.u32 %r1, [test_and_imm_1_param_0];
539; COMMON-NEXT:    and.b32 %r2, %r1, 131073;
540; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
541; COMMON-NEXT:    ret;
542  %r = and <2 x i16> %a, <i16 1, i16 2>
543  ret <2 x i16> %r
544}
545
; Copies a <2 x i16> from %a to %b. The expected PTX moves the whole vector
; with one 32-bit load (ld.u32) and one 32-bit store (st.u32).
546define void @test_ldst_v2i16(ptr %a, ptr %b) {
547; COMMON-LABEL: test_ldst_v2i16(
548; COMMON:       {
549; COMMON-NEXT:    .reg .b32 %r<2>;
550; COMMON-NEXT:    .reg .b64 %rd<3>;
551; COMMON-EMPTY:
552; COMMON-NEXT:  // %bb.0:
553; COMMON-NEXT:    ld.param.u64 %rd2, [test_ldst_v2i16_param_1];
554; COMMON-NEXT:    ld.param.u64 %rd1, [test_ldst_v2i16_param_0];
555; COMMON-NEXT:    ld.u32 %r1, [%rd1];
556; COMMON-NEXT:    st.u32 [%rd2], %r1;
557; COMMON-NEXT:    ret;
558  %t1 = load <2 x i16>, ptr %a
559  store <2 x i16> %t1, ptr %b, align 16
560  ret void
561}
562
563; -- v3 is inconvenient to capture because it's lowered as ld.b64 plus a fair
564;    number of bit-shifting instructions that may change at LLVM's whim.
565;    So we only verify that we issue the correct number of writes at the
566;    correct offsets, but not the values we write.
; Copies a <3 x i16> from %a to %b. The expected PTX reads 64 bits at once,
; then writes the low 32 bits at offset 0 and, after a right shift by 32, a
; 16-bit value at offset 4.
567define void @test_ldst_v3i16(ptr %a, ptr %b) {
568; COMMON-LABEL: test_ldst_v3i16(
569; COMMON:       {
570; COMMON-NEXT:    .reg .b64 %rd<5>;
571; COMMON-EMPTY:
572; COMMON-NEXT:  // %bb.0:
573; COMMON-NEXT:    ld.param.u64 %rd2, [test_ldst_v3i16_param_1];
574; COMMON-NEXT:    ld.param.u64 %rd1, [test_ldst_v3i16_param_0];
575; COMMON-NEXT:    ld.u64 %rd3, [%rd1];
576; COMMON-NEXT:    shr.u64 %rd4, %rd3, 32;
577; COMMON-NEXT:    st.u32 [%rd2], %rd3;
578; COMMON-NEXT:    st.u16 [%rd2+4], %rd4;
579; COMMON-NEXT:    ret;
580  %t1 = load <3 x i16>, ptr %a
581  store <3 x i16> %t1, ptr %b, align 16
582  ret void
583}
584
; Copies a <4 x i16> from %a to %b. The expected PTX uses a four-element
; vector load and store of 16-bit registers (ld.v4.u16 / st.v4.u16).
585define void @test_ldst_v4i16(ptr %a, ptr %b) {
586; COMMON-LABEL: test_ldst_v4i16(
587; COMMON:       {
588; COMMON-NEXT:    .reg .b16 %rs<5>;
589; COMMON-NEXT:    .reg .b64 %rd<3>;
590; COMMON-EMPTY:
591; COMMON-NEXT:  // %bb.0:
592; COMMON-NEXT:    ld.param.u64 %rd2, [test_ldst_v4i16_param_1];
593; COMMON-NEXT:    ld.param.u64 %rd1, [test_ldst_v4i16_param_0];
594; COMMON-NEXT:    ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
595; COMMON-NEXT:    st.v4.u16 [%rd2], {%rs1, %rs2, %rs3, %rs4};
596; COMMON-NEXT:    ret;
597  %t1 = load <4 x i16>, ptr %a
598  store <4 x i16> %t1, ptr %b, align 16
599  ret void
600}
601
; Copies an <8 x i16> from %a to %b. The expected PTX uses a four-element
; vector load and store of 32-bit registers (ld.v4.b32 / st.v4.b32), i.e. the
; eight i16 lanes travel as four packed pairs.
602define void @test_ldst_v8i16(ptr %a, ptr %b) {
603; COMMON-LABEL: test_ldst_v8i16(
604; COMMON:       {
605; COMMON-NEXT:    .reg .b32 %r<5>;
606; COMMON-NEXT:    .reg .b64 %rd<3>;
607; COMMON-EMPTY:
608; COMMON-NEXT:  // %bb.0:
609; COMMON-NEXT:    ld.param.u64 %rd2, [test_ldst_v8i16_param_1];
610; COMMON-NEXT:    ld.param.u64 %rd1, [test_ldst_v8i16_param_0];
611; COMMON-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
612; COMMON-NEXT:    st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
613; COMMON-NEXT:    ret;
614  %t1 = load <8 x i16>, ptr %a
615  store <8 x i16> %t1, ptr %b, align 16
616  ret void
617}
618
; External function taking and returning <2 x i16>; it is only declared here
; and serves as the call target for the call-lowering tests in this file.
619declare <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b) #0
620
; Forwards %a and %b to test_callee in the same order and returns its result.
; The expected call sequence passes each vector in its own 4-byte, 4-aligned
; .param slot via st.param.b32 and reads the result back from retval0.
621define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 {
622; COMMON-LABEL: test_call(
623; COMMON:       {
624; COMMON-NEXT:    .reg .b32 %r<5>;
625; COMMON-EMPTY:
626; COMMON-NEXT:  // %bb.0:
627; COMMON-NEXT:    ld.param.u32 %r2, [test_call_param_1];
628; COMMON-NEXT:    ld.param.u32 %r1, [test_call_param_0];
629; COMMON-NEXT:    { // callseq 0, 0
630; COMMON-NEXT:    .param .align 4 .b8 param0[4];
631; COMMON-NEXT:    st.param.b32 [param0], %r1;
632; COMMON-NEXT:    .param .align 4 .b8 param1[4];
633; COMMON-NEXT:    st.param.b32 [param1], %r2;
634; COMMON-NEXT:    .param .align 4 .b8 retval0[4];
635; COMMON-NEXT:    call.uni (retval0),
636; COMMON-NEXT:    test_callee,
637; COMMON-NEXT:    (
638; COMMON-NEXT:    param0,
639; COMMON-NEXT:    param1
640; COMMON-NEXT:    );
641; COMMON-NEXT:    ld.param.b32 %r3, [retval0];
642; COMMON-NEXT:    } // callseq 0
643; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
644; COMMON-NEXT:    ret;
645  %r = call <2 x i16> @test_callee(<2 x i16> %a, <2 x i16> %b)
646  ret <2 x i16> %r
647}
648
; Calls test_callee with the arguments swapped (%b first, then %a). The
; expected PTX therefore stores the register loaded from incoming parameter 1
; into param0 and the one loaded from incoming parameter 0 into param1.
649define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
650; COMMON-LABEL: test_call_flipped(
651; COMMON:       {
652; COMMON-NEXT:    .reg .b32 %r<5>;
653; COMMON-EMPTY:
654; COMMON-NEXT:  // %bb.0:
655; COMMON-NEXT:    ld.param.u32 %r2, [test_call_flipped_param_1];
656; COMMON-NEXT:    ld.param.u32 %r1, [test_call_flipped_param_0];
657; COMMON-NEXT:    { // callseq 1, 0
658; COMMON-NEXT:    .param .align 4 .b8 param0[4];
659; COMMON-NEXT:    st.param.b32 [param0], %r2;
660; COMMON-NEXT:    .param .align 4 .b8 param1[4];
661; COMMON-NEXT:    st.param.b32 [param1], %r1;
662; COMMON-NEXT:    .param .align 4 .b8 retval0[4];
663; COMMON-NEXT:    call.uni (retval0),
664; COMMON-NEXT:    test_callee,
665; COMMON-NEXT:    (
666; COMMON-NEXT:    param0,
667; COMMON-NEXT:    param1
668; COMMON-NEXT:    );
669; COMMON-NEXT:    ld.param.b32 %r3, [retval0];
670; COMMON-NEXT:    } // callseq 1
671; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
672; COMMON-NEXT:    ret;
673  %r = call <2 x i16> @test_callee(<2 x i16> %b, <2 x i16> %a)
674  ret <2 x i16> %r
675}
676
; Same as test_call_flipped but the IR marks the call as `tail call`. The
; expected PTX is still a full call sequence (call.uni, result read from
; retval0, then ret), matching the non-tail variant.
677define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
678; COMMON-LABEL: test_tailcall_flipped(
679; COMMON:       {
680; COMMON-NEXT:    .reg .b32 %r<5>;
681; COMMON-EMPTY:
682; COMMON-NEXT:  // %bb.0:
683; COMMON-NEXT:    ld.param.u32 %r2, [test_tailcall_flipped_param_1];
684; COMMON-NEXT:    ld.param.u32 %r1, [test_tailcall_flipped_param_0];
685; COMMON-NEXT:    { // callseq 2, 0
686; COMMON-NEXT:    .param .align 4 .b8 param0[4];
687; COMMON-NEXT:    st.param.b32 [param0], %r2;
688; COMMON-NEXT:    .param .align 4 .b8 param1[4];
689; COMMON-NEXT:    st.param.b32 [param1], %r1;
690; COMMON-NEXT:    .param .align 4 .b8 retval0[4];
691; COMMON-NEXT:    call.uni (retval0),
692; COMMON-NEXT:    test_callee,
693; COMMON-NEXT:    (
694; COMMON-NEXT:    param0,
695; COMMON-NEXT:    param1
696; COMMON-NEXT:    );
697; COMMON-NEXT:    ld.param.b32 %r3, [retval0];
698; COMMON-NEXT:    } // callseq 2
699; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
700; COMMON-NEXT:    ret;
701  %r = tail call <2 x i16> @test_callee(<2 x i16> %b, <2 x i16> %a)
702  ret <2 x i16> %r
703}
704
; Whole-vector select on a scalar i1 condition. The expected PTX loads the
; condition as a byte, masks it with 1, turns it into a predicate, and then
; picks between the two packed registers with a single selp.b32.
705define <2 x i16> @test_select(<2 x i16> %a, <2 x i16> %b, i1 zeroext %c) #0 {
706; COMMON-LABEL: test_select(
707; COMMON:       {
708; COMMON-NEXT:    .reg .pred %p<2>;
709; COMMON-NEXT:    .reg .b16 %rs<3>;
710; COMMON-NEXT:    .reg .b32 %r<4>;
711; COMMON-EMPTY:
712; COMMON-NEXT:  // %bb.0:
713; COMMON-NEXT:    ld.param.u8 %rs1, [test_select_param_2];
714; COMMON-NEXT:    and.b16 %rs2, %rs1, 1;
715; COMMON-NEXT:    setp.eq.b16 %p1, %rs2, 1;
716; COMMON-NEXT:    ld.param.u32 %r2, [test_select_param_1];
717; COMMON-NEXT:    ld.param.u32 %r1, [test_select_param_0];
718; COMMON-NEXT:    selp.b32 %r3, %r1, %r2, %p1;
719; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
720; COMMON-NEXT:    ret;
721  %r = select i1 %c, <2 x i16> %a, <2 x i16> %b
722  ret <2 x i16> %r
723}
724
; Per-lane select between %a and %b driven by `icmp ne` of %c and %d. The
; expected PTX unpacks all four vectors, computes one setp.ne.s16 predicate
; per lane, selects each half with selp.b16, and repacks the result.
725define <2 x i16> @test_select_cc(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) #0 {
726; COMMON-LABEL: test_select_cc(
727; COMMON:       {
728; COMMON-NEXT:    .reg .pred %p<3>;
729; COMMON-NEXT:    .reg .b16 %rs<11>;
730; COMMON-NEXT:    .reg .b32 %r<6>;
731; COMMON-EMPTY:
732; COMMON-NEXT:  // %bb.0:
733; COMMON-NEXT:    ld.param.u32 %r4, [test_select_cc_param_3];
734; COMMON-NEXT:    ld.param.u32 %r3, [test_select_cc_param_2];
735; COMMON-NEXT:    ld.param.u32 %r2, [test_select_cc_param_1];
736; COMMON-NEXT:    ld.param.u32 %r1, [test_select_cc_param_0];
737; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
738; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
739; COMMON-NEXT:    setp.ne.s16 %p1, %rs3, %rs1;
740; COMMON-NEXT:    setp.ne.s16 %p2, %rs4, %rs2;
741; COMMON-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
742; COMMON-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
743; COMMON-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p2;
744; COMMON-NEXT:    selp.b16 %rs10, %rs7, %rs5, %p1;
745; COMMON-NEXT:    mov.b32 %r5, {%rs10, %rs9};
746; COMMON-NEXT:    st.param.b32 [func_retval0], %r5;
747; COMMON-NEXT:    ret;
748  %cc = icmp ne <2 x i16> %c, %d
749  %r = select <2 x i1> %cc, <2 x i16> %a, <2 x i16> %b
750  ret <2 x i16> %r
751}
752
; Mixed-width select: the condition compares <2 x i16> values (%c, %d) while
; the selected operands are <2 x i32> (%a, %b). The expected PTX computes the
; predicates with setp.ne.s16 on the unpacked halves and selects with selp.b32.
; Note that the parameter list continues after the autogenerated assertions.
753define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b,
754; COMMON-LABEL: test_select_cc_i32_i16(
755; COMMON:       {
756; COMMON-NEXT:    .reg .pred %p<3>;
757; COMMON-NEXT:    .reg .b16 %rs<5>;
758; COMMON-NEXT:    .reg .b32 %r<9>;
759; COMMON-EMPTY:
760; COMMON-NEXT:  // %bb.0:
761; COMMON-NEXT:    ld.param.v2.u32 {%r3, %r4}, [test_select_cc_i32_i16_param_1];
762; COMMON-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_select_cc_i32_i16_param_0];
763; COMMON-NEXT:    ld.param.u32 %r6, [test_select_cc_i32_i16_param_3];
764; COMMON-NEXT:    ld.param.u32 %r5, [test_select_cc_i32_i16_param_2];
765; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r6;
766; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r5;
767; COMMON-NEXT:    setp.ne.s16 %p1, %rs3, %rs1;
768; COMMON-NEXT:    setp.ne.s16 %p2, %rs4, %rs2;
769; COMMON-NEXT:    selp.b32 %r7, %r2, %r4, %p2;
770; COMMON-NEXT:    selp.b32 %r8, %r1, %r3, %p1;
771; COMMON-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
772; COMMON-NEXT:    ret;
773                                           <2 x i16> %c, <2 x i16> %d) #0 {
774  %cc = icmp ne <2 x i16> %c, %d
775  %r = select <2 x i1> %cc, <2 x i32> %a, <2 x i32> %b
776  ret <2 x i32> %r
777}
778
; Mixed-width select, the other way round: the condition compares <2 x i32>
; values (%c, %d) while the selected operands are <2 x i16> (%a, %b). The
; expected PTX computes the predicates with setp.ne.s32 and selects the
; unpacked halves with selp.b16 before repacking.
; Note that the parameter list continues after the autogenerated assertions.
779define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b,
780; COMMON-LABEL: test_select_cc_i16_i32(
781; COMMON:       {
782; COMMON-NEXT:    .reg .pred %p<3>;
783; COMMON-NEXT:    .reg .b16 %rs<7>;
784; COMMON-NEXT:    .reg .b32 %r<8>;
785; COMMON-EMPTY:
786; COMMON-NEXT:  // %bb.0:
787; COMMON-NEXT:    ld.param.v2.u32 {%r5, %r6}, [test_select_cc_i16_i32_param_3];
788; COMMON-NEXT:    ld.param.v2.u32 {%r3, %r4}, [test_select_cc_i16_i32_param_2];
789; COMMON-NEXT:    ld.param.u32 %r2, [test_select_cc_i16_i32_param_1];
790; COMMON-NEXT:    ld.param.u32 %r1, [test_select_cc_i16_i32_param_0];
791; COMMON-NEXT:    setp.ne.s32 %p1, %r3, %r5;
792; COMMON-NEXT:    setp.ne.s32 %p2, %r4, %r6;
793; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
794; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
795; COMMON-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p2;
796; COMMON-NEXT:    selp.b16 %rs6, %rs3, %rs1, %p1;
797; COMMON-NEXT:    mov.b32 %r7, {%rs6, %rs5};
798; COMMON-NEXT:    st.param.b32 [func_retval0], %r7;
799; COMMON-NEXT:    ret;
800                                          <2 x i32> %c, <2 x i32> %d) #0 {
801  %cc = icmp ne <2 x i32> %c, %d
802  %r = select <2 x i1> %cc, <2 x i16> %a, <2 x i16> %b
803  ret <2 x i16> %r
804}
805
806
; Truncates <2 x i32> to <2 x i16>. Both configurations are expected to merge
; the two 32-bit inputs into one packed register with a single prmt.b32 using
; selector 0x5410.
807define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 {
808; COMMON-LABEL: test_trunc_2xi32(
809; COMMON:       {
810; COMMON-NEXT:    .reg .b32 %r<4>;
811; COMMON-EMPTY:
812; COMMON-NEXT:  // %bb.0:
813; COMMON-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_param_0];
814; COMMON-NEXT:    prmt.b32 %r3, %r1, %r2, 0x5410U;
815; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
816; COMMON-NEXT:    ret;
817  %r = trunc <2 x i32> %a to <2 x i16>
818  ret <2 x i16> %r
819}
820
; Truncates <2 x i32> to <2 x i16> and uses the truncated vector twice: it is
; returned, and it is also incremented by splat(1) and stored to %p.
; The sm_90 expectation is prmt.b32 followed by add.s16x2 with the packed
; constant 65537 (0x00010001). The sm_53 expectation instead truncates each
; lane with cvt.u16.u32 and feeds those 16-bit registers both to the mov.b32
; pack for the return value and to the scalar add.s16 instructions.
821define <2 x i16> @test_trunc_2xi32_muliple_use0(<2 x i32> %a, ptr %p) #0 {
822; I16x2-LABEL: test_trunc_2xi32_muliple_use0(
823; I16x2:       {
824; I16x2-NEXT:    .reg .b32 %r<6>;
825; I16x2-NEXT:    .reg .b64 %rd<2>;
826; I16x2-EMPTY:
827; I16x2-NEXT:  // %bb.0:
828; I16x2-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use0_param_0];
829; I16x2-NEXT:    ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use0_param_1];
830; I16x2-NEXT:    prmt.b32 %r3, %r1, %r2, 0x5410U;
831; I16x2-NEXT:    mov.b32 %r4, 65537;
832; I16x2-NEXT:    add.s16x2 %r5, %r3, %r4;
833; I16x2-NEXT:    st.u32 [%rd1], %r5;
834; I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
835; I16x2-NEXT:    ret;
836;
837; NO-I16x2-LABEL: test_trunc_2xi32_muliple_use0(
838; NO-I16x2:       {
839; NO-I16x2-NEXT:    .reg .b16 %rs<5>;
840; NO-I16x2-NEXT:    .reg .b32 %r<5>;
841; NO-I16x2-NEXT:    .reg .b64 %rd<2>;
842; NO-I16x2-EMPTY:
843; NO-I16x2-NEXT:  // %bb.0:
844; NO-I16x2-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use0_param_0];
845; NO-I16x2-NEXT:    ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use0_param_1];
846; NO-I16x2-NEXT:    cvt.u16.u32 %rs1, %r2;
847; NO-I16x2-NEXT:    cvt.u16.u32 %rs2, %r1;
848; NO-I16x2-NEXT:    mov.b32 %r3, {%rs2, %rs1};
849; NO-I16x2-NEXT:    add.s16 %rs3, %rs1, 1;
850; NO-I16x2-NEXT:    add.s16 %rs4, %rs2, 1;
851; NO-I16x2-NEXT:    mov.b32 %r4, {%rs4, %rs3};
852; NO-I16x2-NEXT:    st.u32 [%rd1], %r4;
853; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r3;
854; NO-I16x2-NEXT:    ret;
855  %r = trunc <2 x i32> %a to <2 x i16>
856  ; Reuse the truncate - optimizing to PRMT when we don't have i16x2 vectors
857  ; would increase register pressure
858  %s = add <2 x i16> %r, splat (i16 1)
859  store <2 x i16> %s, ptr %p
860  ret <2 x i16> %r
861}
862
; Truncate <2 x i32> to <2 x i16> where the *source* vector %a has a second
; user: the truncate is returned while %a itself is incremented and stored
; through %p. Both llc configurations share the same expectations: prmt.b32
; for the truncate, and add.s32 applied to the originally loaded registers.
; NOTE(review): "muliple" in the function name looks like a typo for
; "multiple"; renaming would require regenerating the assertions below.
863define <2 x i16> @test_trunc_2xi32_muliple_use1(<2 x i32> %a, ptr %p) #0 {
864; COMMON-LABEL: test_trunc_2xi32_muliple_use1(
865; COMMON:       {
866; COMMON-NEXT:    .reg .b32 %r<6>;
867; COMMON-NEXT:    .reg .b64 %rd<2>;
868; COMMON-EMPTY:
869; COMMON-NEXT:  // %bb.0:
870; COMMON-NEXT:    ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use1_param_0];
871; COMMON-NEXT:    ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use1_param_1];
872; COMMON-NEXT:    prmt.b32 %r3, %r1, %r2, 0x5410U;
873; COMMON-NEXT:    add.s32 %r4, %r2, 1;
874; COMMON-NEXT:    add.s32 %r5, %r1, 1;
875; COMMON-NEXT:    st.v2.u32 [%rd1], {%r5, %r4};
876; COMMON-NEXT:    st.param.b32 [func_retval0], %r3;
877; COMMON-NEXT:    ret;
878  %r = trunc <2 x i32> %a to <2 x i16>
879  ; Reuse the original value - optimizing to PRMT does not increase register
880  ; pressure
881  %s = add <2 x i32> %a, splat (i32 1)
882  store <2 x i32> %s, ptr %p
883  ret <2 x i16> %r
884}
885
; Truncate <2 x i64> to <2 x i16>. The shared expectations are a per-lane
; cvt.u16.u64 of each loaded 64-bit value followed by mov.b32 packing of the
; two 16-bit results; no prmt.b32 is expected for this source width.
886define <2 x i16> @test_trunc_2xi64(<2 x i64> %a) #0 {
887; COMMON-LABEL: test_trunc_2xi64(
888; COMMON:       {
889; COMMON-NEXT:    .reg .b16 %rs<3>;
890; COMMON-NEXT:    .reg .b32 %r<2>;
891; COMMON-NEXT:    .reg .b64 %rd<3>;
892; COMMON-EMPTY:
893; COMMON-NEXT:  // %bb.0:
894; COMMON-NEXT:    ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0];
895; COMMON-NEXT:    cvt.u16.u64 %rs1, %rd2;
896; COMMON-NEXT:    cvt.u16.u64 %rs2, %rd1;
897; COMMON-NEXT:    mov.b32 %r1, {%rs2, %rs1};
898; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
899; COMMON-NEXT:    ret;
900  %r = trunc <2 x i64> %a to <2 x i16>
901  ret <2 x i16> %r
902}
903
; Zero-extend <2 x i16> to <2 x i32>. The shared expectations are: unpack the
; 32-bit parameter into two 16-bit registers with mov.b32, widen each lane
; with cvt.u32.u16, and return both lanes through one v2.b32 store.
904define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 {
905; COMMON-LABEL: test_zext_2xi32(
906; COMMON:       {
907; COMMON-NEXT:    .reg .b16 %rs<3>;
908; COMMON-NEXT:    .reg .b32 %r<4>;
909; COMMON-EMPTY:
910; COMMON-NEXT:  // %bb.0:
911; COMMON-NEXT:    ld.param.u32 %r1, [test_zext_2xi32_param_0];
912; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
913; COMMON-NEXT:    cvt.u32.u16 %r2, %rs1;
914; COMMON-NEXT:    cvt.u32.u16 %r3, %rs2;
915; COMMON-NEXT:    st.param.v2.b32 [func_retval0], {%r2, %r3};
916; COMMON-NEXT:    ret;
917  %r = zext <2 x i16> %a to <2 x i32>
918  ret <2 x i32> %r
919}
920
; Zero-extend <2 x i16> to <2 x i64>. The shared expectations are: unpack the
; 32-bit parameter into two 16-bit registers with mov.b32, widen each lane
; with cvt.u64.u16, and return both lanes through one v2.b64 store.
921define <2 x i64> @test_zext_2xi64(<2 x i16> %a) #0 {
922; COMMON-LABEL: test_zext_2xi64(
923; COMMON:       {
924; COMMON-NEXT:    .reg .b16 %rs<3>;
925; COMMON-NEXT:    .reg .b32 %r<2>;
926; COMMON-NEXT:    .reg .b64 %rd<3>;
927; COMMON-EMPTY:
928; COMMON-NEXT:  // %bb.0:
929; COMMON-NEXT:    ld.param.u32 %r1, [test_zext_2xi64_param_0];
930; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
931; COMMON-NEXT:    cvt.u64.u16 %rd1, %rs2;
932; COMMON-NEXT:    cvt.u64.u16 %rd2, %rs1;
933; COMMON-NEXT:    st.param.v2.b64 [func_retval0], {%rd2, %rd1};
934; COMMON-NEXT:    ret;
935  %r = zext <2 x i16> %a to <2 x i64>
936  ret <2 x i64> %r
937}
938
; Bitcast i32 to <2 x i16>. The shared expectations contain only a 32-bit
; parameter load and a 32-bit return store of the same register, i.e. no
; conversion or repacking instruction is expected for the cast.
939define <2 x i16> @test_bitcast_i32_to_2xi16(i32 %a) #0 {
940; COMMON-LABEL: test_bitcast_i32_to_2xi16(
941; COMMON:       {
942; COMMON-NEXT:    .reg .b32 %r<2>;
943; COMMON-EMPTY:
944; COMMON-NEXT:  // %bb.0:
945; COMMON-NEXT:    ld.param.u32 %r1, [test_bitcast_i32_to_2xi16_param_0];
946; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
947; COMMON-NEXT:    ret;
948  %r = bitcast i32 %a to <2 x i16>
949  ret <2 x i16> %r
950}
951
; Bitcast <2 x i16> to i32. The shared expectations contain only a 32-bit
; parameter load and a 32-bit return store of the same register, i.e. no
; conversion or repacking instruction is expected for the cast.
952define i32 @test_bitcast_2xi16_to_i32(<2 x i16> %a) #0 {
953; COMMON-LABEL: test_bitcast_2xi16_to_i32(
954; COMMON:       {
955; COMMON-NEXT:    .reg .b32 %r<2>;
956; COMMON-EMPTY:
957; COMMON-NEXT:  // %bb.0:
958; COMMON-NEXT:    ld.param.u32 %r1, [test_bitcast_2xi16_to_i32_param_0];
959; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
960; COMMON-NEXT:    ret;
961  %r = bitcast <2 x i16> %a to i32
962  ret i32 %r
963}
964
; Build a <2 x i16> from the scalar %a (lane 0) and the constant 5 (lane 1),
; then bitcast it to <2 x half>. The shared expectations are a mov.b16 of the
; constant, a mov.b32 packing both lanes, and a b32 return store; no extra
; instruction is expected for the bitcast itself.
; The placeholder base vector is poison rather than the deprecated undef;
; both lanes are overwritten before use, so the placeholder is never read.
965define <2 x half> @test_bitcast_2xi16_to_2xhalf(i16 %a) #0 {
966; COMMON-LABEL: test_bitcast_2xi16_to_2xhalf(
967; COMMON:       {
968; COMMON-NEXT:    .reg .b16 %rs<3>;
969; COMMON-NEXT:    .reg .b32 %r<2>;
970; COMMON-EMPTY:
971; COMMON-NEXT:  // %bb.0:
972; COMMON-NEXT:    ld.param.u16 %rs1, [test_bitcast_2xi16_to_2xhalf_param_0];
973; COMMON-NEXT:    mov.b16 %rs2, 5;
974; COMMON-NEXT:    mov.b32 %r1, {%rs1, %rs2};
975; COMMON-NEXT:    st.param.b32 [func_retval0], %r1;
976; COMMON-NEXT:    ret;
977  %ins.0 = insertelement <2 x i16> poison, i16 %a, i32 0
978  %ins.1 = insertelement <2 x i16> %ins.0, i16 5, i32 1
979  %r = bitcast <2 x i16> %ins.1 to <2 x half>
980  ret <2 x half> %r
981}
982
983
; Swap the two lanes of a <2 x i16> with shufflevector (mask <1, 0>). The
; shared expectations are a mov.b32 unpack of the parameter followed by a
; mov.b32 repack with the two 16-bit registers in the opposite order.
; The second shuffle operand is poison rather than the deprecated undef; the
; mask only selects lanes 0 and 1, which both come from %a, so that operand
; is never read.
984define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 {
985; COMMON-LABEL: test_shufflevector(
986; COMMON:       {
987; COMMON-NEXT:    .reg .b16 %rs<3>;
988; COMMON-NEXT:    .reg .b32 %r<3>;
989; COMMON-EMPTY:
990; COMMON-NEXT:  // %bb.0:
991; COMMON-NEXT:    ld.param.u32 %r1, [test_shufflevector_param_0];
992; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
993; COMMON-NEXT:    mov.b32 %r2, {%rs2, %rs1};
994; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
995; COMMON-NEXT:    ret;
996  %s = shufflevector <2 x i16> %a, <2 x i16> poison, <2 x i32> <i32 1, i32 0>
997  ret <2 x i16> %s
998}
999
; Insert the scalar %x into lane 1 of the <2 x i16> %a. The shared
; expectations are: lane 0 of %a is extracted by a brace-scoped mov.b32 whose
; second destination is a locally declared register named tmp that is not
; referenced again in the checked output, and the result is then packed from
; that lane and the loaded %x with mov.b32.
1000define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 {
1001; COMMON-LABEL: test_insertelement(
1002; COMMON:       {
1003; COMMON-NEXT:    .reg .b16 %rs<3>;
1004; COMMON-NEXT:    .reg .b32 %r<3>;
1005; COMMON-EMPTY:
1006; COMMON-NEXT:  // %bb.0:
1007; COMMON-NEXT:    ld.param.u16 %rs1, [test_insertelement_param_1];
1008; COMMON-NEXT:    ld.param.u32 %r1, [test_insertelement_param_0];
1009; COMMON-NEXT:    { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; }
1010; COMMON-NEXT:    mov.b32 %r2, {%rs2, %rs1};
1011; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
1012; COMMON-NEXT:    ret;
1013  %i = insertelement <2 x i16> %a, i16 %x, i64 1
1014  ret <2 x i16> %i
1015}
1016
; Signed float-to-int conversion of <2 x half> to <2 x i16>. The shared
; expectations are: unpack the parameter with mov.b32, convert each lane
; with cvt.rzi.s16.f16, and repack the two results with mov.b32.
1017define <2 x i16> @test_fptosi_2xhalf_to_2xi16(<2 x half> %a) #0 {
1018; COMMON-LABEL: test_fptosi_2xhalf_to_2xi16(
1019; COMMON:       {
1020; COMMON-NEXT:    .reg .b16 %rs<5>;
1021; COMMON-NEXT:    .reg .b32 %r<3>;
1022; COMMON-EMPTY:
1023; COMMON-NEXT:  // %bb.0:
1024; COMMON-NEXT:    ld.param.b32 %r1, [test_fptosi_2xhalf_to_2xi16_param_0];
1025; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
1026; COMMON-NEXT:    cvt.rzi.s16.f16 %rs3, %rs2;
1027; COMMON-NEXT:    cvt.rzi.s16.f16 %rs4, %rs1;
1028; COMMON-NEXT:    mov.b32 %r2, {%rs4, %rs3};
1029; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
1030; COMMON-NEXT:    ret;
1031  %r = fptosi <2 x half> %a to <2 x i16>
1032  ret <2 x i16> %r
1033}
1034
; Unsigned float-to-int conversion of <2 x half> to <2 x i16>. The shared
; expectations are: unpack the parameter with mov.b32, convert each lane
; with cvt.rzi.u16.f16, and repack the two results with mov.b32.
1035define <2 x i16> @test_fptoui_2xhalf_to_2xi16(<2 x half> %a) #0 {
1036; COMMON-LABEL: test_fptoui_2xhalf_to_2xi16(
1037; COMMON:       {
1038; COMMON-NEXT:    .reg .b16 %rs<5>;
1039; COMMON-NEXT:    .reg .b32 %r<3>;
1040; COMMON-EMPTY:
1041; COMMON-NEXT:  // %bb.0:
1042; COMMON-NEXT:    ld.param.b32 %r1, [test_fptoui_2xhalf_to_2xi16_param_0];
1043; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
1044; COMMON-NEXT:    cvt.rzi.u16.f16 %rs3, %rs2;
1045; COMMON-NEXT:    cvt.rzi.u16.f16 %rs4, %rs1;
1046; COMMON-NEXT:    mov.b32 %r2, {%rs4, %rs3};
1047; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
1048; COMMON-NEXT:    ret;
1049  %r = fptoui <2 x half> %a to <2 x i16>
1050  ret <2 x i16> %r
1051}
1052
1053attributes #0 = { nounwind }
1054