xref: /llvm-project/llvm/test/CodeGen/NVPTX/load-store-vectors.ll (revision 932d9c13faa3de1deca3874d3b864901aa5ec9a5)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
3; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
4
5; TODO: add i1 and <6 x i8> vector tests.
6
7; TODO: add test for vectors that exceed 128-bit length
8; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
9; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
10
11; TODO: generate PTX that preserves Concurrent Forward Progress
12;       for atomic operations to local statespace
13;       by generating atomic or volatile operations.
14
15; TODO: design exposure for atomic operations on vector types.
16
17; TODO: add weak,atomic,volatile,atomic volatile tests
18;       for .const and .param statespaces.
19
20;; generic statespace
21
22; generic
23
24; TODO: make the lowering of these weak vector ops consistent with
25;       the ones of the next tests. This test lowers to a weak PTX
26;       vector op, but the next test lowers to a weak PTX scalar op.
; Non-volatile load / add 1 / store of a <2 x i8> through a plain `ptr`
; argument. The assertions expect one ld.v2.u8 and one st.v2.u8, with
; both lanes incremented in 16-bit registers (add.s16).
27define void @generic_2xi8(ptr %a) {
28; CHECK-LABEL: generic_2xi8(
29; CHECK:       {
30; CHECK-NEXT:    .reg .b16 %rs<5>;
31; CHECK-NEXT:    .reg .b64 %rd<2>;
32; CHECK-EMPTY:
33; CHECK-NEXT:  // %bb.0:
34; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xi8_param_0];
35; CHECK-NEXT:    ld.v2.u8 {%rs1, %rs2}, [%rd1];
36; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
37; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
38; CHECK-NEXT:    st.v2.u8 [%rd1], {%rs4, %rs3};
39; CHECK-NEXT:    ret;
40  %a.load = load <2 x i8>, ptr %a
41  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
42  store <2 x i8> %a.add, ptr %a
43  ret void
44}
45
46; TODO: make the lowering of these weak vector ops consistent with
47;       the ones of the previous test. This test lowers to a weak
48;       PTX scalar op, but the prior test lowers to a weak PTX vector op.
; Non-volatile load / add 1 / store of a <4 x i8> through a plain `ptr`
; argument. The assertions expect the vector to travel as one packed
; 32-bit scalar (ld.u32 / st.u32). Each byte is extracted with bfe.u32,
; incremented as a 16-bit value, and repacked with prmt.b32.
49define void @generic_4xi8(ptr %a) {
50; CHECK-LABEL: generic_4xi8(
51; CHECK:       {
52; CHECK-NEXT:    .reg .b16 %rs<9>;
53; CHECK-NEXT:    .reg .b32 %r<13>;
54; CHECK-NEXT:    .reg .b64 %rd<2>;
55; CHECK-EMPTY:
56; CHECK-NEXT:  // %bb.0:
57; CHECK-NEXT:    ld.param.u64 %rd1, [generic_4xi8_param_0];
58; CHECK-NEXT:    ld.u32 %r1, [%rd1];
59; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
60; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
61; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
62; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
63; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
64; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
65; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
66; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
67; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
68; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
69; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
70; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
71; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
72; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
73; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
74; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
75; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
76; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
77; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
78; CHECK-NEXT:    st.u32 [%rd1], %r12;
79; CHECK-NEXT:    ret;
80  %a.load = load <4 x i8>, ptr %a
81  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
82  store <4 x i8> %a.add, ptr %a
83  ret void
84}
85
; Non-volatile load / add 1 / store of an <8 x i8> through a plain `ptr`
; argument. The assertions expect the data to move as two packed 32-bit
; words (ld.v2.b32 / st.v2.b32). Every byte of each word is extracted
; with bfe.u32, incremented in a 16-bit register, and repacked with
; prmt.b32.
86define void @generic_8xi8(ptr %a) {
87; CHECK-LABEL: generic_8xi8(
88; CHECK:       {
89; CHECK-NEXT:    .reg .b16 %rs<17>;
90; CHECK-NEXT:    .reg .b32 %r<25>;
91; CHECK-NEXT:    .reg .b64 %rd<2>;
92; CHECK-EMPTY:
93; CHECK-NEXT:  // %bb.0:
94; CHECK-NEXT:    ld.param.u64 %rd1, [generic_8xi8_param_0];
95; CHECK-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
96; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
97; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
98; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
99; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
100; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
101; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
102; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
103; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
104; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
105; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
106; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
107; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
108; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
109; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
110; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
111; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
112; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
113; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
114; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
115; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
116; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
117; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
118; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
119; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
120; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
121; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
122; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
123; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
124; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
125; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
126; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
127; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
128; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
129; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
130; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
131; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
132; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
133; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
134; CHECK-NEXT:    st.v2.b32 [%rd1], {%r24, %r13};
135; CHECK-NEXT:    ret;
136  %a.load = load <8 x i8>, ptr %a
137  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
138  store <8 x i8> %a.add, ptr %a
139  ret void
140}
141
; Non-volatile load / add 1 / store of a <16 x i8> through a plain `ptr`
; argument. The assertions expect the data to move as four packed 32-bit
; words (ld.v4.b32 / st.v4.b32). Every byte of each word is extracted
; with bfe.u32, incremented in a 16-bit register, and repacked with
; prmt.b32.
142define void @generic_16xi8(ptr %a) {
143; CHECK-LABEL: generic_16xi8(
144; CHECK:       {
145; CHECK-NEXT:    .reg .b16 %rs<33>;
146; CHECK-NEXT:    .reg .b32 %r<49>;
147; CHECK-NEXT:    .reg .b64 %rd<2>;
148; CHECK-EMPTY:
149; CHECK-NEXT:  // %bb.0:
150; CHECK-NEXT:    ld.param.u64 %rd1, [generic_16xi8_param_0];
151; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
152; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
153; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
154; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
155; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
156; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
157; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
158; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
159; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
160; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
161; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
162; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
163; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
164; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
165; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
166; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
167; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
168; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
169; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
170; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
171; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
172; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
173; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
174; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
175; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
176; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
177; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
178; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
179; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
180; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
181; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
182; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
183; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
184; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
185; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
186; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
187; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
188; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
189; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
190; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
191; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
192; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
193; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
194; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
195; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
196; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
197; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
198; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
199; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
200; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
201; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
202; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
203; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
204; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
205; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
206; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
207; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
208; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
209; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
210; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
211; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
212; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
213; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
214; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
215; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
216; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
217; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
218; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
219; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
220; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
221; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
222; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
223; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
224; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
225; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
226; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
227; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
228; CHECK-NEXT:    st.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
229; CHECK-NEXT:    ret;
230  %a.load = load <16 x i8>, ptr %a
231  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
232  store <16 x i8> %a.add, ptr %a
233  ret void
234}
235
; Non-volatile load / add 1 / store of a <2 x i16> through a plain `ptr`
; argument. The assertions expect the vector to travel as one packed
; 32-bit scalar (ld.u32 / st.u32). It is split into two 16-bit registers
; with mov.b32, each half is incremented, and the halves are repacked
; with mov.b32.
236define void @generic_2xi16(ptr %a) {
237; CHECK-LABEL: generic_2xi16(
238; CHECK:       {
239; CHECK-NEXT:    .reg .b16 %rs<5>;
240; CHECK-NEXT:    .reg .b32 %r<3>;
241; CHECK-NEXT:    .reg .b64 %rd<2>;
242; CHECK-EMPTY:
243; CHECK-NEXT:  // %bb.0:
244; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xi16_param_0];
245; CHECK-NEXT:    ld.u32 %r1, [%rd1];
246; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
247; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
248; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
249; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
250; CHECK-NEXT:    st.u32 [%rd1], %r2;
251; CHECK-NEXT:    ret;
252  %a.load = load <2 x i16>, ptr %a
253  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
254  store <2 x i16> %a.add, ptr %a
255  ret void
256}
257
; Non-volatile load / add 1 / store of a <4 x i16> through a plain `ptr`
; argument. The assertions expect one ld.v4.u16 and one st.v4.u16, with
; four independent add.s16 instructions in between.
258define void @generic_4xi16(ptr %a) {
259; CHECK-LABEL: generic_4xi16(
260; CHECK:       {
261; CHECK-NEXT:    .reg .b16 %rs<9>;
262; CHECK-NEXT:    .reg .b64 %rd<2>;
263; CHECK-EMPTY:
264; CHECK-NEXT:  // %bb.0:
265; CHECK-NEXT:    ld.param.u64 %rd1, [generic_4xi16_param_0];
266; CHECK-NEXT:    ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
267; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
268; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
269; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
270; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
271; CHECK-NEXT:    st.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
272; CHECK-NEXT:    ret;
273  %a.load = load <4 x i16>, ptr %a
274  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
275  store <4 x i16> %a.add, ptr %a
276  ret void
277}
278
; Non-volatile load / add 1 / store of an <8 x i16> through a plain `ptr`
; argument. The assertions expect the data to move as four packed 32-bit
; words (ld.v4.b32 / st.v4.b32). Each word is split into two 16-bit
; halves with mov.b32, incremented, and repacked with mov.b32.
279define void @generic_8xi16(ptr %a) {
280; CHECK-LABEL: generic_8xi16(
281; CHECK:       {
282; CHECK-NEXT:    .reg .b16 %rs<17>;
283; CHECK-NEXT:    .reg .b32 %r<9>;
284; CHECK-NEXT:    .reg .b64 %rd<2>;
285; CHECK-EMPTY:
286; CHECK-NEXT:  // %bb.0:
287; CHECK-NEXT:    ld.param.u64 %rd1, [generic_8xi16_param_0];
288; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
289; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
290; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
291; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
292; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
293; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
294; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
295; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
296; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
297; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
298; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
299; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
300; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
301; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
302; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
303; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
304; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
305; CHECK-NEXT:    st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
306; CHECK-NEXT:    ret;
307  %a.load = load <8 x i16>, ptr %a
308  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
309  store <8 x i16> %a.add, ptr %a
310  ret void
311}
312
; Non-volatile load / add 1 / store of a <2 x i32> through a plain `ptr`
; argument. The assertions expect one ld.v2.u32 and one st.v2.u32, with
; one add.s32 per lane.
313define void @generic_2xi32(ptr %a) {
314; CHECK-LABEL: generic_2xi32(
315; CHECK:       {
316; CHECK-NEXT:    .reg .b32 %r<5>;
317; CHECK-NEXT:    .reg .b64 %rd<2>;
318; CHECK-EMPTY:
319; CHECK-NEXT:  // %bb.0:
320; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xi32_param_0];
321; CHECK-NEXT:    ld.v2.u32 {%r1, %r2}, [%rd1];
322; CHECK-NEXT:    add.s32 %r3, %r2, 1;
323; CHECK-NEXT:    add.s32 %r4, %r1, 1;
324; CHECK-NEXT:    st.v2.u32 [%rd1], {%r4, %r3};
325; CHECK-NEXT:    ret;
326  %a.load = load <2 x i32>, ptr %a
327  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
328  store <2 x i32> %a.add, ptr %a
329  ret void
330}
331
; Non-volatile load / add 1 / store of a <4 x i32> through a plain `ptr`
; argument. The assertions expect one ld.v4.u32 and one st.v4.u32, with
; one add.s32 per lane.
332define void @generic_4xi32(ptr %a) {
333; CHECK-LABEL: generic_4xi32(
334; CHECK:       {
335; CHECK-NEXT:    .reg .b32 %r<9>;
336; CHECK-NEXT:    .reg .b64 %rd<2>;
337; CHECK-EMPTY:
338; CHECK-NEXT:  // %bb.0:
339; CHECK-NEXT:    ld.param.u64 %rd1, [generic_4xi32_param_0];
340; CHECK-NEXT:    ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
341; CHECK-NEXT:    add.s32 %r5, %r4, 1;
342; CHECK-NEXT:    add.s32 %r6, %r3, 1;
343; CHECK-NEXT:    add.s32 %r7, %r2, 1;
344; CHECK-NEXT:    add.s32 %r8, %r1, 1;
345; CHECK-NEXT:    st.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
346; CHECK-NEXT:    ret;
347  %a.load = load <4 x i32>, ptr %a
348  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
349  store <4 x i32> %a.add, ptr %a
350  ret void
351}
352
; Non-volatile load / add 1 / store of a <2 x i64> through a plain `ptr`
; argument. The assertions expect one ld.v2.u64 and one st.v2.u64, with
; one add.s64 per lane.
353define void @generic_2xi64(ptr %a) {
354; CHECK-LABEL: generic_2xi64(
355; CHECK:       {
356; CHECK-NEXT:    .reg .b64 %rd<6>;
357; CHECK-EMPTY:
358; CHECK-NEXT:  // %bb.0:
359; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xi64_param_0];
360; CHECK-NEXT:    ld.v2.u64 {%rd2, %rd3}, [%rd1];
361; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
362; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
363; CHECK-NEXT:    st.v2.u64 [%rd1], {%rd5, %rd4};
364; CHECK-NEXT:    ret;
365  %a.load = load <2 x i64>, ptr %a
366  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
367  store <2 x i64> %a.add, ptr %a
368  ret void
369}
370
; Non-volatile load / fadd 1.0 / store of a <2 x float> through a plain
; `ptr` argument. The assertions expect one ld.v2.f32 and one st.v2.f32,
; with one add.rn.f32 per lane using the immediate 0f3F800000 (1.0).
371define void @generic_2xfloat(ptr %a) {
372; CHECK-LABEL: generic_2xfloat(
373; CHECK:       {
374; CHECK-NEXT:    .reg .f32 %f<5>;
375; CHECK-NEXT:    .reg .b64 %rd<2>;
376; CHECK-EMPTY:
377; CHECK-NEXT:  // %bb.0:
378; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xfloat_param_0];
379; CHECK-NEXT:    ld.v2.f32 {%f1, %f2}, [%rd1];
380; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
381; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
382; CHECK-NEXT:    st.v2.f32 [%rd1], {%f4, %f3};
383; CHECK-NEXT:    ret;
384  %a.load = load <2 x float>, ptr %a
385  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
386  store <2 x float> %a.add, ptr %a
387  ret void
388}
389
; Non-volatile load / fadd 1.0 / store of a <4 x float> through a plain
; `ptr` argument. The assertions expect one ld.v4.f32 and one st.v4.f32,
; with one add.rn.f32 per lane using the immediate 0f3F800000 (1.0).
390define void @generic_4xfloat(ptr %a) {
391; CHECK-LABEL: generic_4xfloat(
392; CHECK:       {
393; CHECK-NEXT:    .reg .f32 %f<9>;
394; CHECK-NEXT:    .reg .b64 %rd<2>;
395; CHECK-EMPTY:
396; CHECK-NEXT:  // %bb.0:
397; CHECK-NEXT:    ld.param.u64 %rd1, [generic_4xfloat_param_0];
398; CHECK-NEXT:    ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
399; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
400; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
401; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
402; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
403; CHECK-NEXT:    st.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
404; CHECK-NEXT:    ret;
405  %a.load = load <4 x float>, ptr %a
406  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
407  store <4 x float> %a.add, ptr %a
408  ret void
409}
410
; Non-volatile load / fadd 1.0 / store of a <2 x double> through a plain
; `ptr` argument. The assertions expect one ld.v2.f64 and one st.v2.f64,
; with one add.rn.f64 per lane using the immediate 0d3FF0000000000000
; (1.0).
411define void @generic_2xdouble(ptr %a) {
412; CHECK-LABEL: generic_2xdouble(
413; CHECK:       {
414; CHECK-NEXT:    .reg .b64 %rd<2>;
415; CHECK-NEXT:    .reg .f64 %fd<5>;
416; CHECK-EMPTY:
417; CHECK-NEXT:  // %bb.0:
418; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xdouble_param_0];
419; CHECK-NEXT:    ld.v2.f64 {%fd1, %fd2}, [%rd1];
420; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
421; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
422; CHECK-NEXT:    st.v2.f64 [%rd1], {%fd4, %fd3};
423; CHECK-NEXT:    ret;
424  %a.load = load <2 x double>, ptr %a
425  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
426  store <2 x double> %a.add, ptr %a
427  ret void
428}
429
430; generic_volatile
431
432; TODO: volatile, atomic, and volatile atomic memory operations on vector types.
433; Currently, LLVM:
434; - does not allow atomic operations on vectors.
435; - allows volatile operations on vectors, but it is not clear what that means.
436; Both of the following semantics make sense in general, and PTX supports both:
437; - volatile/atomic/volatile atomic applies to the whole vector
438; - volatile/atomic/volatile atomic applies elementwise
439; Actions required:
440; - clarify LLVM semantics for volatile on vectors and align the NVPTX backend with those.
441;   The tests below show that the current implementation picks the semantics inconsistently:
442;   * volatile <2 x i8> lowers to "elementwise volatile"
443;   * volatile <4 x i8> lowers to "full vector volatile"
444; - provide support for vector atomics, e.g., by extending LLVM IR or via intrinsics
445; - update tests in load-store-sm70.ll as well.
446
447; TODO: make this operation consistent with the one for <4 x i8>
448; This operation lowers to an "elementwise volatile PTX operation".
; Volatile load / add 1 / volatile store of a <2 x i8> through a plain
; `ptr` argument. The assertions expect one ld.volatile.v2.u8 and one
; st.volatile.v2.u8, with both lanes incremented in 16-bit registers.
449define void @generic_volatile_2xi8(ptr %a) {
450; CHECK-LABEL: generic_volatile_2xi8(
451; CHECK:       {
452; CHECK-NEXT:    .reg .b16 %rs<5>;
453; CHECK-NEXT:    .reg .b64 %rd<2>;
454; CHECK-EMPTY:
455; CHECK-NEXT:  // %bb.0:
456; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xi8_param_0];
457; CHECK-NEXT:    ld.volatile.v2.u8 {%rs1, %rs2}, [%rd1];
458; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
459; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
460; CHECK-NEXT:    st.volatile.v2.u8 [%rd1], {%rs4, %rs3};
461; CHECK-NEXT:    ret;
462  %a.load = load volatile <2 x i8>, ptr %a
463  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
464  store volatile <2 x i8> %a.add, ptr %a
465  ret void
466}
467
468; TODO: make this operation consistent with the one for <2 x i8>
469; This operation lowers to a "full vector volatile PTX operation".
; Volatile load / add 1 / volatile store of a <4 x i8> through a plain
; `ptr` argument. The assertions expect the vector to travel as one
; packed 32-bit scalar (ld.volatile.u32 / st.volatile.u32). Each byte is
; extracted with bfe.u32, incremented as a 16-bit value, and repacked
; with prmt.b32.
470define void @generic_volatile_4xi8(ptr %a) {
471; CHECK-LABEL: generic_volatile_4xi8(
472; CHECK:       {
473; CHECK-NEXT:    .reg .b16 %rs<9>;
474; CHECK-NEXT:    .reg .b32 %r<13>;
475; CHECK-NEXT:    .reg .b64 %rd<2>;
476; CHECK-EMPTY:
477; CHECK-NEXT:  // %bb.0:
478; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_4xi8_param_0];
479; CHECK-NEXT:    ld.volatile.u32 %r1, [%rd1];
480; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
481; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
482; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
483; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
484; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
485; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
486; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
487; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
488; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
489; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
490; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
491; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
492; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
493; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
494; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
495; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
496; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
497; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
498; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
499; CHECK-NEXT:    st.volatile.u32 [%rd1], %r12;
500; CHECK-NEXT:    ret;
501  %a.load = load volatile <4 x i8>, ptr %a
502  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
503  store volatile <4 x i8> %a.add, ptr %a
504  ret void
505}
506
; Volatile load / add 1 / volatile store of an <8 x i8> through a plain
; `ptr` argument. The assertions expect the data to move as two packed
; 32-bit words (ld.volatile.v2.b32 / st.volatile.v2.b32). Every byte of
; each word is extracted with bfe.u32, incremented in a 16-bit register,
; and repacked with prmt.b32.
507define void @generic_volatile_8xi8(ptr %a) {
508; CHECK-LABEL: generic_volatile_8xi8(
509; CHECK:       {
510; CHECK-NEXT:    .reg .b16 %rs<17>;
511; CHECK-NEXT:    .reg .b32 %r<25>;
512; CHECK-NEXT:    .reg .b64 %rd<2>;
513; CHECK-EMPTY:
514; CHECK-NEXT:  // %bb.0:
515; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_8xi8_param_0];
516; CHECK-NEXT:    ld.volatile.v2.b32 {%r1, %r2}, [%rd1];
517; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
518; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
519; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
520; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
521; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
522; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
523; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
524; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
525; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
526; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
527; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
528; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
529; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
530; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
531; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
532; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
533; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
534; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
535; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
536; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
537; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
538; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
539; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
540; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
541; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
542; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
543; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
544; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
545; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
546; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
547; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
548; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
549; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
550; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
551; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
552; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
553; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
554; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
555; CHECK-NEXT:    st.volatile.v2.b32 [%rd1], {%r24, %r13};
556; CHECK-NEXT:    ret;
557  %a.load = load volatile <8 x i8>, ptr %a
558  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
559  store volatile <8 x i8> %a.add, ptr %a
560  ret void
561}
562
; Volatile load / add 1 / volatile store of a <16 x i8> through a plain
; `ptr` argument. The assertions expect the data to move as four packed
; 32-bit words (ld.volatile.v4.b32 / st.volatile.v4.b32). Every byte of
; each word is extracted with bfe.u32, incremented in a 16-bit register,
; and repacked with prmt.b32.
563define void @generic_volatile_16xi8(ptr %a) {
564; CHECK-LABEL: generic_volatile_16xi8(
565; CHECK:       {
566; CHECK-NEXT:    .reg .b16 %rs<33>;
567; CHECK-NEXT:    .reg .b32 %r<49>;
568; CHECK-NEXT:    .reg .b64 %rd<2>;
569; CHECK-EMPTY:
570; CHECK-NEXT:  // %bb.0:
571; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_16xi8_param_0];
572; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
573; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
574; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
575; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
576; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
577; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
578; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
579; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
580; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
581; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
582; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
583; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
584; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
585; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
586; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
587; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
588; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
589; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
590; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
591; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
592; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
593; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
594; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
595; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
596; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
597; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
598; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
599; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
600; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
601; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
602; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
603; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
604; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
605; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
606; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
607; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
608; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
609; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
610; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
611; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
612; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
613; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
614; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
615; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
616; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
617; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
618; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
619; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
620; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
621; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
622; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
623; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
624; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
625; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
626; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
627; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
628; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
629; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
630; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
631; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
632; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
633; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
634; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
635; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
636; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
637; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
638; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
639; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
640; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
641; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
642; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
643; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
644; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
645; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
646; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
647; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
648; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
649; CHECK-NEXT:    st.volatile.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
650; CHECK-NEXT:    ret;
651  %a.load = load volatile <16 x i8>, ptr %a
652  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
653  store volatile <16 x i8> %a.add, ptr %a
654  ret void
655}
656
; Volatile load / add 1 / volatile store of a <2 x i16> through a plain
; `ptr` argument. The assertions expect the vector to travel as one
; packed 32-bit scalar (ld.volatile.u32 / st.volatile.u32). It is split
; into two 16-bit registers with mov.b32, each half is incremented, and
; the halves are repacked with mov.b32.
657define void @generic_volatile_2xi16(ptr %a) {
658; CHECK-LABEL: generic_volatile_2xi16(
659; CHECK:       {
660; CHECK-NEXT:    .reg .b16 %rs<5>;
661; CHECK-NEXT:    .reg .b32 %r<3>;
662; CHECK-NEXT:    .reg .b64 %rd<2>;
663; CHECK-EMPTY:
664; CHECK-NEXT:  // %bb.0:
665; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xi16_param_0];
666; CHECK-NEXT:    ld.volatile.u32 %r1, [%rd1];
667; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
668; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
669; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
670; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
671; CHECK-NEXT:    st.volatile.u32 [%rd1], %r2;
672; CHECK-NEXT:    ret;
673  %a.load = load volatile <2 x i16>, ptr %a
674  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
675  store volatile <2 x i16> %a.add, ptr %a
676  ret void
677}
678
; Volatile load / add 1 / volatile store of a <4 x i16> through a plain
; `ptr` argument. The assertions expect one ld.volatile.v4.u16 and one
; st.volatile.v4.u16, with four independent add.s16 instructions in
; between.
679define void @generic_volatile_4xi16(ptr %a) {
680; CHECK-LABEL: generic_volatile_4xi16(
681; CHECK:       {
682; CHECK-NEXT:    .reg .b16 %rs<9>;
683; CHECK-NEXT:    .reg .b64 %rd<2>;
684; CHECK-EMPTY:
685; CHECK-NEXT:  // %bb.0:
686; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_4xi16_param_0];
687; CHECK-NEXT:    ld.volatile.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
688; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
689; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
690; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
691; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
692; CHECK-NEXT:    st.volatile.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
693; CHECK-NEXT:    ret;
694  %a.load = load volatile <4 x i16>, ptr %a
695  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
696  store volatile <4 x i16> %a.add, ptr %a
697  ret void
698}
699
; Volatile load / add 1 / volatile store of an <8 x i16> through a plain
; `ptr` argument. The assertions expect the data to move as four packed
; 32-bit words (ld.volatile.v4.b32 / st.volatile.v4.b32). Each word is
; split into two 16-bit halves with mov.b32, incremented, and repacked
; with mov.b32.
700define void @generic_volatile_8xi16(ptr %a) {
701; CHECK-LABEL: generic_volatile_8xi16(
702; CHECK:       {
703; CHECK-NEXT:    .reg .b16 %rs<17>;
704; CHECK-NEXT:    .reg .b32 %r<9>;
705; CHECK-NEXT:    .reg .b64 %rd<2>;
706; CHECK-EMPTY:
707; CHECK-NEXT:  // %bb.0:
708; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_8xi16_param_0];
709; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
710; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
711; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
712; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
713; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
714; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
715; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
716; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
717; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
718; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
719; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
720; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
721; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
722; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
723; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
724; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
725; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
726; CHECK-NEXT:    st.volatile.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
727; CHECK-NEXT:    ret;
728  %a.load = load volatile <8 x i16>, ptr %a
729  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
730  store volatile <8 x i16> %a.add, ptr %a
731  ret void
732}
733
; Volatile load / add 1 / volatile store of a <2 x i32> through a plain
; `ptr` argument. The assertions expect one ld.volatile.v2.u32 and one
; st.volatile.v2.u32, with one add.s32 per lane.
734define void @generic_volatile_2xi32(ptr %a) {
735; CHECK-LABEL: generic_volatile_2xi32(
736; CHECK:       {
737; CHECK-NEXT:    .reg .b32 %r<5>;
738; CHECK-NEXT:    .reg .b64 %rd<2>;
739; CHECK-EMPTY:
740; CHECK-NEXT:  // %bb.0:
741; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xi32_param_0];
742; CHECK-NEXT:    ld.volatile.v2.u32 {%r1, %r2}, [%rd1];
743; CHECK-NEXT:    add.s32 %r3, %r2, 1;
744; CHECK-NEXT:    add.s32 %r4, %r1, 1;
745; CHECK-NEXT:    st.volatile.v2.u32 [%rd1], {%r4, %r3};
746; CHECK-NEXT:    ret;
747  %a.load = load volatile <2 x i32>, ptr %a
748  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
749  store volatile <2 x i32> %a.add, ptr %a
750  ret void
751}
752
; Volatile load / add 1 / volatile store of a <4 x i32> through a plain
; `ptr` argument. The assertions expect one ld.volatile.v4.u32 and one
; st.volatile.v4.u32, with one add.s32 per lane.
753define void @generic_volatile_4xi32(ptr %a) {
754; CHECK-LABEL: generic_volatile_4xi32(
755; CHECK:       {
756; CHECK-NEXT:    .reg .b32 %r<9>;
757; CHECK-NEXT:    .reg .b64 %rd<2>;
758; CHECK-EMPTY:
759; CHECK-NEXT:  // %bb.0:
760; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_4xi32_param_0];
761; CHECK-NEXT:    ld.volatile.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
762; CHECK-NEXT:    add.s32 %r5, %r4, 1;
763; CHECK-NEXT:    add.s32 %r6, %r3, 1;
764; CHECK-NEXT:    add.s32 %r7, %r2, 1;
765; CHECK-NEXT:    add.s32 %r8, %r1, 1;
766; CHECK-NEXT:    st.volatile.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
767; CHECK-NEXT:    ret;
768  %a.load = load volatile <4 x i32>, ptr %a
769  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
770  store volatile <4 x i32> %a.add, ptr %a
771  ret void
772}
773
; Volatile load / add 1 / volatile store of a <2 x i64> through a plain
; `ptr` argument. The assertions expect one ld.volatile.v2.u64 and one
; st.volatile.v2.u64, with one add.s64 per lane.
774define void @generic_volatile_2xi64(ptr %a) {
775; CHECK-LABEL: generic_volatile_2xi64(
776; CHECK:       {
777; CHECK-NEXT:    .reg .b64 %rd<6>;
778; CHECK-EMPTY:
779; CHECK-NEXT:  // %bb.0:
780; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xi64_param_0];
781; CHECK-NEXT:    ld.volatile.v2.u64 {%rd2, %rd3}, [%rd1];
782; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
783; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
784; CHECK-NEXT:    st.volatile.v2.u64 [%rd1], {%rd5, %rd4};
785; CHECK-NEXT:    ret;
786  %a.load = load volatile <2 x i64>, ptr %a
787  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
788  store volatile <2 x i64> %a.add, ptr %a
789  ret void
790}
791
; Volatile load of <2 x float> through an unqualified `ptr`, fadd of 1.0 on
; every lane, then a volatile store back to the same address. The expected
; PTX uses ld.volatile.v2.f32 / st.volatile.v2.f32 with two add.rn.f32
; instructions; the 1.0 operand appears as the immediate 0f3F800000.
792define void @generic_volatile_2xfloat(ptr %a) {
793; CHECK-LABEL: generic_volatile_2xfloat(
794; CHECK:       {
795; CHECK-NEXT:    .reg .f32 %f<5>;
796; CHECK-NEXT:    .reg .b64 %rd<2>;
797; CHECK-EMPTY:
798; CHECK-NEXT:  // %bb.0:
799; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xfloat_param_0];
800; CHECK-NEXT:    ld.volatile.v2.f32 {%f1, %f2}, [%rd1];
801; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
802; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
803; CHECK-NEXT:    st.volatile.v2.f32 [%rd1], {%f4, %f3};
804; CHECK-NEXT:    ret;
805  %a.load = load volatile <2 x float>, ptr %a
806  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
807  store volatile <2 x float> %a.add, ptr %a
808  ret void
809}
810
; Volatile load of <4 x float> through an unqualified `ptr`, fadd of 1.0 on
; every lane, then a volatile store back to the same address. The expected
; PTX uses ld.volatile.v4.f32 / st.volatile.v4.f32 with four add.rn.f32
; instructions; the 1.0 operand appears as the immediate 0f3F800000.
811define void @generic_volatile_4xfloat(ptr %a) {
812; CHECK-LABEL: generic_volatile_4xfloat(
813; CHECK:       {
814; CHECK-NEXT:    .reg .f32 %f<9>;
815; CHECK-NEXT:    .reg .b64 %rd<2>;
816; CHECK-EMPTY:
817; CHECK-NEXT:  // %bb.0:
818; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_4xfloat_param_0];
819; CHECK-NEXT:    ld.volatile.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
820; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
821; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
822; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
823; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
824; CHECK-NEXT:    st.volatile.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
825; CHECK-NEXT:    ret;
826  %a.load = load volatile <4 x float>, ptr %a
827  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
828  store volatile <4 x float> %a.add, ptr %a
829  ret void
830}
831
; Volatile load of <2 x double> through an unqualified `ptr`, fadd of 1.0 on
; every lane, then a volatile store back to the same address. The expected
; PTX uses ld.volatile.v2.f64 / st.volatile.v2.f64 with two add.rn.f64
; instructions; the 1.0 operand appears as the immediate 0d3FF0000000000000.
832define void @generic_volatile_2xdouble(ptr %a) {
833; CHECK-LABEL: generic_volatile_2xdouble(
834; CHECK:       {
835; CHECK-NEXT:    .reg .b64 %rd<2>;
836; CHECK-NEXT:    .reg .f64 %fd<5>;
837; CHECK-EMPTY:
838; CHECK-NEXT:  // %bb.0:
839; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xdouble_param_0];
840; CHECK-NEXT:    ld.volatile.v2.f64 {%fd1, %fd2}, [%rd1];
841; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
842; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
843; CHECK-NEXT:    st.volatile.v2.f64 [%rd1], {%fd4, %fd3};
844; CHECK-NEXT:    ret;
845  %a.load = load volatile <2 x double>, ptr %a
846  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
847  store volatile <2 x double> %a.add, ptr %a
848  ret void
849}
850
851;; global statespace
852
853; global
854
; Plain (non-volatile, non-atomic) load of <2 x i8> from addrspace(1), +1 on
; every lane, then a plain store back to the same address. The expected PTX
; uses ld.global.v2.u8 / st.global.v2.u8; the byte lanes are held in 16-bit
; %rs registers and incremented with add.s16.
855define void @global_2xi8(ptr addrspace(1) %a) {
856; CHECK-LABEL: global_2xi8(
857; CHECK:       {
858; CHECK-NEXT:    .reg .b16 %rs<5>;
859; CHECK-NEXT:    .reg .b64 %rd<2>;
860; CHECK-EMPTY:
861; CHECK-NEXT:  // %bb.0:
862; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xi8_param_0];
863; CHECK-NEXT:    ld.global.v2.u8 {%rs1, %rs2}, [%rd1];
864; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
865; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
866; CHECK-NEXT:    st.global.v2.u8 [%rd1], {%rs4, %rs3};
867; CHECK-NEXT:    ret;
868  %a.load = load <2 x i8>, ptr addrspace(1) %a
869  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
870  store <2 x i8> %a.add, ptr addrspace(1) %a
871  ret void
872}
873
; Plain load of <4 x i8> from addrspace(1), +1 on every lane, then a plain
; store back to the same address. The expected PTX does not use a vector
; memory op here: the four bytes are read as one 32-bit word (ld.global.u32),
; each byte is extracted with bfe.u32, narrowed to 16 bits, incremented with
; add.s16 and widened again, and the results are repacked with three prmt.b32
; instructions before a single st.global.u32.
874define void @global_4xi8(ptr addrspace(1) %a) {
875; CHECK-LABEL: global_4xi8(
876; CHECK:       {
877; CHECK-NEXT:    .reg .b16 %rs<9>;
878; CHECK-NEXT:    .reg .b32 %r<13>;
879; CHECK-NEXT:    .reg .b64 %rd<2>;
880; CHECK-EMPTY:
881; CHECK-NEXT:  // %bb.0:
882; CHECK-NEXT:    ld.param.u64 %rd1, [global_4xi8_param_0];
883; CHECK-NEXT:    ld.global.u32 %r1, [%rd1];
884; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
885; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
886; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
887; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
888; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
889; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
890; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
891; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
892; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
893; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
894; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
895; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
896; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
897; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
898; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
899; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
900; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
901; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
902; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
903; CHECK-NEXT:    st.global.u32 [%rd1], %r12;
904; CHECK-NEXT:    ret;
905  %a.load = load <4 x i8>, ptr addrspace(1) %a
906  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
907  store <4 x i8> %a.add, ptr addrspace(1) %a
908  ret void
909}
910
; Plain load of <8 x i8> from addrspace(1), +1 on every lane, then a plain
; store back to the same address. The expected PTX moves the data as two
; packed 32-bit words (ld.global.v2.b32 / st.global.v2.b32). Within each word
; every byte is extracted with bfe.u32, narrowed to 16 bits, incremented with
; add.s16 and widened again, then the four results are repacked with
; prmt.b32.
911define void @global_8xi8(ptr addrspace(1) %a) {
912; CHECK-LABEL: global_8xi8(
913; CHECK:       {
914; CHECK-NEXT:    .reg .b16 %rs<17>;
915; CHECK-NEXT:    .reg .b32 %r<25>;
916; CHECK-NEXT:    .reg .b64 %rd<2>;
917; CHECK-EMPTY:
918; CHECK-NEXT:  // %bb.0:
919; CHECK-NEXT:    ld.param.u64 %rd1, [global_8xi8_param_0];
920; CHECK-NEXT:    ld.global.v2.b32 {%r1, %r2}, [%rd1];
921; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
922; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
923; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
924; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
925; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
926; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
927; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
928; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
929; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
930; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
931; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
932; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
933; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
934; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
935; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
936; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
937; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
938; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
939; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
940; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
941; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
942; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
943; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
944; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
945; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
946; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
947; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
948; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
949; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
950; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
951; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
952; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
953; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
954; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
955; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
956; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
957; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
958; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
959; CHECK-NEXT:    st.global.v2.b32 [%rd1], {%r24, %r13};
960; CHECK-NEXT:    ret;
961  %a.load = load <8 x i8>, ptr addrspace(1) %a
962  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
963  store <8 x i8> %a.add, ptr addrspace(1) %a
964  ret void
965}
966
; Plain load of <16 x i8> from addrspace(1), +1 on every lane, then a plain
; store back to the same address. The expected PTX moves the data as four
; packed 32-bit words (ld.global.v4.b32 / st.global.v4.b32). Within each word
; every byte is extracted with bfe.u32, narrowed to 16 bits, incremented with
; add.s16 and widened again, then the four results are repacked with
; prmt.b32.
967define void @global_16xi8(ptr addrspace(1) %a) {
968; CHECK-LABEL: global_16xi8(
969; CHECK:       {
970; CHECK-NEXT:    .reg .b16 %rs<33>;
971; CHECK-NEXT:    .reg .b32 %r<49>;
972; CHECK-NEXT:    .reg .b64 %rd<2>;
973; CHECK-EMPTY:
974; CHECK-NEXT:  // %bb.0:
975; CHECK-NEXT:    ld.param.u64 %rd1, [global_16xi8_param_0];
976; CHECK-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
977; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
978; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
979; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
980; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
981; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
982; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
983; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
984; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
985; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
986; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
987; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
988; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
989; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
990; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
991; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
992; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
993; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
994; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
995; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
996; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
997; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
998; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
999; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
1000; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
1001; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
1002; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
1003; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
1004; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
1005; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
1006; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
1007; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
1008; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
1009; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
1010; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
1011; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
1012; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
1013; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
1014; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
1015; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
1016; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
1017; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
1018; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
1019; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
1020; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
1021; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
1022; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
1023; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
1024; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
1025; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
1026; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
1027; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
1028; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
1029; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
1030; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
1031; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
1032; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
1033; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
1034; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
1035; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
1036; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
1037; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
1038; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
1039; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
1040; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
1041; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
1042; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
1043; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
1044; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
1045; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
1046; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
1047; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
1048; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
1049; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
1050; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
1051; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
1052; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
1053; CHECK-NEXT:    st.global.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
1054; CHECK-NEXT:    ret;
1055  %a.load = load <16 x i8>, ptr addrspace(1) %a
1056  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
1057  store <16 x i8> %a.add, ptr addrspace(1) %a
1058  ret void
1059}
1060
; Plain load of <2 x i16> from addrspace(1), +1 on every lane, then a plain
; store back to the same address. The expected PTX reads the pair as one
; 32-bit word (ld.global.u32), splits it into two 16-bit registers with
; mov.b32, increments each with add.s16, repacks with mov.b32 and writes a
; single st.global.u32.
1061define void @global_2xi16(ptr addrspace(1) %a) {
1062; CHECK-LABEL: global_2xi16(
1063; CHECK:       {
1064; CHECK-NEXT:    .reg .b16 %rs<5>;
1065; CHECK-NEXT:    .reg .b32 %r<3>;
1066; CHECK-NEXT:    .reg .b64 %rd<2>;
1067; CHECK-EMPTY:
1068; CHECK-NEXT:  // %bb.0:
1069; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xi16_param_0];
1070; CHECK-NEXT:    ld.global.u32 %r1, [%rd1];
1071; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
1072; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
1073; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
1074; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
1075; CHECK-NEXT:    st.global.u32 [%rd1], %r2;
1076; CHECK-NEXT:    ret;
1077  %a.load = load <2 x i16>, ptr addrspace(1) %a
1078  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
1079  store <2 x i16> %a.add, ptr addrspace(1) %a
1080  ret void
1081}
1082
; Plain load of <4 x i16> from addrspace(1), +1 on every lane, then a plain
; store back to the same address. The expected PTX accesses the lanes
; directly as 16-bit elements (ld.global.v4.u16 / st.global.v4.u16) with four
; add.s16 instructions and no 32-bit packing step.
1083define void @global_4xi16(ptr addrspace(1) %a) {
1084; CHECK-LABEL: global_4xi16(
1085; CHECK:       {
1086; CHECK-NEXT:    .reg .b16 %rs<9>;
1087; CHECK-NEXT:    .reg .b64 %rd<2>;
1088; CHECK-EMPTY:
1089; CHECK-NEXT:  // %bb.0:
1090; CHECK-NEXT:    ld.param.u64 %rd1, [global_4xi16_param_0];
1091; CHECK-NEXT:    ld.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
1092; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
1093; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
1094; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
1095; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
1096; CHECK-NEXT:    st.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
1097; CHECK-NEXT:    ret;
1098  %a.load = load <4 x i16>, ptr addrspace(1) %a
1099  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
1100  store <4 x i16> %a.add, ptr addrspace(1) %a
1101  ret void
1102}
1103
; Plain load of <8 x i16> from addrspace(1), +1 on every lane, then a plain
; store back to the same address. The expected PTX moves the data as four
; packed 32-bit words (ld.global.v4.b32 / st.global.v4.b32). Each word is
; split into two 16-bit registers with mov.b32, both halves are incremented
; with add.s16, and the pair is repacked with mov.b32.
1104define void @global_8xi16(ptr addrspace(1) %a) {
1105; CHECK-LABEL: global_8xi16(
1106; CHECK:       {
1107; CHECK-NEXT:    .reg .b16 %rs<17>;
1108; CHECK-NEXT:    .reg .b32 %r<9>;
1109; CHECK-NEXT:    .reg .b64 %rd<2>;
1110; CHECK-EMPTY:
1111; CHECK-NEXT:  // %bb.0:
1112; CHECK-NEXT:    ld.param.u64 %rd1, [global_8xi16_param_0];
1113; CHECK-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
1114; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
1115; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
1116; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
1117; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
1118; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
1119; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
1120; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
1121; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
1122; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
1123; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
1124; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
1125; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
1126; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
1127; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
1128; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
1129; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
1130; CHECK-NEXT:    st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
1131; CHECK-NEXT:    ret;
1132  %a.load = load <8 x i16>, ptr addrspace(1) %a
1133  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1134  store <8 x i16> %a.add, ptr addrspace(1) %a
1135  ret void
1136}
1137
; Plain load of <2 x i32> from addrspace(1), +1 on every lane, then a plain
; store back to the same address. The expected PTX uses one
; ld.global.v2.u32 / st.global.v2.u32 pair with two add.s32 instructions.
1138define void @global_2xi32(ptr addrspace(1) %a) {
1139; CHECK-LABEL: global_2xi32(
1140; CHECK:       {
1141; CHECK-NEXT:    .reg .b32 %r<5>;
1142; CHECK-NEXT:    .reg .b64 %rd<2>;
1143; CHECK-EMPTY:
1144; CHECK-NEXT:  // %bb.0:
1145; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xi32_param_0];
1146; CHECK-NEXT:    ld.global.v2.u32 {%r1, %r2}, [%rd1];
1147; CHECK-NEXT:    add.s32 %r3, %r2, 1;
1148; CHECK-NEXT:    add.s32 %r4, %r1, 1;
1149; CHECK-NEXT:    st.global.v2.u32 [%rd1], {%r4, %r3};
1150; CHECK-NEXT:    ret;
1151  %a.load = load <2 x i32>, ptr addrspace(1) %a
1152  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
1153  store <2 x i32> %a.add, ptr addrspace(1) %a
1154  ret void
1155}
1156
; Plain load of <4 x i32> from addrspace(1), +1 on every lane, then a plain
; store back to the same address. The expected PTX uses one
; ld.global.v4.u32 / st.global.v4.u32 pair with four add.s32 instructions.
1157define void @global_4xi32(ptr addrspace(1) %a) {
1158; CHECK-LABEL: global_4xi32(
1159; CHECK:       {
1160; CHECK-NEXT:    .reg .b32 %r<9>;
1161; CHECK-NEXT:    .reg .b64 %rd<2>;
1162; CHECK-EMPTY:
1163; CHECK-NEXT:  // %bb.0:
1164; CHECK-NEXT:    ld.param.u64 %rd1, [global_4xi32_param_0];
1165; CHECK-NEXT:    ld.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
1166; CHECK-NEXT:    add.s32 %r5, %r4, 1;
1167; CHECK-NEXT:    add.s32 %r6, %r3, 1;
1168; CHECK-NEXT:    add.s32 %r7, %r2, 1;
1169; CHECK-NEXT:    add.s32 %r8, %r1, 1;
1170; CHECK-NEXT:    st.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
1171; CHECK-NEXT:    ret;
1172  %a.load = load <4 x i32>, ptr addrspace(1) %a
1173  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
1174  store <4 x i32> %a.add, ptr addrspace(1) %a
1175  ret void
1176}
1177
; Plain load of <2 x i64> from addrspace(1), +1 on every lane, then a plain
; store back to the same address. The expected PTX uses one
; ld.global.v2.u64 / st.global.v2.u64 pair with two add.s64 instructions; all
; values, including the address, live in 64-bit %rd registers.
1178define void @global_2xi64(ptr addrspace(1) %a) {
1179; CHECK-LABEL: global_2xi64(
1180; CHECK:       {
1181; CHECK-NEXT:    .reg .b64 %rd<6>;
1182; CHECK-EMPTY:
1183; CHECK-NEXT:  // %bb.0:
1184; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xi64_param_0];
1185; CHECK-NEXT:    ld.global.v2.u64 {%rd2, %rd3}, [%rd1];
1186; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
1187; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
1188; CHECK-NEXT:    st.global.v2.u64 [%rd1], {%rd5, %rd4};
1189; CHECK-NEXT:    ret;
1190  %a.load = load <2 x i64>, ptr addrspace(1) %a
1191  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
1192  store <2 x i64> %a.add, ptr addrspace(1) %a
1193  ret void
1194}
1195
; Plain load of <2 x float> from addrspace(1), fadd of 1.0 on every lane,
; then a plain store back to the same address. The expected PTX uses one
; ld.global.v2.f32 / st.global.v2.f32 pair with two add.rn.f32 instructions;
; the 1.0 operand appears as the immediate 0f3F800000.
1196define void @global_2xfloat(ptr addrspace(1) %a) {
1197; CHECK-LABEL: global_2xfloat(
1198; CHECK:       {
1199; CHECK-NEXT:    .reg .f32 %f<5>;
1200; CHECK-NEXT:    .reg .b64 %rd<2>;
1201; CHECK-EMPTY:
1202; CHECK-NEXT:  // %bb.0:
1203; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xfloat_param_0];
1204; CHECK-NEXT:    ld.global.v2.f32 {%f1, %f2}, [%rd1];
1205; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
1206; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
1207; CHECK-NEXT:    st.global.v2.f32 [%rd1], {%f4, %f3};
1208; CHECK-NEXT:    ret;
1209  %a.load = load <2 x float>, ptr addrspace(1) %a
1210  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
1211  store <2 x float> %a.add, ptr addrspace(1) %a
1212  ret void
1213}
1214
; Plain load of <4 x float> from addrspace(1), fadd of 1.0 on every lane,
; then a plain store back to the same address. The expected PTX uses one
; ld.global.v4.f32 / st.global.v4.f32 pair with four add.rn.f32 instructions;
; the 1.0 operand appears as the immediate 0f3F800000.
1215define void @global_4xfloat(ptr addrspace(1) %a) {
1216; CHECK-LABEL: global_4xfloat(
1217; CHECK:       {
1218; CHECK-NEXT:    .reg .f32 %f<9>;
1219; CHECK-NEXT:    .reg .b64 %rd<2>;
1220; CHECK-EMPTY:
1221; CHECK-NEXT:  // %bb.0:
1222; CHECK-NEXT:    ld.param.u64 %rd1, [global_4xfloat_param_0];
1223; CHECK-NEXT:    ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
1224; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
1225; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
1226; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
1227; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
1228; CHECK-NEXT:    st.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
1229; CHECK-NEXT:    ret;
1230  %a.load = load <4 x float>, ptr addrspace(1) %a
1231  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
1232  store <4 x float> %a.add, ptr addrspace(1) %a
1233  ret void
1234}
1235
; Plain load of <2 x double> from addrspace(1), fadd of 1.0 on every lane,
; then a plain store back to the same address. The expected PTX uses one
; ld.global.v2.f64 / st.global.v2.f64 pair with two add.rn.f64 instructions;
; the 1.0 operand appears as the immediate 0d3FF0000000000000.
1236define void @global_2xdouble(ptr addrspace(1) %a) {
1237; CHECK-LABEL: global_2xdouble(
1238; CHECK:       {
1239; CHECK-NEXT:    .reg .b64 %rd<2>;
1240; CHECK-NEXT:    .reg .f64 %fd<5>;
1241; CHECK-EMPTY:
1242; CHECK-NEXT:  // %bb.0:
1243; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xdouble_param_0];
1244; CHECK-NEXT:    ld.global.v2.f64 {%fd1, %fd2}, [%rd1];
1245; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
1246; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
1247; CHECK-NEXT:    st.global.v2.f64 [%rd1], {%fd4, %fd3};
1248; CHECK-NEXT:    ret;
1249  %a.load = load <2 x double>, ptr addrspace(1) %a
1250  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
1251  store <2 x double> %a.add, ptr addrspace(1) %a
1252  ret void
1253}
1254
1255; global_volatile
1256
; Volatile load of <2 x i8> from addrspace(1), +1 on every lane, then a
; volatile store back to the same address. The expected PTX uses
; ld.volatile.global.v2.u8 / st.volatile.global.v2.u8; the byte lanes are
; held in 16-bit %rs registers and incremented with add.s16.
1257define void @global_volatile_2xi8(ptr addrspace(1) %a) {
1258; CHECK-LABEL: global_volatile_2xi8(
1259; CHECK:       {
1260; CHECK-NEXT:    .reg .b16 %rs<5>;
1261; CHECK-NEXT:    .reg .b64 %rd<2>;
1262; CHECK-EMPTY:
1263; CHECK-NEXT:  // %bb.0:
1264; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xi8_param_0];
1265; CHECK-NEXT:    ld.volatile.global.v2.u8 {%rs1, %rs2}, [%rd1];
1266; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
1267; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
1268; CHECK-NEXT:    st.volatile.global.v2.u8 [%rd1], {%rs4, %rs3};
1269; CHECK-NEXT:    ret;
1270  %a.load = load volatile <2 x i8>, ptr addrspace(1) %a
1271  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
1272  store volatile <2 x i8> %a.add, ptr addrspace(1) %a
1273  ret void
1274}
1275
; Volatile load of <4 x i8> from addrspace(1), +1 on every lane, then a
; volatile store back to the same address. The expected PTX does not use a
; vector memory op here: the four bytes are read as one 32-bit word
; (ld.volatile.global.u32), each byte is extracted with bfe.u32, narrowed to
; 16 bits, incremented with add.s16 and widened again, and the results are
; repacked with three prmt.b32 instructions before a single
; st.volatile.global.u32.
1276define void @global_volatile_4xi8(ptr addrspace(1) %a) {
1277; CHECK-LABEL: global_volatile_4xi8(
1278; CHECK:       {
1279; CHECK-NEXT:    .reg .b16 %rs<9>;
1280; CHECK-NEXT:    .reg .b32 %r<13>;
1281; CHECK-NEXT:    .reg .b64 %rd<2>;
1282; CHECK-EMPTY:
1283; CHECK-NEXT:  // %bb.0:
1284; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_4xi8_param_0];
1285; CHECK-NEXT:    ld.volatile.global.u32 %r1, [%rd1];
1286; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
1287; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
1288; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
1289; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
1290; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
1291; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
1292; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
1293; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
1294; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
1295; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
1296; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
1297; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
1298; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
1299; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
1300; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
1301; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
1302; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
1303; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
1304; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
1305; CHECK-NEXT:    st.volatile.global.u32 [%rd1], %r12;
1306; CHECK-NEXT:    ret;
1307  %a.load = load volatile <4 x i8>, ptr addrspace(1) %a
1308  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
1309  store volatile <4 x i8> %a.add, ptr addrspace(1) %a
1310  ret void
1311}
1312
; Volatile load of <8 x i8> from addrspace(1), +1 on every lane, then a
; volatile store back to the same address. The expected PTX moves the data as
; two packed 32-bit words (ld.volatile.global.v2.b32 /
; st.volatile.global.v2.b32). Within each word every byte is extracted with
; bfe.u32, narrowed to 16 bits, incremented with add.s16 and widened again,
; then the four results are repacked with prmt.b32.
1313define void @global_volatile_8xi8(ptr addrspace(1) %a) {
1314; CHECK-LABEL: global_volatile_8xi8(
1315; CHECK:       {
1316; CHECK-NEXT:    .reg .b16 %rs<17>;
1317; CHECK-NEXT:    .reg .b32 %r<25>;
1318; CHECK-NEXT:    .reg .b64 %rd<2>;
1319; CHECK-EMPTY:
1320; CHECK-NEXT:  // %bb.0:
1321; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_8xi8_param_0];
1322; CHECK-NEXT:    ld.volatile.global.v2.b32 {%r1, %r2}, [%rd1];
1323; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
1324; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
1325; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
1326; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
1327; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
1328; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
1329; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
1330; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
1331; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
1332; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
1333; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
1334; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
1335; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
1336; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
1337; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
1338; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
1339; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
1340; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
1341; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
1342; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
1343; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
1344; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
1345; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
1346; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
1347; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
1348; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
1349; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
1350; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
1351; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
1352; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
1353; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
1354; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
1355; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
1356; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
1357; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
1358; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
1359; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
1360; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
1361; CHECK-NEXT:    st.volatile.global.v2.b32 [%rd1], {%r24, %r13};
1362; CHECK-NEXT:    ret;
1363  %a.load = load volatile <8 x i8>, ptr addrspace(1) %a
1364  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
1365  store volatile <8 x i8> %a.add, ptr addrspace(1) %a
1366  ret void
1367}
1368
; Volatile load of <16 x i8> from addrspace(1), +1 on every lane, then a
; volatile store back to the same address. The expected PTX moves the data as
; four packed 32-bit words (ld.volatile.global.v4.b32 /
; st.volatile.global.v4.b32). Within each word every byte is extracted with
; bfe.u32, narrowed to 16 bits, incremented with add.s16 and widened again,
; then the four results are repacked with prmt.b32.
1369define void @global_volatile_16xi8(ptr addrspace(1) %a) {
1370; CHECK-LABEL: global_volatile_16xi8(
1371; CHECK:       {
1372; CHECK-NEXT:    .reg .b16 %rs<33>;
1373; CHECK-NEXT:    .reg .b32 %r<49>;
1374; CHECK-NEXT:    .reg .b64 %rd<2>;
1375; CHECK-EMPTY:
1376; CHECK-NEXT:  // %bb.0:
1377; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_16xi8_param_0];
1378; CHECK-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
1379; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
1380; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
1381; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
1382; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
1383; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
1384; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
1385; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
1386; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
1387; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
1388; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
1389; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
1390; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
1391; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
1392; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
1393; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
1394; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
1395; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
1396; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
1397; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
1398; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
1399; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
1400; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
1401; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
1402; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
1403; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
1404; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
1405; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
1406; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
1407; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
1408; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
1409; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
1410; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
1411; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
1412; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
1413; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
1414; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
1415; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
1416; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
1417; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
1418; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
1419; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
1420; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
1421; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
1422; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
1423; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
1424; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
1425; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
1426; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
1427; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
1428; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
1429; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
1430; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
1431; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
1432; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
1433; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
1434; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
1435; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
1436; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
1437; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
1438; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
1439; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
1440; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
1441; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
1442; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
1443; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
1444; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
1445; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
1446; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
1447; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
1448; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
1449; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
1450; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
1451; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
1452; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
1453; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
1454; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
1455; CHECK-NEXT:    st.volatile.global.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
1456; CHECK-NEXT:    ret;
1457  %a.load = load volatile <16 x i8>, ptr addrspace(1) %a
1458  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
1459  store volatile <16 x i8> %a.add, ptr addrspace(1) %a
1460  ret void
1461}
1462
; Volatile load of <2 x i16> from addrspace(1), +1 on every lane, then a
; volatile store back to the same address. The expected PTX reads the pair as
; one 32-bit word (ld.volatile.global.u32), splits it into two 16-bit
; registers with mov.b32, increments each with add.s16, repacks with mov.b32
; and writes a single st.volatile.global.u32.
1463define void @global_volatile_2xi16(ptr addrspace(1) %a) {
1464; CHECK-LABEL: global_volatile_2xi16(
1465; CHECK:       {
1466; CHECK-NEXT:    .reg .b16 %rs<5>;
1467; CHECK-NEXT:    .reg .b32 %r<3>;
1468; CHECK-NEXT:    .reg .b64 %rd<2>;
1469; CHECK-EMPTY:
1470; CHECK-NEXT:  // %bb.0:
1471; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xi16_param_0];
1472; CHECK-NEXT:    ld.volatile.global.u32 %r1, [%rd1];
1473; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
1474; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
1475; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
1476; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
1477; CHECK-NEXT:    st.volatile.global.u32 [%rd1], %r2;
1478; CHECK-NEXT:    ret;
1479  %a.load = load volatile <2 x i16>, ptr addrspace(1) %a
1480  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
1481  store volatile <2 x i16> %a.add, ptr addrspace(1) %a
1482  ret void
1483}
1484
; Volatile load of <4 x i16> from addrspace(1), +1 on every lane, then a
; volatile store back to the same address. The expected PTX accesses the
; lanes directly as 16-bit elements (ld.volatile.global.v4.u16 /
; st.volatile.global.v4.u16) with four add.s16 instructions and no 32-bit
; packing step.
1485define void @global_volatile_4xi16(ptr addrspace(1) %a) {
1486; CHECK-LABEL: global_volatile_4xi16(
1487; CHECK:       {
1488; CHECK-NEXT:    .reg .b16 %rs<9>;
1489; CHECK-NEXT:    .reg .b64 %rd<2>;
1490; CHECK-EMPTY:
1491; CHECK-NEXT:  // %bb.0:
1492; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_4xi16_param_0];
1493; CHECK-NEXT:    ld.volatile.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
1494; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
1495; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
1496; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
1497; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
1498; CHECK-NEXT:    st.volatile.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
1499; CHECK-NEXT:    ret;
1500  %a.load = load volatile <4 x i16>, ptr addrspace(1) %a
1501  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
1502  store volatile <4 x i16> %a.add, ptr addrspace(1) %a
1503  ret void
1504}
1505
; Volatile load of <8 x i16> from addrspace(1), +1 on every lane, then a
; volatile store back to the same address. The expected PTX moves the data as
; four packed 32-bit words (ld.volatile.global.v4.b32 /
; st.volatile.global.v4.b32). Each word is split into two 16-bit registers
; with mov.b32, both halves are incremented with add.s16, and the pair is
; repacked with mov.b32.
1506define void @global_volatile_8xi16(ptr addrspace(1) %a) {
1507; CHECK-LABEL: global_volatile_8xi16(
1508; CHECK:       {
1509; CHECK-NEXT:    .reg .b16 %rs<17>;
1510; CHECK-NEXT:    .reg .b32 %r<9>;
1511; CHECK-NEXT:    .reg .b64 %rd<2>;
1512; CHECK-EMPTY:
1513; CHECK-NEXT:  // %bb.0:
1514; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_8xi16_param_0];
1515; CHECK-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
1516; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
1517; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
1518; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
1519; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
1520; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
1521; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
1522; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
1523; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
1524; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
1525; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
1526; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
1527; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
1528; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
1529; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
1530; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
1531; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
1532; CHECK-NEXT:    st.volatile.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
1533; CHECK-NEXT:    ret;
1534  %a.load = load volatile <8 x i16>, ptr addrspace(1) %a
1535  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1536  store volatile <8 x i16> %a.add, ptr addrspace(1) %a
1537  ret void
1538}
1539
; Volatile <2 x i32> round trip through addrspace(1): load, add 1 per lane, store.
; Expected PTX keeps the access as a single ld/st.volatile.global.v2.u32 pair.
1540define void @global_volatile_2xi32(ptr addrspace(1) %a) {
1541; CHECK-LABEL: global_volatile_2xi32(
1542; CHECK:       {
1543; CHECK-NEXT:    .reg .b32 %r<5>;
1544; CHECK-NEXT:    .reg .b64 %rd<2>;
1545; CHECK-EMPTY:
1546; CHECK-NEXT:  // %bb.0:
1547; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xi32_param_0];
1548; CHECK-NEXT:    ld.volatile.global.v2.u32 {%r1, %r2}, [%rd1];
1549; CHECK-NEXT:    add.s32 %r3, %r2, 1;
1550; CHECK-NEXT:    add.s32 %r4, %r1, 1;
1551; CHECK-NEXT:    st.volatile.global.v2.u32 [%rd1], {%r4, %r3};
1552; CHECK-NEXT:    ret;
1553  %a.load = load volatile <2 x i32>, ptr addrspace(1) %a
1554  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
1555  store volatile <2 x i32> %a.add, ptr addrspace(1) %a
1556  ret void
1557}
1558
; Volatile <4 x i32> round trip through addrspace(1): load, add 1 per lane, store.
; Expected PTX keeps the access as a single ld/st.volatile.global.v4.u32 pair.
1559define void @global_volatile_4xi32(ptr addrspace(1) %a) {
1560; CHECK-LABEL: global_volatile_4xi32(
1561; CHECK:       {
1562; CHECK-NEXT:    .reg .b32 %r<9>;
1563; CHECK-NEXT:    .reg .b64 %rd<2>;
1564; CHECK-EMPTY:
1565; CHECK-NEXT:  // %bb.0:
1566; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_4xi32_param_0];
1567; CHECK-NEXT:    ld.volatile.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
1568; CHECK-NEXT:    add.s32 %r5, %r4, 1;
1569; CHECK-NEXT:    add.s32 %r6, %r3, 1;
1570; CHECK-NEXT:    add.s32 %r7, %r2, 1;
1571; CHECK-NEXT:    add.s32 %r8, %r1, 1;
1572; CHECK-NEXT:    st.volatile.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
1573; CHECK-NEXT:    ret;
1574  %a.load = load volatile <4 x i32>, ptr addrspace(1) %a
1575  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
1576  store volatile <4 x i32> %a.add, ptr addrspace(1) %a
1577  ret void
1578}
1579
; Volatile <2 x i64> round trip through addrspace(1): load, add 1 per lane, store.
; Expected PTX keeps the access as a single ld/st.volatile.global.v2.u64 pair.
1580define void @global_volatile_2xi64(ptr addrspace(1) %a) {
1581; CHECK-LABEL: global_volatile_2xi64(
1582; CHECK:       {
1583; CHECK-NEXT:    .reg .b64 %rd<6>;
1584; CHECK-EMPTY:
1585; CHECK-NEXT:  // %bb.0:
1586; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xi64_param_0];
1587; CHECK-NEXT:    ld.volatile.global.v2.u64 {%rd2, %rd3}, [%rd1];
1588; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
1589; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
1590; CHECK-NEXT:    st.volatile.global.v2.u64 [%rd1], {%rd5, %rd4};
1591; CHECK-NEXT:    ret;
1592  %a.load = load volatile <2 x i64>, ptr addrspace(1) %a
1593  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
1594  store volatile <2 x i64> %a.add, ptr addrspace(1) %a
1595  ret void
1596}
1597
; Volatile <2 x float> round trip through addrspace(1): load, fadd 1.0 per lane, store.
; Expected PTX keeps the access as a single ld/st.volatile.global.v2.f32 pair;
; 0f3F800000 is the PTX hex spelling of the float constant 1.0 used by the fadd.
1598define void @global_volatile_2xfloat(ptr addrspace(1) %a) {
1599; CHECK-LABEL: global_volatile_2xfloat(
1600; CHECK:       {
1601; CHECK-NEXT:    .reg .f32 %f<5>;
1602; CHECK-NEXT:    .reg .b64 %rd<2>;
1603; CHECK-EMPTY:
1604; CHECK-NEXT:  // %bb.0:
1605; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xfloat_param_0];
1606; CHECK-NEXT:    ld.volatile.global.v2.f32 {%f1, %f2}, [%rd1];
1607; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
1608; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
1609; CHECK-NEXT:    st.volatile.global.v2.f32 [%rd1], {%f4, %f3};
1610; CHECK-NEXT:    ret;
1611  %a.load = load volatile <2 x float>, ptr addrspace(1) %a
1612  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
1613  store volatile <2 x float> %a.add, ptr addrspace(1) %a
1614  ret void
1615}
1616
; Volatile <4 x float> round trip through addrspace(1): load, fadd 1.0 per lane, store.
; Expected PTX keeps the access as a single ld/st.volatile.global.v4.f32 pair.
1617define void @global_volatile_4xfloat(ptr addrspace(1) %a) {
1618; CHECK-LABEL: global_volatile_4xfloat(
1619; CHECK:       {
1620; CHECK-NEXT:    .reg .f32 %f<9>;
1621; CHECK-NEXT:    .reg .b64 %rd<2>;
1622; CHECK-EMPTY:
1623; CHECK-NEXT:  // %bb.0:
1624; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_4xfloat_param_0];
1625; CHECK-NEXT:    ld.volatile.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
1626; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
1627; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
1628; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
1629; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
1630; CHECK-NEXT:    st.volatile.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
1631; CHECK-NEXT:    ret;
1632  %a.load = load volatile <4 x float>, ptr addrspace(1) %a
1633  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
1634  store volatile <4 x float> %a.add, ptr addrspace(1) %a
1635  ret void
1636}
1637
; Volatile <2 x double> round trip through addrspace(1): load, fadd 1.0 per lane, store.
; Expected PTX keeps the access as a single ld/st.volatile.global.v2.f64 pair;
; 0d3FF0000000000000 is the PTX hex spelling of the double constant 1.0.
1638define void @global_volatile_2xdouble(ptr addrspace(1) %a) {
1639; CHECK-LABEL: global_volatile_2xdouble(
1640; CHECK:       {
1641; CHECK-NEXT:    .reg .b64 %rd<2>;
1642; CHECK-NEXT:    .reg .f64 %fd<5>;
1643; CHECK-EMPTY:
1644; CHECK-NEXT:  // %bb.0:
1645; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xdouble_param_0];
1646; CHECK-NEXT:    ld.volatile.global.v2.f64 {%fd1, %fd2}, [%rd1];
1647; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
1648; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
1649; CHECK-NEXT:    st.volatile.global.v2.f64 [%rd1], {%fd4, %fd3};
1650; CHECK-NEXT:    ret;
1651  %a.load = load volatile <2 x double>, ptr addrspace(1) %a
1652  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
1653  store volatile <2 x double> %a.add, ptr addrspace(1) %a
1654  ret void
1655}
1656
1657;; shared statespace
1658
1659; shared
1660
; Non-volatile <2 x i8> round trip through addrspace(3): load, add 1 per lane, store.
; Expected PTX uses a ld/st.shared.v2.u8 pair, with the i8 lanes held in 16-bit
; registers and incremented with add.s16.
1661define void @shared_2xi8(ptr addrspace(3) %a) {
1662; CHECK-LABEL: shared_2xi8(
1663; CHECK:       {
1664; CHECK-NEXT:    .reg .b16 %rs<5>;
1665; CHECK-NEXT:    .reg .b64 %rd<2>;
1666; CHECK-EMPTY:
1667; CHECK-NEXT:  // %bb.0:
1668; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xi8_param_0];
1669; CHECK-NEXT:    ld.shared.v2.u8 {%rs1, %rs2}, [%rd1];
1670; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
1671; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
1672; CHECK-NEXT:    st.shared.v2.u8 [%rd1], {%rs4, %rs3};
1673; CHECK-NEXT:    ret;
1674  %a.load = load <2 x i8>, ptr addrspace(3) %a
1675  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
1676  store <2 x i8> %a.add, ptr addrspace(3) %a
1677  ret void
1678}
1679
; Non-volatile <4 x i8> round trip through addrspace(3): load, add 1 per lane, store.
; Expected PTX moves the whole vector as one scalar ld/st.shared.u32; each byte is
; extracted with bfe.u32, incremented as a 16-bit value, and the four results are
; repacked into one 32-bit register with prmt.b32.
1680define void @shared_4xi8(ptr addrspace(3) %a) {
1681; CHECK-LABEL: shared_4xi8(
1682; CHECK:       {
1683; CHECK-NEXT:    .reg .b16 %rs<9>;
1684; CHECK-NEXT:    .reg .b32 %r<13>;
1685; CHECK-NEXT:    .reg .b64 %rd<2>;
1686; CHECK-EMPTY:
1687; CHECK-NEXT:  // %bb.0:
1688; CHECK-NEXT:    ld.param.u64 %rd1, [shared_4xi8_param_0];
1689; CHECK-NEXT:    ld.shared.u32 %r1, [%rd1];
1690; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
1691; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
1692; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
1693; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
1694; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
1695; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
1696; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
1697; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
1698; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
1699; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
1700; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
1701; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
1702; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
1703; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
1704; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
1705; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
1706; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
1707; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
1708; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
1709; CHECK-NEXT:    st.shared.u32 [%rd1], %r12;
1710; CHECK-NEXT:    ret;
1711  %a.load = load <4 x i8>, ptr addrspace(3) %a
1712  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
1713  store <4 x i8> %a.add, ptr addrspace(3) %a
1714  ret void
1715}
1716
; Non-volatile <8 x i8> round trip through addrspace(3): load, add 1 per lane, store.
; Expected PTX moves the vector as one ld/st.shared.v2.b32 pair; within each b32
; the bytes are extracted with bfe.u32, incremented as 16-bit values, and repacked
; with prmt.b32 (the high word %r2 is processed first, then %r1).
1717define void @shared_8xi8(ptr addrspace(3) %a) {
1718; CHECK-LABEL: shared_8xi8(
1719; CHECK:       {
1720; CHECK-NEXT:    .reg .b16 %rs<17>;
1721; CHECK-NEXT:    .reg .b32 %r<25>;
1722; CHECK-NEXT:    .reg .b64 %rd<2>;
1723; CHECK-EMPTY:
1724; CHECK-NEXT:  // %bb.0:
1725; CHECK-NEXT:    ld.param.u64 %rd1, [shared_8xi8_param_0];
1726; CHECK-NEXT:    ld.shared.v2.b32 {%r1, %r2}, [%rd1];
1727; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
1728; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
1729; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
1730; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
1731; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
1732; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
1733; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
1734; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
1735; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
1736; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
1737; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
1738; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
1739; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
1740; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
1741; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
1742; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
1743; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
1744; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
1745; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
1746; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
1747; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
1748; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
1749; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
1750; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
1751; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
1752; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
1753; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
1754; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
1755; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
1756; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
1757; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
1758; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
1759; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
1760; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
1761; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
1762; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
1763; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
1764; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
1765; CHECK-NEXT:    st.shared.v2.b32 [%rd1], {%r24, %r13};
1766; CHECK-NEXT:    ret;
1767  %a.load = load <8 x i8>, ptr addrspace(3) %a
1768  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
1769  store <8 x i8> %a.add, ptr addrspace(3) %a
1770  ret void
1771}
1772
; Non-volatile <16 x i8> round trip through addrspace(3): load, add 1 per lane, store.
; Expected PTX moves the vector as one ld/st.shared.v4.b32 pair; within each b32
; the bytes are extracted with bfe.u32, incremented as 16-bit values, and repacked
; with prmt.b32 (words are processed from %r4 down to %r1).
; CHECK lines are autogenerated; regenerate with utils/update_llc_test_checks.py.
1773define void @shared_16xi8(ptr addrspace(3) %a) {
1774; CHECK-LABEL: shared_16xi8(
1775; CHECK:       {
1776; CHECK-NEXT:    .reg .b16 %rs<33>;
1777; CHECK-NEXT:    .reg .b32 %r<49>;
1778; CHECK-NEXT:    .reg .b64 %rd<2>;
1779; CHECK-EMPTY:
1780; CHECK-NEXT:  // %bb.0:
1781; CHECK-NEXT:    ld.param.u64 %rd1, [shared_16xi8_param_0];
1782; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
1783; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
1784; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
1785; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
1786; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
1787; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
1788; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
1789; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
1790; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
1791; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
1792; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
1793; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
1794; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
1795; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
1796; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
1797; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
1798; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
1799; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
1800; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
1801; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
1802; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
1803; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
1804; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
1805; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
1806; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
1807; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
1808; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
1809; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
1810; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
1811; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
1812; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
1813; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
1814; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
1815; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
1816; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
1817; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
1818; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
1819; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
1820; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
1821; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
1822; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
1823; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
1824; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
1825; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
1826; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
1827; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
1828; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
1829; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
1830; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
1831; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
1832; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
1833; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
1834; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
1835; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
1836; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
1837; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
1838; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
1839; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
1840; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
1841; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
1842; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
1843; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
1844; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
1845; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
1846; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
1847; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
1848; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
1849; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
1850; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
1851; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
1852; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
1853; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
1854; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
1855; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
1856; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
1857; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
1858; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
1859; CHECK-NEXT:    st.shared.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
1860; CHECK-NEXT:    ret;
1861  %a.load = load <16 x i8>, ptr addrspace(3) %a
1862  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
1863  store <16 x i8> %a.add, ptr addrspace(3) %a
1864  ret void
1865}
1866
; Non-volatile <2 x i16> round trip through addrspace(3): load, add 1 per lane, store.
; Expected PTX moves the vector as one scalar ld/st.shared.u32 and uses mov.b32 to
; split it into two 16-bit halves and to repack the incremented halves.
1867define void @shared_2xi16(ptr addrspace(3) %a) {
1868; CHECK-LABEL: shared_2xi16(
1869; CHECK:       {
1870; CHECK-NEXT:    .reg .b16 %rs<5>;
1871; CHECK-NEXT:    .reg .b32 %r<3>;
1872; CHECK-NEXT:    .reg .b64 %rd<2>;
1873; CHECK-EMPTY:
1874; CHECK-NEXT:  // %bb.0:
1875; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xi16_param_0];
1876; CHECK-NEXT:    ld.shared.u32 %r1, [%rd1];
1877; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
1878; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
1879; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
1880; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
1881; CHECK-NEXT:    st.shared.u32 [%rd1], %r2;
1882; CHECK-NEXT:    ret;
1883  %a.load = load <2 x i16>, ptr addrspace(3) %a
1884  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
1885  store <2 x i16> %a.add, ptr addrspace(3) %a
1886  ret void
1887}
1888
; Non-volatile <4 x i16> round trip through addrspace(3): load, add 1 per lane, store.
; Expected PTX keeps the access as a single ld/st.shared.v4.u16 pair.
1889define void @shared_4xi16(ptr addrspace(3) %a) {
1890; CHECK-LABEL: shared_4xi16(
1891; CHECK:       {
1892; CHECK-NEXT:    .reg .b16 %rs<9>;
1893; CHECK-NEXT:    .reg .b64 %rd<2>;
1894; CHECK-EMPTY:
1895; CHECK-NEXT:  // %bb.0:
1896; CHECK-NEXT:    ld.param.u64 %rd1, [shared_4xi16_param_0];
1897; CHECK-NEXT:    ld.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
1898; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
1899; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
1900; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
1901; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
1902; CHECK-NEXT:    st.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
1903; CHECK-NEXT:    ret;
1904  %a.load = load <4 x i16>, ptr addrspace(3) %a
1905  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
1906  store <4 x i16> %a.add, ptr addrspace(3) %a
1907  ret void
1908}
1909
; Non-volatile <8 x i16> round trip through addrspace(3): load, add 1 per lane, store.
; Expected PTX uses one ld/st.shared.v4.b32 pair; each b32 is unpacked into two
; 16-bit halves with mov.b32, incremented, and repacked.
1910define void @shared_8xi16(ptr addrspace(3) %a) {
1911; CHECK-LABEL: shared_8xi16(
1912; CHECK:       {
1913; CHECK-NEXT:    .reg .b16 %rs<17>;
1914; CHECK-NEXT:    .reg .b32 %r<9>;
1915; CHECK-NEXT:    .reg .b64 %rd<2>;
1916; CHECK-EMPTY:
1917; CHECK-NEXT:  // %bb.0:
1918; CHECK-NEXT:    ld.param.u64 %rd1, [shared_8xi16_param_0];
1919; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
1920; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
1921; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
1922; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
1923; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
1924; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
1925; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
1926; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
1927; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
1928; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
1929; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
1930; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
1931; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
1932; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
1933; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
1934; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
1935; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
1936; CHECK-NEXT:    st.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
1937; CHECK-NEXT:    ret;
1938  %a.load = load <8 x i16>, ptr addrspace(3) %a
1939  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1940  store <8 x i16> %a.add, ptr addrspace(3) %a
1941  ret void
1942}
1943
; Non-volatile <2 x i32> round trip through addrspace(3): load, add 1 per lane, store.
; Expected PTX keeps the access as a single ld/st.shared.v2.u32 pair.
1944define void @shared_2xi32(ptr addrspace(3) %a) {
1945; CHECK-LABEL: shared_2xi32(
1946; CHECK:       {
1947; CHECK-NEXT:    .reg .b32 %r<5>;
1948; CHECK-NEXT:    .reg .b64 %rd<2>;
1949; CHECK-EMPTY:
1950; CHECK-NEXT:  // %bb.0:
1951; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xi32_param_0];
1952; CHECK-NEXT:    ld.shared.v2.u32 {%r1, %r2}, [%rd1];
1953; CHECK-NEXT:    add.s32 %r3, %r2, 1;
1954; CHECK-NEXT:    add.s32 %r4, %r1, 1;
1955; CHECK-NEXT:    st.shared.v2.u32 [%rd1], {%r4, %r3};
1956; CHECK-NEXT:    ret;
1957  %a.load = load <2 x i32>, ptr addrspace(3) %a
1958  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
1959  store <2 x i32> %a.add, ptr addrspace(3) %a
1960  ret void
1961}
1962
; Non-volatile <4 x i32> round trip through addrspace(3): load, add 1 per lane, store.
; Expected PTX keeps the access as a single ld/st.shared.v4.u32 pair.
1963define void @shared_4xi32(ptr addrspace(3) %a) {
1964; CHECK-LABEL: shared_4xi32(
1965; CHECK:       {
1966; CHECK-NEXT:    .reg .b32 %r<9>;
1967; CHECK-NEXT:    .reg .b64 %rd<2>;
1968; CHECK-EMPTY:
1969; CHECK-NEXT:  // %bb.0:
1970; CHECK-NEXT:    ld.param.u64 %rd1, [shared_4xi32_param_0];
1971; CHECK-NEXT:    ld.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
1972; CHECK-NEXT:    add.s32 %r5, %r4, 1;
1973; CHECK-NEXT:    add.s32 %r6, %r3, 1;
1974; CHECK-NEXT:    add.s32 %r7, %r2, 1;
1975; CHECK-NEXT:    add.s32 %r8, %r1, 1;
1976; CHECK-NEXT:    st.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
1977; CHECK-NEXT:    ret;
1978  %a.load = load <4 x i32>, ptr addrspace(3) %a
1979  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
1980  store <4 x i32> %a.add, ptr addrspace(3) %a
1981  ret void
1982}
1983
; Non-volatile <2 x i64> round trip through addrspace(3): load, add 1 per lane, store.
; Expected PTX keeps the access as a single ld/st.shared.v2.u64 pair.
1984define void @shared_2xi64(ptr addrspace(3) %a) {
1985; CHECK-LABEL: shared_2xi64(
1986; CHECK:       {
1987; CHECK-NEXT:    .reg .b64 %rd<6>;
1988; CHECK-EMPTY:
1989; CHECK-NEXT:  // %bb.0:
1990; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xi64_param_0];
1991; CHECK-NEXT:    ld.shared.v2.u64 {%rd2, %rd3}, [%rd1];
1992; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
1993; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
1994; CHECK-NEXT:    st.shared.v2.u64 [%rd1], {%rd5, %rd4};
1995; CHECK-NEXT:    ret;
1996  %a.load = load <2 x i64>, ptr addrspace(3) %a
1997  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
1998  store <2 x i64> %a.add, ptr addrspace(3) %a
1999  ret void
2000}
2001
; Non-volatile <2 x float> round trip through addrspace(3): load, fadd 1.0 per lane, store.
; Expected PTX keeps the access as a single ld/st.shared.v2.f32 pair.
2002define void @shared_2xfloat(ptr addrspace(3) %a) {
2003; CHECK-LABEL: shared_2xfloat(
2004; CHECK:       {
2005; CHECK-NEXT:    .reg .f32 %f<5>;
2006; CHECK-NEXT:    .reg .b64 %rd<2>;
2007; CHECK-EMPTY:
2008; CHECK-NEXT:  // %bb.0:
2009; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xfloat_param_0];
2010; CHECK-NEXT:    ld.shared.v2.f32 {%f1, %f2}, [%rd1];
2011; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
2012; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
2013; CHECK-NEXT:    st.shared.v2.f32 [%rd1], {%f4, %f3};
2014; CHECK-NEXT:    ret;
2015  %a.load = load <2 x float>, ptr addrspace(3) %a
2016  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
2017  store <2 x float> %a.add, ptr addrspace(3) %a
2018  ret void
2019}
2020
; Non-volatile <4 x float> round trip through addrspace(3): load, fadd 1.0 per lane, store.
; Expected PTX keeps the access as a single ld/st.shared.v4.f32 pair.
2021define void @shared_4xfloat(ptr addrspace(3) %a) {
2022; CHECK-LABEL: shared_4xfloat(
2023; CHECK:       {
2024; CHECK-NEXT:    .reg .f32 %f<9>;
2025; CHECK-NEXT:    .reg .b64 %rd<2>;
2026; CHECK-EMPTY:
2027; CHECK-NEXT:  // %bb.0:
2028; CHECK-NEXT:    ld.param.u64 %rd1, [shared_4xfloat_param_0];
2029; CHECK-NEXT:    ld.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
2030; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
2031; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
2032; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
2033; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
2034; CHECK-NEXT:    st.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
2035; CHECK-NEXT:    ret;
2036  %a.load = load <4 x float>, ptr addrspace(3) %a
2037  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
2038  store <4 x float> %a.add, ptr addrspace(3) %a
2039  ret void
2040}
2041
; Non-volatile <2 x double> round trip through addrspace(3): load, fadd 1.0 per lane, store.
; Expected PTX keeps the access as a single ld/st.shared.v2.f64 pair.
2042define void @shared_2xdouble(ptr addrspace(3) %a) {
2043; CHECK-LABEL: shared_2xdouble(
2044; CHECK:       {
2045; CHECK-NEXT:    .reg .b64 %rd<2>;
2046; CHECK-NEXT:    .reg .f64 %fd<5>;
2047; CHECK-EMPTY:
2048; CHECK-NEXT:  // %bb.0:
2049; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xdouble_param_0];
2050; CHECK-NEXT:    ld.shared.v2.f64 {%fd1, %fd2}, [%rd1];
2051; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
2052; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
2053; CHECK-NEXT:    st.shared.v2.f64 [%rd1], {%fd4, %fd3};
2054; CHECK-NEXT:    ret;
2055  %a.load = load <2 x double>, ptr addrspace(3) %a
2056  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
2057  store <2 x double> %a.add, ptr addrspace(3) %a
2058  ret void
2059}
2060
2061; shared_volatile
2062
; Volatile <2 x i8> round trip through addrspace(3): load, add 1 per lane, store.
; Expected PTX uses a ld/st.volatile.shared.v2.u8 pair, with the i8 lanes held in
; 16-bit registers and incremented with add.s16.
2063define void @shared_volatile_2xi8(ptr addrspace(3) %a) {
2064; CHECK-LABEL: shared_volatile_2xi8(
2065; CHECK:       {
2066; CHECK-NEXT:    .reg .b16 %rs<5>;
2067; CHECK-NEXT:    .reg .b64 %rd<2>;
2068; CHECK-EMPTY:
2069; CHECK-NEXT:  // %bb.0:
2070; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_2xi8_param_0];
2071; CHECK-NEXT:    ld.volatile.shared.v2.u8 {%rs1, %rs2}, [%rd1];
2072; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
2073; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
2074; CHECK-NEXT:    st.volatile.shared.v2.u8 [%rd1], {%rs4, %rs3};
2075; CHECK-NEXT:    ret;
2076  %a.load = load volatile <2 x i8>, ptr addrspace(3) %a
2077  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
2078  store volatile <2 x i8> %a.add, ptr addrspace(3) %a
2079  ret void
2080}
2081
; Volatile <4 x i8> round trip through addrspace(3): load, add 1 per lane, store.
; Expected PTX moves the whole vector as one scalar ld/st.volatile.shared.u32; each
; byte is extracted with bfe.u32, incremented as a 16-bit value, and the four
; results are repacked into one 32-bit register with prmt.b32.
2082define void @shared_volatile_4xi8(ptr addrspace(3) %a) {
2083; CHECK-LABEL: shared_volatile_4xi8(
2084; CHECK:       {
2085; CHECK-NEXT:    .reg .b16 %rs<9>;
2086; CHECK-NEXT:    .reg .b32 %r<13>;
2087; CHECK-NEXT:    .reg .b64 %rd<2>;
2088; CHECK-EMPTY:
2089; CHECK-NEXT:  // %bb.0:
2090; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_4xi8_param_0];
2091; CHECK-NEXT:    ld.volatile.shared.u32 %r1, [%rd1];
2092; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
2093; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
2094; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
2095; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
2096; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
2097; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
2098; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
2099; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
2100; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
2101; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
2102; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
2103; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
2104; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
2105; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
2106; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
2107; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
2108; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
2109; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
2110; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
2111; CHECK-NEXT:    st.volatile.shared.u32 [%rd1], %r12;
2112; CHECK-NEXT:    ret;
2113  %a.load = load volatile <4 x i8>, ptr addrspace(3) %a
2114  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
2115  store volatile <4 x i8> %a.add, ptr addrspace(3) %a
2116  ret void
2117}
2118
; Volatile <8 x i8> round trip through addrspace(3): load, add 1 per lane, store.
; Expected PTX moves the vector as one ld/st.volatile.shared.v2.b32 pair; within
; each b32 the bytes are extracted with bfe.u32, incremented as 16-bit values, and
; repacked with prmt.b32 (the high word %r2 is processed first, then %r1).
2119define void @shared_volatile_8xi8(ptr addrspace(3) %a) {
2120; CHECK-LABEL: shared_volatile_8xi8(
2121; CHECK:       {
2122; CHECK-NEXT:    .reg .b16 %rs<17>;
2123; CHECK-NEXT:    .reg .b32 %r<25>;
2124; CHECK-NEXT:    .reg .b64 %rd<2>;
2125; CHECK-EMPTY:
2126; CHECK-NEXT:  // %bb.0:
2127; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_8xi8_param_0];
2128; CHECK-NEXT:    ld.volatile.shared.v2.b32 {%r1, %r2}, [%rd1];
2129; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
2130; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
2131; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
2132; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
2133; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
2134; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
2135; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
2136; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
2137; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
2138; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
2139; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
2140; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
2141; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
2142; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
2143; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
2144; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
2145; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
2146; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
2147; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
2148; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
2149; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
2150; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
2151; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
2152; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
2153; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
2154; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
2155; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
2156; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
2157; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
2158; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
2159; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
2160; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
2161; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
2162; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
2163; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
2164; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
2165; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
2166; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
2167; CHECK-NEXT:    st.volatile.shared.v2.b32 [%rd1], {%r24, %r13};
2168; CHECK-NEXT:    ret;
2169  %a.load = load volatile <8 x i8>, ptr addrspace(3) %a
2170  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2171  store volatile <8 x i8> %a.add, ptr addrspace(3) %a
2172  ret void
2173}
2174
; Volatile <16 x i8> round trip through addrspace(3): load, add 1 per lane, store.
; Expected PTX moves the vector as one ld/st.volatile.shared.v4.b32 pair; within
; each b32 the bytes are extracted with bfe.u32, incremented as 16-bit values, and
; repacked with prmt.b32 (words are processed from %r4 down to %r1).
; CHECK lines are autogenerated; regenerate with utils/update_llc_test_checks.py.
2175define void @shared_volatile_16xi8(ptr addrspace(3) %a) {
2176; CHECK-LABEL: shared_volatile_16xi8(
2177; CHECK:       {
2178; CHECK-NEXT:    .reg .b16 %rs<33>;
2179; CHECK-NEXT:    .reg .b32 %r<49>;
2180; CHECK-NEXT:    .reg .b64 %rd<2>;
2181; CHECK-EMPTY:
2182; CHECK-NEXT:  // %bb.0:
2183; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_16xi8_param_0];
2184; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
2185; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
2186; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
2187; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
2188; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
2189; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
2190; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
2191; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
2192; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
2193; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
2194; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
2195; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
2196; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
2197; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
2198; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
2199; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
2200; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
2201; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
2202; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
2203; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
2204; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
2205; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
2206; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
2207; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
2208; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
2209; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
2210; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
2211; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
2212; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
2213; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
2214; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
2215; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
2216; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
2217; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
2218; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
2219; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
2220; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
2221; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
2222; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
2223; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
2224; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
2225; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
2226; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
2227; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
2228; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
2229; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
2230; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
2231; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
2232; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
2233; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
2234; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
2235; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
2236; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
2237; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
2238; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
2239; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
2240; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
2241; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
2242; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
2243; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
2244; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
2245; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
2246; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
2247; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
2248; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
2249; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
2250; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
2251; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
2252; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
2253; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
2254; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
2255; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
2256; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
2257; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
2258; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
2259; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
2260; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
2261; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
2262; CHECK-NEXT:    ret;
2263  %a.load = load volatile <16 x i8>, ptr addrspace(3) %a
2264  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2265  store volatile <16 x i8> %a.add, ptr addrspace(3) %a
2266  ret void
2267}
2268
; Volatile <2 x i16> round trip through addrspace(3): load, add 1 per lane, store.
; Expected PTX moves the vector as one scalar ld/st.volatile.shared.u32 and uses
; mov.b32 to split it into two 16-bit halves and to repack the incremented halves.
2269define void @shared_volatile_2xi16(ptr addrspace(3) %a) {
2270; CHECK-LABEL: shared_volatile_2xi16(
2271; CHECK:       {
2272; CHECK-NEXT:    .reg .b16 %rs<5>;
2273; CHECK-NEXT:    .reg .b32 %r<3>;
2274; CHECK-NEXT:    .reg .b64 %rd<2>;
2275; CHECK-EMPTY:
2276; CHECK-NEXT:  // %bb.0:
2277; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_2xi16_param_0];
2278; CHECK-NEXT:    ld.volatile.shared.u32 %r1, [%rd1];
2279; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
2280; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
2281; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
2282; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
2283; CHECK-NEXT:    st.volatile.shared.u32 [%rd1], %r2;
2284; CHECK-NEXT:    ret;
2285  %a.load = load volatile <2 x i16>, ptr addrspace(3) %a
2286  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
2287  store volatile <2 x i16> %a.add, ptr addrspace(3) %a
2288  ret void
2289}
2290
; Volatile load, per-lane add of 1, volatile store of <4 x i16> in addrspace(3).
; Expected PTX: one v4.u16 ld/st.volatile.shared with four scalar 16-bit adds
; in between; no 32-bit packing is involved for this width.
2291define void @shared_volatile_4xi16(ptr addrspace(3) %a) {
2292; CHECK-LABEL: shared_volatile_4xi16(
2293; CHECK:       {
2294; CHECK-NEXT:    .reg .b16 %rs<9>;
2295; CHECK-NEXT:    .reg .b64 %rd<2>;
2296; CHECK-EMPTY:
2297; CHECK-NEXT:  // %bb.0:
2298; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_4xi16_param_0];
2299; CHECK-NEXT:    ld.volatile.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
2300; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
2301; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
2302; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
2303; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
2304; CHECK-NEXT:    st.volatile.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
2305; CHECK-NEXT:    ret;
2306  %a.load = load volatile <4 x i16>, ptr addrspace(3) %a
2307  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
2308  store volatile <4 x i16> %a.add, ptr addrspace(3) %a
2309  ret void
2310}
2311
; Volatile load, per-lane add of 1, volatile store of <8 x i16> in addrspace(3).
; Expected PTX: one v4.b32 ld/st.volatile.shared; each 32-bit register holds two
; 16-bit lanes that are unpacked with mov.b32, incremented, and repacked.
2312define void @shared_volatile_8xi16(ptr addrspace(3) %a) {
2313; CHECK-LABEL: shared_volatile_8xi16(
2314; CHECK:       {
2315; CHECK-NEXT:    .reg .b16 %rs<17>;
2316; CHECK-NEXT:    .reg .b32 %r<9>;
2317; CHECK-NEXT:    .reg .b64 %rd<2>;
2318; CHECK-EMPTY:
2319; CHECK-NEXT:  // %bb.0:
2320; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_8xi16_param_0];
2321; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
2322; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
2323; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
2324; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
2325; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
2326; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
2327; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
2328; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
2329; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
2330; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
2331; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
2332; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
2333; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
2334; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
2335; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
2336; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
2337; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
2338; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
2339; CHECK-NEXT:    ret;
2340  %a.load = load volatile <8 x i16>, ptr addrspace(3) %a
2341  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
2342  store volatile <8 x i16> %a.add, ptr addrspace(3) %a
2343  ret void
2344}
2345
; Volatile load, per-lane add of 1, volatile store of <2 x i32> in addrspace(3).
; Expected PTX: one v2.u32 ld/st.volatile.shared with two 32-bit adds in between.
2346define void @shared_volatile_2xi32(ptr addrspace(3) %a) {
2347; CHECK-LABEL: shared_volatile_2xi32(
2348; CHECK:       {
2349; CHECK-NEXT:    .reg .b32 %r<5>;
2350; CHECK-NEXT:    .reg .b64 %rd<2>;
2351; CHECK-EMPTY:
2352; CHECK-NEXT:  // %bb.0:
2353; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_2xi32_param_0];
2354; CHECK-NEXT:    ld.volatile.shared.v2.u32 {%r1, %r2}, [%rd1];
2355; CHECK-NEXT:    add.s32 %r3, %r2, 1;
2356; CHECK-NEXT:    add.s32 %r4, %r1, 1;
2357; CHECK-NEXT:    st.volatile.shared.v2.u32 [%rd1], {%r4, %r3};
2358; CHECK-NEXT:    ret;
2359  %a.load = load volatile <2 x i32>, ptr addrspace(3) %a
2360  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
2361  store volatile <2 x i32> %a.add, ptr addrspace(3) %a
2362  ret void
2363}
2364
; Volatile load, per-lane add of 1, volatile store of <4 x i32> in addrspace(3).
; Expected PTX: one v4.u32 ld/st.volatile.shared with four 32-bit adds in between.
2365define void @shared_volatile_4xi32(ptr addrspace(3) %a) {
2366; CHECK-LABEL: shared_volatile_4xi32(
2367; CHECK:       {
2368; CHECK-NEXT:    .reg .b32 %r<9>;
2369; CHECK-NEXT:    .reg .b64 %rd<2>;
2370; CHECK-EMPTY:
2371; CHECK-NEXT:  // %bb.0:
2372; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_4xi32_param_0];
2373; CHECK-NEXT:    ld.volatile.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
2374; CHECK-NEXT:    add.s32 %r5, %r4, 1;
2375; CHECK-NEXT:    add.s32 %r6, %r3, 1;
2376; CHECK-NEXT:    add.s32 %r7, %r2, 1;
2377; CHECK-NEXT:    add.s32 %r8, %r1, 1;
2378; CHECK-NEXT:    st.volatile.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
2379; CHECK-NEXT:    ret;
2380  %a.load = load volatile <4 x i32>, ptr addrspace(3) %a
2381  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
2382  store volatile <4 x i32> %a.add, ptr addrspace(3) %a
2383  ret void
2384}
2385
; Volatile load, per-lane add of 1, volatile store of <2 x i64> in addrspace(3).
; Expected PTX: one v2.u64 ld/st.volatile.shared with two 64-bit adds in between.
; Only %rd registers are declared: the pointer and the data share that class.
2386define void @shared_volatile_2xi64(ptr addrspace(3) %a) {
2387; CHECK-LABEL: shared_volatile_2xi64(
2388; CHECK:       {
2389; CHECK-NEXT:    .reg .b64 %rd<6>;
2390; CHECK-EMPTY:
2391; CHECK-NEXT:  // %bb.0:
2392; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_2xi64_param_0];
2393; CHECK-NEXT:    ld.volatile.shared.v2.u64 {%rd2, %rd3}, [%rd1];
2394; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
2395; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
2396; CHECK-NEXT:    st.volatile.shared.v2.u64 [%rd1], {%rd5, %rd4};
2397; CHECK-NEXT:    ret;
2398  %a.load = load volatile <2 x i64>, ptr addrspace(3) %a
2399  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
2400  store volatile <2 x i64> %a.add, ptr addrspace(3) %a
2401  ret void
2402}
2403
; Volatile load, per-lane fadd of 1.0, volatile store of <2 x float> in addrspace(3).
; Expected PTX: one v2.f32 ld/st.volatile.shared; the constant 1.0 appears as the
; hex float immediate 0f3F800000 on each add.rn.f32.
2404define void @shared_volatile_2xfloat(ptr addrspace(3) %a) {
2405; CHECK-LABEL: shared_volatile_2xfloat(
2406; CHECK:       {
2407; CHECK-NEXT:    .reg .f32 %f<5>;
2408; CHECK-NEXT:    .reg .b64 %rd<2>;
2409; CHECK-EMPTY:
2410; CHECK-NEXT:  // %bb.0:
2411; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_2xfloat_param_0];
2412; CHECK-NEXT:    ld.volatile.shared.v2.f32 {%f1, %f2}, [%rd1];
2413; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
2414; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
2415; CHECK-NEXT:    st.volatile.shared.v2.f32 [%rd1], {%f4, %f3};
2416; CHECK-NEXT:    ret;
2417  %a.load = load volatile <2 x float>, ptr addrspace(3) %a
2418  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
2419  store volatile <2 x float> %a.add, ptr addrspace(3) %a
2420  ret void
2421}
2422
; Volatile load, per-lane fadd of 1.0, volatile store of <4 x float> in addrspace(3).
; Expected PTX: one v4.f32 ld/st.volatile.shared with four add.rn.f32 in between,
; each using the hex float immediate 0f3F800000 for 1.0.
2423define void @shared_volatile_4xfloat(ptr addrspace(3) %a) {
2424; CHECK-LABEL: shared_volatile_4xfloat(
2425; CHECK:       {
2426; CHECK-NEXT:    .reg .f32 %f<9>;
2427; CHECK-NEXT:    .reg .b64 %rd<2>;
2428; CHECK-EMPTY:
2429; CHECK-NEXT:  // %bb.0:
2430; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_4xfloat_param_0];
2431; CHECK-NEXT:    ld.volatile.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
2432; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
2433; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
2434; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
2435; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
2436; CHECK-NEXT:    st.volatile.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
2437; CHECK-NEXT:    ret;
2438  %a.load = load volatile <4 x float>, ptr addrspace(3) %a
2439  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
2440  store volatile <4 x float> %a.add, ptr addrspace(3) %a
2441  ret void
2442}
2443
; Volatile load, per-lane fadd of 1.0, volatile store of <2 x double> in addrspace(3).
; Expected PTX: one v2.f64 ld/st.volatile.shared; the constant 1.0 appears as the
; hex double immediate 0d3FF0000000000000 on each add.rn.f64.
2444define void @shared_volatile_2xdouble(ptr addrspace(3) %a) {
2445; CHECK-LABEL: shared_volatile_2xdouble(
2446; CHECK:       {
2447; CHECK-NEXT:    .reg .b64 %rd<2>;
2448; CHECK-NEXT:    .reg .f64 %fd<5>;
2449; CHECK-EMPTY:
2450; CHECK-NEXT:  // %bb.0:
2451; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_2xdouble_param_0];
2452; CHECK-NEXT:    ld.volatile.shared.v2.f64 {%fd1, %fd2}, [%rd1];
2453; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
2454; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
2455; CHECK-NEXT:    st.volatile.shared.v2.f64 [%rd1], {%fd4, %fd3};
2456; CHECK-NEXT:    ret;
2457  %a.load = load volatile <2 x double>, ptr addrspace(3) %a
2458  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
2459  store volatile <2 x double> %a.add, ptr addrspace(3) %a
2460  ret void
2461}
2462
2463;; local statespace
2464
2465; local
2466
; Plain (non-volatile) load, per-lane add of 1, store of <2 x i8> in addrspace(5).
; Expected PTX: one v2.u8 ld/st.local; the bytes live in 16-bit registers and
; are incremented with add.s16.
2467define void @local_2xi8(ptr addrspace(5) %a) {
2468; CHECK-LABEL: local_2xi8(
2469; CHECK:       {
2470; CHECK-NEXT:    .reg .b16 %rs<5>;
2471; CHECK-NEXT:    .reg .b64 %rd<2>;
2472; CHECK-EMPTY:
2473; CHECK-NEXT:  // %bb.0:
2474; CHECK-NEXT:    ld.param.u64 %rd1, [local_2xi8_param_0];
2475; CHECK-NEXT:    ld.local.v2.u8 {%rs1, %rs2}, [%rd1];
2476; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
2477; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
2478; CHECK-NEXT:    st.local.v2.u8 [%rd1], {%rs4, %rs3};
2479; CHECK-NEXT:    ret;
2480  %a.load = load <2 x i8>, ptr addrspace(5) %a
2481  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
2482  store <2 x i8> %a.add, ptr addrspace(5) %a
2483  ret void
2484}
2485
; Plain load, per-lane add of 1, store of <4 x i8> in addrspace(5).
; Expected PTX: the vector moves as one 32-bit ld/st.local word. Each byte is
; extracted with bfe.u32, narrowed to 16 bits, incremented, widened again, and
; the four results are merged back into one word with three prmt.b32.
2486define void @local_4xi8(ptr addrspace(5) %a) {
2487; CHECK-LABEL: local_4xi8(
2488; CHECK:       {
2489; CHECK-NEXT:    .reg .b16 %rs<9>;
2490; CHECK-NEXT:    .reg .b32 %r<13>;
2491; CHECK-NEXT:    .reg .b64 %rd<2>;
2492; CHECK-EMPTY:
2493; CHECK-NEXT:  // %bb.0:
2494; CHECK-NEXT:    ld.param.u64 %rd1, [local_4xi8_param_0];
2495; CHECK-NEXT:    ld.local.u32 %r1, [%rd1];
2496; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
2497; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
2498; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
2499; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
2500; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
2501; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
2502; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
2503; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
2504; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
2505; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
2506; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
2507; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
2508; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
2509; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
2510; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
2511; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
2512; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
2513; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
2514; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
2515; CHECK-NEXT:    st.local.u32 [%rd1], %r12;
2516; CHECK-NEXT:    ret;
2517  %a.load = load <4 x i8>, ptr addrspace(5) %a
2518  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
2519  store <4 x i8> %a.add, ptr addrspace(5) %a
2520  ret void
2521}
2522
; Plain load, per-lane add of 1, store of <8 x i8> in addrspace(5).
; Expected PTX: one v2.b32 ld/st.local. Each of the two 32-bit words goes through
; the same per-byte sequence (bfe.u32, 16-bit add, prmt.b32 repack), with the
; second loaded word (%r2) processed first.
2523define void @local_8xi8(ptr addrspace(5) %a) {
2524; CHECK-LABEL: local_8xi8(
2525; CHECK:       {
2526; CHECK-NEXT:    .reg .b16 %rs<17>;
2527; CHECK-NEXT:    .reg .b32 %r<25>;
2528; CHECK-NEXT:    .reg .b64 %rd<2>;
2529; CHECK-EMPTY:
2530; CHECK-NEXT:  // %bb.0:
2531; CHECK-NEXT:    ld.param.u64 %rd1, [local_8xi8_param_0];
2532; CHECK-NEXT:    ld.local.v2.b32 {%r1, %r2}, [%rd1];
2533; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
2534; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
2535; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
2536; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
2537; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
2538; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
2539; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
2540; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
2541; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
2542; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
2543; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
2544; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
2545; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
2546; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
2547; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
2548; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
2549; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
2550; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
2551; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
2552; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
2553; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
2554; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
2555; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
2556; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
2557; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
2558; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
2559; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
2560; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
2561; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
2562; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
2563; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
2564; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
2565; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
2566; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
2567; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
2568; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
2569; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
2570; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
2571; CHECK-NEXT:    st.local.v2.b32 [%rd1], {%r24, %r13};
2572; CHECK-NEXT:    ret;
2573  %a.load = load <8 x i8>, ptr addrspace(5) %a
2574  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2575  store <8 x i8> %a.add, ptr addrspace(5) %a
2576  ret void
2577}
2578
; Plain load, per-lane add of 1, store of <16 x i8> in addrspace(5).
; Expected PTX: one v4.b32 ld/st.local. Each of the four 32-bit words goes through
; the per-byte sequence (bfe.u32, 16-bit add, prmt.b32 repack), processed from
; the last loaded word (%r4) down to the first (%r1).
2579define void @local_16xi8(ptr addrspace(5) %a) {
2580; CHECK-LABEL: local_16xi8(
2581; CHECK:       {
2582; CHECK-NEXT:    .reg .b16 %rs<33>;
2583; CHECK-NEXT:    .reg .b32 %r<49>;
2584; CHECK-NEXT:    .reg .b64 %rd<2>;
2585; CHECK-EMPTY:
2586; CHECK-NEXT:  // %bb.0:
2587; CHECK-NEXT:    ld.param.u64 %rd1, [local_16xi8_param_0];
2588; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
2589; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
2590; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
2591; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
2592; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
2593; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
2594; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
2595; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
2596; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
2597; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
2598; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
2599; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
2600; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
2601; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
2602; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
2603; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
2604; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
2605; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
2606; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
2607; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
2608; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
2609; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
2610; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
2611; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
2612; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
2613; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
2614; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
2615; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
2616; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
2617; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
2618; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
2619; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
2620; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
2621; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
2622; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
2623; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
2624; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
2625; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
2626; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
2627; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
2628; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
2629; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
2630; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
2631; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
2632; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
2633; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
2634; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
2635; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
2636; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
2637; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
2638; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
2639; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
2640; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
2641; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
2642; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
2643; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
2644; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
2645; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
2646; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
2647; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
2648; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
2649; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
2650; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
2651; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
2652; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
2653; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
2654; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
2655; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
2656; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
2657; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
2658; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
2659; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
2660; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
2661; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
2662; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
2663; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
2664; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
2665; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
2666; CHECK-NEXT:    ret;
2667  %a.load = load <16 x i8>, ptr addrspace(5) %a
2668  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2669  store <16 x i8> %a.add, ptr addrspace(5) %a
2670  ret void
2671}
2672
; Plain load, per-lane add of 1, store of <2 x i16> in addrspace(5).
; Expected PTX: a single 32-bit ld/st.local; the two 16-bit lanes are split out
; of and packed back into the 32-bit register with mov.b32.
2673define void @local_2xi16(ptr addrspace(5) %a) {
2674; CHECK-LABEL: local_2xi16(
2675; CHECK:       {
2676; CHECK-NEXT:    .reg .b16 %rs<5>;
2677; CHECK-NEXT:    .reg .b32 %r<3>;
2678; CHECK-NEXT:    .reg .b64 %rd<2>;
2679; CHECK-EMPTY:
2680; CHECK-NEXT:  // %bb.0:
2681; CHECK-NEXT:    ld.param.u64 %rd1, [local_2xi16_param_0];
2682; CHECK-NEXT:    ld.local.u32 %r1, [%rd1];
2683; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
2684; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
2685; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
2686; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
2687; CHECK-NEXT:    st.local.u32 [%rd1], %r2;
2688; CHECK-NEXT:    ret;
2689  %a.load = load <2 x i16>, ptr addrspace(5) %a
2690  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
2691  store <2 x i16> %a.add, ptr addrspace(5) %a
2692  ret void
2693}
2694
; Plain load, per-lane add of 1, store of <4 x i16> in addrspace(5).
; Expected PTX: one v4.u16 ld/st.local with four scalar 16-bit adds in between.
2695define void @local_4xi16(ptr addrspace(5) %a) {
2696; CHECK-LABEL: local_4xi16(
2697; CHECK:       {
2698; CHECK-NEXT:    .reg .b16 %rs<9>;
2699; CHECK-NEXT:    .reg .b64 %rd<2>;
2700; CHECK-EMPTY:
2701; CHECK-NEXT:  // %bb.0:
2702; CHECK-NEXT:    ld.param.u64 %rd1, [local_4xi16_param_0];
2703; CHECK-NEXT:    ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
2704; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
2705; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
2706; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
2707; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
2708; CHECK-NEXT:    st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
2709; CHECK-NEXT:    ret;
2710  %a.load = load <4 x i16>, ptr addrspace(5) %a
2711  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
2712  store <4 x i16> %a.add, ptr addrspace(5) %a
2713  ret void
2714}
2715
; Plain load, per-lane add of 1, store of <8 x i16> in addrspace(5).
; Expected PTX: one v4.b32 ld/st.local; each 32-bit register holds two 16-bit
; lanes that are unpacked with mov.b32, incremented, and repacked.
2716define void @local_8xi16(ptr addrspace(5) %a) {
2717; CHECK-LABEL: local_8xi16(
2718; CHECK:       {
2719; CHECK-NEXT:    .reg .b16 %rs<17>;
2720; CHECK-NEXT:    .reg .b32 %r<9>;
2721; CHECK-NEXT:    .reg .b64 %rd<2>;
2722; CHECK-EMPTY:
2723; CHECK-NEXT:  // %bb.0:
2724; CHECK-NEXT:    ld.param.u64 %rd1, [local_8xi16_param_0];
2725; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
2726; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
2727; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
2728; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
2729; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
2730; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
2731; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
2732; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
2733; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
2734; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
2735; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
2736; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
2737; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
2738; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
2739; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
2740; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
2741; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
2742; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
2743; CHECK-NEXT:    ret;
2744  %a.load = load <8 x i16>, ptr addrspace(5) %a
2745  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
2746  store <8 x i16> %a.add, ptr addrspace(5) %a
2747  ret void
2748}
2749
; Plain load, per-lane add of 1, store of <2 x i32> in addrspace(5).
; Expected PTX: one v2.u32 ld/st.local with two 32-bit adds in between.
2750define void @local_2xi32(ptr addrspace(5) %a) {
2751; CHECK-LABEL: local_2xi32(
2752; CHECK:       {
2753; CHECK-NEXT:    .reg .b32 %r<5>;
2754; CHECK-NEXT:    .reg .b64 %rd<2>;
2755; CHECK-EMPTY:
2756; CHECK-NEXT:  // %bb.0:
2757; CHECK-NEXT:    ld.param.u64 %rd1, [local_2xi32_param_0];
2758; CHECK-NEXT:    ld.local.v2.u32 {%r1, %r2}, [%rd1];
2759; CHECK-NEXT:    add.s32 %r3, %r2, 1;
2760; CHECK-NEXT:    add.s32 %r4, %r1, 1;
2761; CHECK-NEXT:    st.local.v2.u32 [%rd1], {%r4, %r3};
2762; CHECK-NEXT:    ret;
2763  %a.load = load <2 x i32>, ptr addrspace(5) %a
2764  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
2765  store <2 x i32> %a.add, ptr addrspace(5) %a
2766  ret void
2767}
2768
; Plain load, per-lane add of 1, store of <4 x i32> in addrspace(5).
; Expected PTX: one v4.u32 ld/st.local with four 32-bit adds in between.
2769define void @local_4xi32(ptr addrspace(5) %a) {
2770; CHECK-LABEL: local_4xi32(
2771; CHECK:       {
2772; CHECK-NEXT:    .reg .b32 %r<9>;
2773; CHECK-NEXT:    .reg .b64 %rd<2>;
2774; CHECK-EMPTY:
2775; CHECK-NEXT:  // %bb.0:
2776; CHECK-NEXT:    ld.param.u64 %rd1, [local_4xi32_param_0];
2777; CHECK-NEXT:    ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
2778; CHECK-NEXT:    add.s32 %r5, %r4, 1;
2779; CHECK-NEXT:    add.s32 %r6, %r3, 1;
2780; CHECK-NEXT:    add.s32 %r7, %r2, 1;
2781; CHECK-NEXT:    add.s32 %r8, %r1, 1;
2782; CHECK-NEXT:    st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
2783; CHECK-NEXT:    ret;
2784  %a.load = load <4 x i32>, ptr addrspace(5) %a
2785  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
2786  store <4 x i32> %a.add, ptr addrspace(5) %a
2787  ret void
2788}
2789
; Plain load, per-lane add of 1, store of <2 x i64> in addrspace(5).
; Expected PTX: one v2.u64 ld/st.local with two 64-bit adds in between.
; Only %rd registers are declared: the pointer and the data share that class.
2790define void @local_2xi64(ptr addrspace(5) %a) {
2791; CHECK-LABEL: local_2xi64(
2792; CHECK:       {
2793; CHECK-NEXT:    .reg .b64 %rd<6>;
2794; CHECK-EMPTY:
2795; CHECK-NEXT:  // %bb.0:
2796; CHECK-NEXT:    ld.param.u64 %rd1, [local_2xi64_param_0];
2797; CHECK-NEXT:    ld.local.v2.u64 {%rd2, %rd3}, [%rd1];
2798; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
2799; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
2800; CHECK-NEXT:    st.local.v2.u64 [%rd1], {%rd5, %rd4};
2801; CHECK-NEXT:    ret;
2802  %a.load = load <2 x i64>, ptr addrspace(5) %a
2803  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
2804  store <2 x i64> %a.add, ptr addrspace(5) %a
2805  ret void
2806}
2807
; Plain load, per-lane fadd of 1.0, store of <2 x float> in addrspace(5).
; Expected PTX: one v2.f32 ld/st.local; the constant 1.0 appears as the hex
; float immediate 0f3F800000 on each add.rn.f32.
2808define void @local_2xfloat(ptr addrspace(5) %a) {
2809; CHECK-LABEL: local_2xfloat(
2810; CHECK:       {
2811; CHECK-NEXT:    .reg .f32 %f<5>;
2812; CHECK-NEXT:    .reg .b64 %rd<2>;
2813; CHECK-EMPTY:
2814; CHECK-NEXT:  // %bb.0:
2815; CHECK-NEXT:    ld.param.u64 %rd1, [local_2xfloat_param_0];
2816; CHECK-NEXT:    ld.local.v2.f32 {%f1, %f2}, [%rd1];
2817; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
2818; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
2819; CHECK-NEXT:    st.local.v2.f32 [%rd1], {%f4, %f3};
2820; CHECK-NEXT:    ret;
2821  %a.load = load <2 x float>, ptr addrspace(5) %a
2822  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
2823  store <2 x float> %a.add, ptr addrspace(5) %a
2824  ret void
2825}
2826
; Plain load, per-lane fadd of 1.0, store of <4 x float> in addrspace(5).
; Expected PTX: one v4.f32 ld/st.local with four add.rn.f32 in between, each
; using the hex float immediate 0f3F800000 for 1.0.
2827define void @local_4xfloat(ptr addrspace(5) %a) {
2828; CHECK-LABEL: local_4xfloat(
2829; CHECK:       {
2830; CHECK-NEXT:    .reg .f32 %f<9>;
2831; CHECK-NEXT:    .reg .b64 %rd<2>;
2832; CHECK-EMPTY:
2833; CHECK-NEXT:  // %bb.0:
2834; CHECK-NEXT:    ld.param.u64 %rd1, [local_4xfloat_param_0];
2835; CHECK-NEXT:    ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
2836; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
2837; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
2838; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
2839; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
2840; CHECK-NEXT:    st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
2841; CHECK-NEXT:    ret;
2842  %a.load = load <4 x float>, ptr addrspace(5) %a
2843  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
2844  store <4 x float> %a.add, ptr addrspace(5) %a
2845  ret void
2846}
2847
; Plain load, per-lane fadd of 1.0, store of <2 x double> in addrspace(5).
; Expected PTX: one v2.f64 ld/st.local; the constant 1.0 appears as the hex
; double immediate 0d3FF0000000000000 on each add.rn.f64.
2848define void @local_2xdouble(ptr addrspace(5) %a) {
2849; CHECK-LABEL: local_2xdouble(
2850; CHECK:       {
2851; CHECK-NEXT:    .reg .b64 %rd<2>;
2852; CHECK-NEXT:    .reg .f64 %fd<5>;
2853; CHECK-EMPTY:
2854; CHECK-NEXT:  // %bb.0:
2855; CHECK-NEXT:    ld.param.u64 %rd1, [local_2xdouble_param_0];
2856; CHECK-NEXT:    ld.local.v2.f64 {%fd1, %fd2}, [%rd1];
2857; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
2858; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
2859; CHECK-NEXT:    st.local.v2.f64 [%rd1], {%fd4, %fd3};
2860; CHECK-NEXT:    ret;
2861  %a.load = load <2 x double>, ptr addrspace(5) %a
2862  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
2863  store <2 x double> %a.add, ptr addrspace(5) %a
2864  ret void
2865}
2866
2867; local_volatile
2868
; Volatile load, per-lane add of 1, volatile store of <2 x i8> in addrspace(5).
; Note: although the IR accesses are volatile, the expected PTX below uses plain
; ld.local / st.local with no .volatile qualifier, i.e. the same instructions as
; the non-volatile local_2xi8 test.
2869define void @local_volatile_2xi8(ptr addrspace(5) %a) {
2870; CHECK-LABEL: local_volatile_2xi8(
2871; CHECK:       {
2872; CHECK-NEXT:    .reg .b16 %rs<5>;
2873; CHECK-NEXT:    .reg .b64 %rd<2>;
2874; CHECK-EMPTY:
2875; CHECK-NEXT:  // %bb.0:
2876; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_2xi8_param_0];
2877; CHECK-NEXT:    ld.local.v2.u8 {%rs1, %rs2}, [%rd1];
2878; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
2879; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
2880; CHECK-NEXT:    st.local.v2.u8 [%rd1], {%rs4, %rs3};
2881; CHECK-NEXT:    ret;
2882  %a.load = load volatile <2 x i8>, ptr addrspace(5) %a
2883  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
2884  store volatile <2 x i8> %a.add, ptr addrspace(5) %a
2885  ret void
2886}
2887
; Volatile load, per-lane add of 1, volatile store of <4 x i8> in addrspace(5).
; Note: the expected PTX uses plain ld.local.u32 / st.local.u32 with no
; .volatile qualifier. The byte handling in between is the bfe.u32 extract,
; 16-bit add, and prmt.b32 repack sequence.
2888define void @local_volatile_4xi8(ptr addrspace(5) %a) {
2889; CHECK-LABEL: local_volatile_4xi8(
2890; CHECK:       {
2891; CHECK-NEXT:    .reg .b16 %rs<9>;
2892; CHECK-NEXT:    .reg .b32 %r<13>;
2893; CHECK-NEXT:    .reg .b64 %rd<2>;
2894; CHECK-EMPTY:
2895; CHECK-NEXT:  // %bb.0:
2896; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_4xi8_param_0];
2897; CHECK-NEXT:    ld.local.u32 %r1, [%rd1];
2898; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
2899; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
2900; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
2901; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
2902; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
2903; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
2904; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
2905; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
2906; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
2907; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
2908; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
2909; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
2910; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
2911; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
2912; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
2913; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
2914; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
2915; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
2916; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
2917; CHECK-NEXT:    st.local.u32 [%rd1], %r12;
2918; CHECK-NEXT:    ret;
2919  %a.load = load volatile <4 x i8>, ptr addrspace(5) %a
2920  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
2921  store volatile <4 x i8> %a.add, ptr addrspace(5) %a
2922  ret void
2923}
2924
; Volatile load, per-lane add of 1, volatile store of <8 x i8> in addrspace(5).
; Note: the expected PTX uses plain ld.local.v2.b32 / st.local.v2.b32 with no
; .volatile qualifier. Each 32-bit word goes through the per-byte bfe.u32
; extract, 16-bit add, and prmt.b32 repack sequence.
2925define void @local_volatile_8xi8(ptr addrspace(5) %a) {
2926; CHECK-LABEL: local_volatile_8xi8(
2927; CHECK:       {
2928; CHECK-NEXT:    .reg .b16 %rs<17>;
2929; CHECK-NEXT:    .reg .b32 %r<25>;
2930; CHECK-NEXT:    .reg .b64 %rd<2>;
2931; CHECK-EMPTY:
2932; CHECK-NEXT:  // %bb.0:
2933; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_8xi8_param_0];
2934; CHECK-NEXT:    ld.local.v2.b32 {%r1, %r2}, [%rd1];
2935; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
2936; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
2937; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
2938; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
2939; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
2940; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
2941; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
2942; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
2943; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
2944; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
2945; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
2946; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
2947; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
2948; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
2949; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
2950; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
2951; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
2952; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
2953; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
2954; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
2955; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
2956; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
2957; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
2958; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
2959; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
2960; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
2961; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
2962; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
2963; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
2964; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
2965; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
2966; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
2967; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
2968; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
2969; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
2970; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
2971; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
2972; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
2973; CHECK-NEXT:    st.local.v2.b32 [%rd1], {%r24, %r13};
2974; CHECK-NEXT:    ret;
2975  %a.load = load volatile <8 x i8>, ptr addrspace(5) %a
2976  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2977  store volatile <8 x i8> %a.add, ptr addrspace(5) %a
2978  ret void
2979}
2980
; Volatile load, per-lane add of 1, volatile store of <16 x i8> in addrspace(5).
; Note: the expected PTX uses plain ld.local.v4.b32 / st.local.v4.b32 with no
; .volatile qualifier. Each of the four 32-bit words goes through the per-byte
; bfe.u32 extract, 16-bit add, and prmt.b32 repack sequence.
2981define void @local_volatile_16xi8(ptr addrspace(5) %a) {
2982; CHECK-LABEL: local_volatile_16xi8(
2983; CHECK:       {
2984; CHECK-NEXT:    .reg .b16 %rs<33>;
2985; CHECK-NEXT:    .reg .b32 %r<49>;
2986; CHECK-NEXT:    .reg .b64 %rd<2>;
2987; CHECK-EMPTY:
2988; CHECK-NEXT:  // %bb.0:
2989; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_16xi8_param_0];
2990; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
2991; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
2992; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
2993; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
2994; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
2995; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
2996; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
2997; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
2998; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
2999; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
3000; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
3001; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
3002; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
3003; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
3004; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
3005; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
3006; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
3007; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
3008; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
3009; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
3010; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
3011; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
3012; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
3013; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
3014; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
3015; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
3016; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
3017; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
3018; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
3019; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
3020; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
3021; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
3022; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
3023; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
3024; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
3025; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
3026; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
3027; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
3028; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
3029; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
3030; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
3031; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
3032; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
3033; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
3034; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
3035; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
3036; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
3037; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
3038; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
3039; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
3040; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
3041; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
3042; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
3043; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
3044; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
3045; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
3046; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
3047; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
3048; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
3049; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
3050; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
3051; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
3052; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
3053; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
3054; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
3055; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
3056; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
3057; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
3058; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
3059; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
3060; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
3061; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
3062; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
3063; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
3064; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
3065; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
3066; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
3067; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
3068; CHECK-NEXT:    ret;
3069  %a.load = load volatile <16 x i8>, ptr addrspace(5) %a
3070  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
3071  store volatile <16 x i8> %a.add, ptr addrspace(5) %a
3072  ret void
3073}
3074
; Volatile load / add 1 / volatile store of a <2 x i16> through an
; addrspace(5) pointer. The expected PTX moves the whole vector as a single
; 32-bit scalar (ld.local.u32 / st.local.u32) and uses mov.b32 to unpack it
; into two 16-bit registers and to repack the incremented halves.
; Note: the expected ld.local / st.local carry no .volatile qualifier even
; though the IR accesses are volatile.
3075define void @local_volatile_2xi16(ptr addrspace(5) %a) {
3076; CHECK-LABEL: local_volatile_2xi16(
3077; CHECK:       {
3078; CHECK-NEXT:    .reg .b16 %rs<5>;
3079; CHECK-NEXT:    .reg .b32 %r<3>;
3080; CHECK-NEXT:    .reg .b64 %rd<2>;
3081; CHECK-EMPTY:
3082; CHECK-NEXT:  // %bb.0:
3083; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_2xi16_param_0];
3084; CHECK-NEXT:    ld.local.u32 %r1, [%rd1];
3085; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
3086; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
3087; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
3088; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
3089; CHECK-NEXT:    st.local.u32 [%rd1], %r2;
3090; CHECK-NEXT:    ret;
3091  %a.load = load volatile <2 x i16>, ptr addrspace(5) %a
3092  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
3093  store volatile <2 x i16> %a.add, ptr addrspace(5) %a
3094  ret void
3095}
3096
; Volatile load / add 1 / volatile store of a <4 x i16> through an
; addrspace(5) pointer. Unlike the <2 x i16> and <8 x i16> cases, the expected
; PTX uses a true 16-bit vector access (ld.local.v4.u16 / st.local.v4.u16)
; with one add.s16 per lane and no 32-bit packing.
; Note: the expected ld.local / st.local carry no .volatile qualifier even
; though the IR accesses are volatile.
3097define void @local_volatile_4xi16(ptr addrspace(5) %a) {
3098; CHECK-LABEL: local_volatile_4xi16(
3099; CHECK:       {
3100; CHECK-NEXT:    .reg .b16 %rs<9>;
3101; CHECK-NEXT:    .reg .b64 %rd<2>;
3102; CHECK-EMPTY:
3103; CHECK-NEXT:  // %bb.0:
3104; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_4xi16_param_0];
3105; CHECK-NEXT:    ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
3106; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
3107; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
3108; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
3109; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
3110; CHECK-NEXT:    st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
3111; CHECK-NEXT:    ret;
3112  %a.load = load volatile <4 x i16>, ptr addrspace(5) %a
3113  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
3114  store volatile <4 x i16> %a.add, ptr addrspace(5) %a
3115  ret void
3116}
3117
; Volatile load / add 1 / volatile store of an <8 x i16> through an
; addrspace(5) pointer. The expected PTX transfers the vector as four 32-bit
; words (ld.local.v4.b32 / st.local.v4.b32); each word is split with mov.b32
; into two 16-bit registers, both halves are incremented with add.s16, and the
; pair is repacked with mov.b32 before the store.
; Note: the expected ld.local / st.local carry no .volatile qualifier even
; though the IR accesses are volatile.
3118define void @local_volatile_8xi16(ptr addrspace(5) %a) {
3119; CHECK-LABEL: local_volatile_8xi16(
3120; CHECK:       {
3121; CHECK-NEXT:    .reg .b16 %rs<17>;
3122; CHECK-NEXT:    .reg .b32 %r<9>;
3123; CHECK-NEXT:    .reg .b64 %rd<2>;
3124; CHECK-EMPTY:
3125; CHECK-NEXT:  // %bb.0:
3126; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_8xi16_param_0];
3127; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
3128; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
3129; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
3130; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
3131; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
3132; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
3133; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
3134; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
3135; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
3136; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
3137; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
3138; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
3139; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
3140; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
3141; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
3142; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
3143; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
3144; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
3145; CHECK-NEXT:    ret;
3146  %a.load = load volatile <8 x i16>, ptr addrspace(5) %a
3147  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
3148  store volatile <8 x i16> %a.add, ptr addrspace(5) %a
3149  ret void
3150}
3151
; Volatile load / add 1 / volatile store of a <2 x i32> through an
; addrspace(5) pointer. The expected PTX is a two-element vector access
; (ld.local.v2.u32 / st.local.v2.u32) with one add.s32 per lane.
; Note: the expected ld.local / st.local carry no .volatile qualifier even
; though the IR accesses are volatile.
3152define void @local_volatile_2xi32(ptr addrspace(5) %a) {
3153; CHECK-LABEL: local_volatile_2xi32(
3154; CHECK:       {
3155; CHECK-NEXT:    .reg .b32 %r<5>;
3156; CHECK-NEXT:    .reg .b64 %rd<2>;
3157; CHECK-EMPTY:
3158; CHECK-NEXT:  // %bb.0:
3159; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_2xi32_param_0];
3160; CHECK-NEXT:    ld.local.v2.u32 {%r1, %r2}, [%rd1];
3161; CHECK-NEXT:    add.s32 %r3, %r2, 1;
3162; CHECK-NEXT:    add.s32 %r4, %r1, 1;
3163; CHECK-NEXT:    st.local.v2.u32 [%rd1], {%r4, %r3};
3164; CHECK-NEXT:    ret;
3165  %a.load = load volatile <2 x i32>, ptr addrspace(5) %a
3166  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
3167  store volatile <2 x i32> %a.add, ptr addrspace(5) %a
3168  ret void
3169}
3170
; Volatile load / add 1 / volatile store of a <4 x i32> through an
; addrspace(5) pointer. The expected PTX is a four-element vector access
; (ld.local.v4.u32 / st.local.v4.u32) with one add.s32 per lane.
; Note: the expected ld.local / st.local carry no .volatile qualifier even
; though the IR accesses are volatile.
3171define void @local_volatile_4xi32(ptr addrspace(5) %a) {
3172; CHECK-LABEL: local_volatile_4xi32(
3173; CHECK:       {
3174; CHECK-NEXT:    .reg .b32 %r<9>;
3175; CHECK-NEXT:    .reg .b64 %rd<2>;
3176; CHECK-EMPTY:
3177; CHECK-NEXT:  // %bb.0:
3178; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_4xi32_param_0];
3179; CHECK-NEXT:    ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
3180; CHECK-NEXT:    add.s32 %r5, %r4, 1;
3181; CHECK-NEXT:    add.s32 %r6, %r3, 1;
3182; CHECK-NEXT:    add.s32 %r7, %r2, 1;
3183; CHECK-NEXT:    add.s32 %r8, %r1, 1;
3184; CHECK-NEXT:    st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
3185; CHECK-NEXT:    ret;
3186  %a.load = load volatile <4 x i32>, ptr addrspace(5) %a
3187  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
3188  store volatile <4 x i32> %a.add, ptr addrspace(5) %a
3189  ret void
3190}
3191
; Volatile load / add 1 / volatile store of a <2 x i64> through an
; addrspace(5) pointer. The expected PTX is a two-element 64-bit vector access
; (ld.local.v2.u64 / st.local.v2.u64) with one add.s64 per lane; the pointer
; and both lanes all live in the %rd register class.
; Note: the expected ld.local / st.local carry no .volatile qualifier even
; though the IR accesses are volatile.
3192define void @local_volatile_2xi64(ptr addrspace(5) %a) {
3193; CHECK-LABEL: local_volatile_2xi64(
3194; CHECK:       {
3195; CHECK-NEXT:    .reg .b64 %rd<6>;
3196; CHECK-EMPTY:
3197; CHECK-NEXT:  // %bb.0:
3198; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_2xi64_param_0];
3199; CHECK-NEXT:    ld.local.v2.u64 {%rd2, %rd3}, [%rd1];
3200; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
3201; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
3202; CHECK-NEXT:    st.local.v2.u64 [%rd1], {%rd5, %rd4};
3203; CHECK-NEXT:    ret;
3204  %a.load = load volatile <2 x i64>, ptr addrspace(5) %a
3205  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
3206  store volatile <2 x i64> %a.add, ptr addrspace(5) %a
3207  ret void
3208}
3209
; Volatile load / fadd 1.0 / volatile store of a <2 x float> through an
; addrspace(5) pointer. The expected PTX is a two-element vector access
; (ld.local.v2.f32 / st.local.v2.f32) with one add.rn.f32 per lane; the
; immediate 0f3F800000 is the single-precision bit pattern of 1.0.
; Note: the expected ld.local / st.local carry no .volatile qualifier even
; though the IR accesses are volatile.
3210define void @local_volatile_2xfloat(ptr addrspace(5) %a) {
3211; CHECK-LABEL: local_volatile_2xfloat(
3212; CHECK:       {
3213; CHECK-NEXT:    .reg .f32 %f<5>;
3214; CHECK-NEXT:    .reg .b64 %rd<2>;
3215; CHECK-EMPTY:
3216; CHECK-NEXT:  // %bb.0:
3217; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_2xfloat_param_0];
3218; CHECK-NEXT:    ld.local.v2.f32 {%f1, %f2}, [%rd1];
3219; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
3220; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
3221; CHECK-NEXT:    st.local.v2.f32 [%rd1], {%f4, %f3};
3222; CHECK-NEXT:    ret;
3223  %a.load = load volatile <2 x float>, ptr addrspace(5) %a
3224  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
3225  store volatile <2 x float> %a.add, ptr addrspace(5) %a
3226  ret void
3227}
3228
; Volatile load / fadd 1.0 / volatile store of a <4 x float> through an
; addrspace(5) pointer. The expected PTX is a four-element vector access
; (ld.local.v4.f32 / st.local.v4.f32) with one add.rn.f32 per lane; the
; immediate 0f3F800000 is the single-precision bit pattern of 1.0.
; Note: the expected ld.local / st.local carry no .volatile qualifier even
; though the IR accesses are volatile.
3229define void @local_volatile_4xfloat(ptr addrspace(5) %a) {
3230; CHECK-LABEL: local_volatile_4xfloat(
3231; CHECK:       {
3232; CHECK-NEXT:    .reg .f32 %f<9>;
3233; CHECK-NEXT:    .reg .b64 %rd<2>;
3234; CHECK-EMPTY:
3235; CHECK-NEXT:  // %bb.0:
3236; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_4xfloat_param_0];
3237; CHECK-NEXT:    ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
3238; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
3239; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
3240; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
3241; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
3242; CHECK-NEXT:    st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
3243; CHECK-NEXT:    ret;
3244  %a.load = load volatile <4 x float>, ptr addrspace(5) %a
3245  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
3246  store volatile <4 x float> %a.add, ptr addrspace(5) %a
3247  ret void
3248}
3249
; Volatile load / fadd 1.0 / volatile store of a <2 x double> through an
; addrspace(5) pointer. The expected PTX is a two-element vector access
; (ld.local.v2.f64 / st.local.v2.f64) with one add.rn.f64 per lane; the
; immediate 0d3FF0000000000000 is the double-precision bit pattern of 1.0.
; Note: the expected ld.local / st.local carry no .volatile qualifier even
; though the IR accesses are volatile.
3250define void @local_volatile_2xdouble(ptr addrspace(5) %a) {
3251; CHECK-LABEL: local_volatile_2xdouble(
3252; CHECK:       {
3253; CHECK-NEXT:    .reg .b64 %rd<2>;
3254; CHECK-NEXT:    .reg .f64 %fd<5>;
3255; CHECK-EMPTY:
3256; CHECK-NEXT:  // %bb.0:
3257; CHECK-NEXT:    ld.param.u64 %rd1, [local_volatile_2xdouble_param_0];
3258; CHECK-NEXT:    ld.local.v2.f64 {%fd1, %fd2}, [%rd1];
3259; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
3260; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
3261; CHECK-NEXT:    st.local.v2.f64 [%rd1], {%fd4, %fd3};
3262; CHECK-NEXT:    ret;
3263  %a.load = load volatile <2 x double>, ptr addrspace(5) %a
3264  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
3265  store volatile <2 x double> %a.add, ptr addrspace(5) %a
3266  ret void
3267}
3268