xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=-unaligned-access-mode < %s | FileCheck --check-prefix=GFX7-ALIGNED %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX7-UNALIGNED %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX9 %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX11 %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX12 %s
8
9; Should not merge this to a dword load unless unaligned-access-mode is enabled (both i16 loads are only 2-byte aligned); without it the two 16-bit loads are kept and combined with a shift and an or.
10define i32 @global_load_2xi16_align2(ptr addrspace(1) %p) #0 {
11; GFX7-ALIGNED-LABEL: global_load_2xi16_align2:
12; GFX7-ALIGNED:       ; %bb.0:
13; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
15; GFX7-ALIGNED-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
16; GFX7-ALIGNED-NEXT:    flat_load_ushort v2, v[2:3]
17; GFX7-ALIGNED-NEXT:    flat_load_ushort v0, v[0:1]
18; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(1)
19; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
20; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
21; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
22; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
23;
24; GFX7-UNALIGNED-LABEL: global_load_2xi16_align2:
25; GFX7-UNALIGNED:       ; %bb.0:
26; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27; GFX7-UNALIGNED-NEXT:    flat_load_dword v0, v[0:1]
28; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
29; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
30;
31; GFX9-LABEL: global_load_2xi16_align2:
32; GFX9:       ; %bb.0:
33; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34; GFX9-NEXT:    global_load_dword v0, v[0:1], off
35; GFX9-NEXT:    s_waitcnt vmcnt(0)
36; GFX9-NEXT:    s_setpc_b64 s[30:31]
37;
38; GFX10-LABEL: global_load_2xi16_align2:
39; GFX10:       ; %bb.0:
40; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41; GFX10-NEXT:    global_load_dword v0, v[0:1], off
42; GFX10-NEXT:    s_waitcnt vmcnt(0)
43; GFX10-NEXT:    s_setpc_b64 s[30:31]
44;
45; GFX11-LABEL: global_load_2xi16_align2:
46; GFX11:       ; %bb.0:
47; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
48; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
49; GFX11-NEXT:    s_waitcnt vmcnt(0)
50; GFX11-NEXT:    s_setpc_b64 s[30:31]
51;
52; GFX12-LABEL: global_load_2xi16_align2:
53; GFX12:       ; %bb.0:
54; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
55; GFX12-NEXT:    s_wait_expcnt 0x0
56; GFX12-NEXT:    s_wait_samplecnt 0x0
57; GFX12-NEXT:    s_wait_bvhcnt 0x0
58; GFX12-NEXT:    s_wait_kmcnt 0x0
59; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
60; GFX12-NEXT:    s_wait_loadcnt 0x0
61; GFX12-NEXT:    s_setpc_b64 s[30:31]
62  %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1 ; address of the second i16, 2 bytes past %p
63  %p.0 = load i16, ptr addrspace(1) %p, align 2
64  %p.1 = load i16, ptr addrspace(1) %gep.p, align 2
65  %zext.0 = zext i16 %p.0 to i32
66  %zext.1 = zext i16 %p.1 to i32
67  %shl.1 = shl i32 %zext.1, 16 ; second i16 goes into the high half
68  %or = or i32 %zext.0, %shl.1 ; combine both halves into one i32
69  ret i32 %or
70}
71
72; Should not merge this to a dword store unless unaligned-access-mode is enabled (both i16 stores are only 2-byte aligned); without it two 16-bit stores are emitted. Only %r is used, %p is an unused kernel argument.
73define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
74; GFX7-ALIGNED-LABEL: global_store_2xi16_align2:
75; GFX7-ALIGNED:       ; %bb.0:
76; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x2
77; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 1
78; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
79; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
80; GFX7-ALIGNED-NEXT:    s_add_u32 s2, s0, 2
81; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s1
82; GFX7-ALIGNED-NEXT:    s_addc_u32 s3, s1, 0
83; GFX7-ALIGNED-NEXT:    flat_store_short v[0:1], v2
84; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s2
85; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 2
86; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s3
87; GFX7-ALIGNED-NEXT:    flat_store_short v[0:1], v2
88; GFX7-ALIGNED-NEXT:    s_endpgm
89;
90; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2:
91; GFX7-UNALIGNED:       ; %bb.0:
92; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x2
93; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
94; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
96; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
97; GFX7-UNALIGNED-NEXT:    flat_store_dword v[0:1], v2
98; GFX7-UNALIGNED-NEXT:    s_endpgm
99;
100; GFX9-LABEL: global_store_2xi16_align2:
101; GFX9:       ; %bb.0:
102; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
103; GFX9-NEXT:    v_mov_b32_e32 v0, 0
104; GFX9-NEXT:    v_mov_b32_e32 v1, 0x20001
105; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
106; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
107; GFX9-NEXT:    s_endpgm
108;
109; GFX10-LABEL: global_store_2xi16_align2:
110; GFX10:       ; %bb.0:
111; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
112; GFX10-NEXT:    v_mov_b32_e32 v0, 0
113; GFX10-NEXT:    v_mov_b32_e32 v1, 0x20001
114; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
115; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
116; GFX10-NEXT:    s_endpgm
117;
118; GFX11-LABEL: global_store_2xi16_align2:
119; GFX11:       ; %bb.0:
120; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
121; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
122; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
123; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
124; GFX11-NEXT:    s_endpgm
125;
126; GFX12-LABEL: global_store_2xi16_align2:
127; GFX12:       ; %bb.0:
128; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
129; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
130; GFX12-NEXT:    s_wait_kmcnt 0x0
131; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
132; GFX12-NEXT:    s_endpgm
133  %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1 ; address of the second i16, 2 bytes past %r
134  store i16 1, ptr addrspace(1) %r, align 2 ; low half of the merged 0x20001 constant
135  store i16 2, ptr addrspace(1) %gep.r, align 2 ; high half of the merged 0x20001 constant
136  ret void
137}
138
139; Should produce a single align 1 dword load when unaligned-access-mode makes it legal; without it the two byte-aligned i16 loads are split into four byte loads and reassembled with shifts and ors.
140define i32 @global_load_2xi16_align1(ptr addrspace(1) %p) #0 {
141; GFX7-ALIGNED-LABEL: global_load_2xi16_align1:
142; GFX7-ALIGNED:       ; %bb.0:
143; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
144; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
145; GFX7-ALIGNED-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
146; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
147; GFX7-ALIGNED-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
148; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v6, vcc, 3, v0
149; GFX7-ALIGNED-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
150; GFX7-ALIGNED-NEXT:    flat_load_ubyte v6, v[6:7]
151; GFX7-ALIGNED-NEXT:    flat_load_ubyte v4, v[4:5]
152; GFX7-ALIGNED-NEXT:    flat_load_ubyte v2, v[2:3]
153; GFX7-ALIGNED-NEXT:    flat_load_ubyte v0, v[0:1]
154; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(3)
155; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
156; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(2)
157; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 8, v4
158; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(1)
159; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
160; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
161; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v1, v0
162; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v1, v3, v2
163; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
164; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
165;
166; GFX7-UNALIGNED-LABEL: global_load_2xi16_align1:
167; GFX7-UNALIGNED:       ; %bb.0:
168; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
169; GFX7-UNALIGNED-NEXT:    flat_load_dword v0, v[0:1]
170; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
171; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
172;
173; GFX9-LABEL: global_load_2xi16_align1:
174; GFX9:       ; %bb.0:
175; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
176; GFX9-NEXT:    global_load_dword v0, v[0:1], off
177; GFX9-NEXT:    s_waitcnt vmcnt(0)
178; GFX9-NEXT:    s_setpc_b64 s[30:31]
179;
180; GFX10-LABEL: global_load_2xi16_align1:
181; GFX10:       ; %bb.0:
182; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
183; GFX10-NEXT:    global_load_dword v0, v[0:1], off
184; GFX10-NEXT:    s_waitcnt vmcnt(0)
185; GFX10-NEXT:    s_setpc_b64 s[30:31]
186;
187; GFX11-LABEL: global_load_2xi16_align1:
188; GFX11:       ; %bb.0:
189; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
191; GFX11-NEXT:    s_waitcnt vmcnt(0)
192; GFX11-NEXT:    s_setpc_b64 s[30:31]
193;
194; GFX12-LABEL: global_load_2xi16_align1:
195; GFX12:       ; %bb.0:
196; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
197; GFX12-NEXT:    s_wait_expcnt 0x0
198; GFX12-NEXT:    s_wait_samplecnt 0x0
199; GFX12-NEXT:    s_wait_bvhcnt 0x0
200; GFX12-NEXT:    s_wait_kmcnt 0x0
201; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
202; GFX12-NEXT:    s_wait_loadcnt 0x0
203; GFX12-NEXT:    s_setpc_b64 s[30:31]
204  %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1 ; address of the second i16, 2 bytes past %p
205  %p.0 = load i16, ptr addrspace(1) %p, align 1
206  %p.1 = load i16, ptr addrspace(1) %gep.p, align 1
207  %zext.0 = zext i16 %p.0 to i32
208  %zext.1 = zext i16 %p.1 to i32
209  %shl.1 = shl i32 %zext.1, 16 ; second i16 goes into the high half
210  %or = or i32 %zext.0, %shl.1 ; combine both halves into one i32
211  ret i32 %or
212}
213
214; Should produce a single align 1 dword store when unaligned-access-mode makes it legal; without it the two byte-aligned i16 stores are split into four byte stores. Only %r is used, %p is an unused kernel argument.
215define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
216; GFX7-ALIGNED-LABEL: global_store_2xi16_align1:
217; GFX7-ALIGNED:       ; %bb.0:
218; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x2
219; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 1
220; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 0
221; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
222; GFX7-ALIGNED-NEXT:    s_add_u32 s2, s0, 2
223; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
224; GFX7-ALIGNED-NEXT:    s_addc_u32 s3, s1, 0
225; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s1
226; GFX7-ALIGNED-NEXT:    s_add_u32 s4, s0, 1
227; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v2
228; GFX7-ALIGNED-NEXT:    s_addc_u32 s5, s1, 0
229; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s4
230; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s5
231; GFX7-ALIGNED-NEXT:    s_add_u32 s0, s0, 3
232; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v3
233; GFX7-ALIGNED-NEXT:    s_addc_u32 s1, s1, 0
234; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
235; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s1
236; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v3
237; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s2
238; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 2
239; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s3
240; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v2
241; GFX7-ALIGNED-NEXT:    s_endpgm
242;
243; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1:
244; GFX7-UNALIGNED:       ; %bb.0:
245; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x2
246; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
247; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
248; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
249; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
250; GFX7-UNALIGNED-NEXT:    flat_store_dword v[0:1], v2
251; GFX7-UNALIGNED-NEXT:    s_endpgm
252;
253; GFX9-LABEL: global_store_2xi16_align1:
254; GFX9:       ; %bb.0:
255; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
256; GFX9-NEXT:    v_mov_b32_e32 v0, 0
257; GFX9-NEXT:    v_mov_b32_e32 v1, 0x20001
258; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
259; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
260; GFX9-NEXT:    s_endpgm
261;
262; GFX10-LABEL: global_store_2xi16_align1:
263; GFX10:       ; %bb.0:
264; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
265; GFX10-NEXT:    v_mov_b32_e32 v0, 0
266; GFX10-NEXT:    v_mov_b32_e32 v1, 0x20001
267; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
268; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
269; GFX10-NEXT:    s_endpgm
270;
271; GFX11-LABEL: global_store_2xi16_align1:
272; GFX11:       ; %bb.0:
273; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
274; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
275; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
276; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
277; GFX11-NEXT:    s_endpgm
278;
279; GFX12-LABEL: global_store_2xi16_align1:
280; GFX12:       ; %bb.0:
281; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
282; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
283; GFX12-NEXT:    s_wait_kmcnt 0x0
284; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
285; GFX12-NEXT:    s_endpgm
286  %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1 ; address of the second i16, 2 bytes past %r
287  store i16 1, ptr addrspace(1) %r, align 1 ; low half of the merged 0x20001 constant
288  store i16 2, ptr addrspace(1) %gep.r, align 1 ; high half of the merged 0x20001 constant
289  ret void
290}
291
292; Should merge this to a dword load in every tested configuration, with or without unaligned-access-mode: the first i16 load is 4-byte aligned, so the combined dword load is aligned too.
293define i32 @global_load_2xi16_align4(ptr addrspace(1) %p) #0 {
294; GFX7-ALIGNED-LABEL: global_load_2xi16_align4:
295; GFX7-ALIGNED:       ; %bb.0:
296; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
297; GFX7-ALIGNED-NEXT:    flat_load_dword v0, v[0:1]
298; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
299; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
300;
301; GFX7-UNALIGNED-LABEL: global_load_2xi16_align4:
302; GFX7-UNALIGNED:       ; %bb.0:
303; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304; GFX7-UNALIGNED-NEXT:    flat_load_dword v0, v[0:1]
305; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
306; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
307;
308; GFX9-LABEL: global_load_2xi16_align4:
309; GFX9:       ; %bb.0:
310; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
311; GFX9-NEXT:    global_load_dword v0, v[0:1], off
312; GFX9-NEXT:    s_waitcnt vmcnt(0)
313; GFX9-NEXT:    s_setpc_b64 s[30:31]
314;
315; GFX10-LABEL: global_load_2xi16_align4:
316; GFX10:       ; %bb.0:
317; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
318; GFX10-NEXT:    global_load_dword v0, v[0:1], off
319; GFX10-NEXT:    s_waitcnt vmcnt(0)
320; GFX10-NEXT:    s_setpc_b64 s[30:31]
321;
322; GFX11-LABEL: global_load_2xi16_align4:
323; GFX11:       ; %bb.0:
324; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
326; GFX11-NEXT:    s_waitcnt vmcnt(0)
327; GFX11-NEXT:    s_setpc_b64 s[30:31]
328;
329; GFX12-LABEL: global_load_2xi16_align4:
330; GFX12:       ; %bb.0:
331; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
332; GFX12-NEXT:    s_wait_expcnt 0x0
333; GFX12-NEXT:    s_wait_samplecnt 0x0
334; GFX12-NEXT:    s_wait_bvhcnt 0x0
335; GFX12-NEXT:    s_wait_kmcnt 0x0
336; GFX12-NEXT:    global_load_b32 v0, v[0:1], off
337; GFX12-NEXT:    s_wait_loadcnt 0x0
338; GFX12-NEXT:    s_setpc_b64 s[30:31]
339  %gep.p = getelementptr i16, ptr addrspace(1) %p, i64 1 ; address of the second i16, 2 bytes past %p
340  %p.0 = load i16, ptr addrspace(1) %p, align 4 ; only the first load carries the 4-byte alignment
341  %p.1 = load i16, ptr addrspace(1) %gep.p, align 2
342  %zext.0 = zext i16 %p.0 to i32
343  %zext.1 = zext i16 %p.1 to i32
344  %shl.1 = shl i32 %zext.1, 16 ; second i16 goes into the high half
345  %or = or i32 %zext.0, %shl.1 ; combine both halves into one i32
346  ret i32 %or
347}
348
349; Should merge this to a dword store in every tested configuration, with or without unaligned-access-mode: the first i16 store is 4-byte aligned, so the combined dword store is aligned too. Only %r is used, %p is an unused kernel argument.
350define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr addrspace(1) %r) #0 {
351; GFX7-ALIGNED-LABEL: global_store_2xi16_align4:
352; GFX7-ALIGNED:       ; %bb.0:
353; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x2
354; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
355; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
356; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
357; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s1
358; GFX7-ALIGNED-NEXT:    flat_store_dword v[0:1], v2
359; GFX7-ALIGNED-NEXT:    s_endpgm
360;
361; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4:
362; GFX7-UNALIGNED:       ; %bb.0:
363; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x2
364; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
365; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
366; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
367; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
368; GFX7-UNALIGNED-NEXT:    flat_store_dword v[0:1], v2
369; GFX7-UNALIGNED-NEXT:    s_endpgm
370;
371; GFX9-LABEL: global_store_2xi16_align4:
372; GFX9:       ; %bb.0:
373; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
374; GFX9-NEXT:    v_mov_b32_e32 v0, 0
375; GFX9-NEXT:    v_mov_b32_e32 v1, 0x20001
376; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
377; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
378; GFX9-NEXT:    s_endpgm
379;
380; GFX10-LABEL: global_store_2xi16_align4:
381; GFX10:       ; %bb.0:
382; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
383; GFX10-NEXT:    v_mov_b32_e32 v0, 0
384; GFX10-NEXT:    v_mov_b32_e32 v1, 0x20001
385; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
386; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
387; GFX10-NEXT:    s_endpgm
388;
389; GFX11-LABEL: global_store_2xi16_align4:
390; GFX11:       ; %bb.0:
391; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
392; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
393; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
394; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
395; GFX11-NEXT:    s_endpgm
396;
397; GFX12-LABEL: global_store_2xi16_align4:
398; GFX12:       ; %bb.0:
399; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
400; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001
401; GFX12-NEXT:    s_wait_kmcnt 0x0
402; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
403; GFX12-NEXT:    s_endpgm
404  %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1 ; address of the second i16, 2 bytes past %r
405  store i16 1, ptr addrspace(1) %r, align 4 ; low half of the merged 0x20001 constant; only this store carries the 4-byte alignment
406  store i16 2, ptr addrspace(1) %gep.r, align 2 ; high half of the merged 0x20001 constant
407  ret void
408}
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442