xref: /llvm-project/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
8; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
9; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
10; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
11; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
12; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
13; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
14; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
15
; Test kernel: loads an i32 from %in (addrspace(3)) with !nontemporal metadata
; attached, then stores the loaded value to %out (addrspace(1)).
; The autogenerated check lines between the opening "(" and the parameter list
; pin the llc -O0 output for each check prefix used by the run lines at the top
; of the file. In every configuration checked here the DS read instruction
; (ds_read_b32 / ds_load_b32) appears without any cache-policy modifier.
; NOTE(review): the gfx940 checks show "sc0 sc1" on the global store; the
; !nontemporal metadata in this kernel is on the load, not on that store, so
; those bits presumably come from the default store policy -- confirm.
16define amdgpu_kernel void @local_nontemporal_load_0(
17; GFX6-LABEL: local_nontemporal_load_0:
18; GFX6:       ; %bb.0: ; %entry
19; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
20; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
21; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr8
22; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
23; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
24; GFX6-NEXT:    s_mov_b32 s11, s5
25; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
26; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
27; GFX6-NEXT:    s_mov_b32 s10, -1
28; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
29; GFX6-NEXT:    s_mov_b32 s5, s11
30; GFX6-NEXT:    s_mov_b32 s6, s10
31; GFX6-NEXT:    s_mov_b32 s7, s9
32; GFX6-NEXT:    s_mov_b32 m0, -1
33; GFX6-NEXT:    v_mov_b32_e32 v0, s8
34; GFX6-NEXT:    ds_read_b32 v0, v0
35; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
36; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
37; GFX6-NEXT:    s_endpgm
38;
39; GFX7-LABEL: local_nontemporal_load_0:
40; GFX7:       ; %bb.0: ; %entry
41; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
42; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
43; GFX7-NEXT:    s_mov_b32 m0, -1
44; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
45; GFX7-NEXT:    v_mov_b32_e32 v0, s6
46; GFX7-NEXT:    ds_read_b32 v2, v0
47; GFX7-NEXT:    v_mov_b32_e32 v0, s4
48; GFX7-NEXT:    v_mov_b32_e32 v1, s5
49; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
50; GFX7-NEXT:    flat_store_dword v[0:1], v2
51; GFX7-NEXT:    s_endpgm
52;
53; GFX10-WGP-LABEL: local_nontemporal_load_0:
54; GFX10-WGP:       ; %bb.0: ; %entry
55; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
56; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
57; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
58; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
59; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
60; GFX10-WGP-NEXT:    ds_read_b32 v1, v1
61; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
62; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
63; GFX10-WGP-NEXT:    s_endpgm
64;
65; GFX10-CU-LABEL: local_nontemporal_load_0:
66; GFX10-CU:       ; %bb.0: ; %entry
67; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
68; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
69; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
70; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
71; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
72; GFX10-CU-NEXT:    ds_read_b32 v1, v1
73; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
74; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
75; GFX10-CU-NEXT:    s_endpgm
76;
77; SKIP-CACHE-INV-LABEL: local_nontemporal_load_0:
78; SKIP-CACHE-INV:       ; %bb.0: ; %entry
79; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
80; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
81; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
82; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
83; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
84; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
85; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
86; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
87; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
88; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
89; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
90; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
91; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
92; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
93; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
94; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
95; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
96; SKIP-CACHE-INV-NEXT:    s_endpgm
97;
98; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_0:
99; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
100; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
101; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
102; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
103; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
104; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
105; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v1
106; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
107; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
108; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
109;
110; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_0:
111; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
112; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
113; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
114; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
115; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
116; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
117; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v1
118; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
119; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
120; GFX90A-TGSPLIT-NEXT:    s_endpgm
121;
122; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_0:
123; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
124; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
125; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
126; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
127; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
129; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v1
130; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
131; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
132; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
133;
134; GFX940-TGSPLIT-LABEL: local_nontemporal_load_0:
135; GFX940-TGSPLIT:       ; %bb.0: ; %entry
136; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
137; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
138; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
139; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
140; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
141; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v1
142; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
143; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
144; GFX940-TGSPLIT-NEXT:    s_endpgm
145;
146; GFX11-WGP-LABEL: local_nontemporal_load_0:
147; GFX11-WGP:       ; %bb.0: ; %entry
148; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
149; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
150; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
151; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
152; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
153; GFX11-WGP-NEXT:    ds_load_b32 v1, v1
154; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
155; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
156; GFX11-WGP-NEXT:    s_endpgm
157;
158; GFX11-CU-LABEL: local_nontemporal_load_0:
159; GFX11-CU:       ; %bb.0: ; %entry
160; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
161; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
162; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
163; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
164; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
165; GFX11-CU-NEXT:    ds_load_b32 v1, v1
166; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
167; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
168; GFX11-CU-NEXT:    s_endpgm
169;
170; GFX12-WGP-LABEL: local_nontemporal_load_0:
171; GFX12-WGP:       ; %bb.0: ; %entry
172; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
173; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
174; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
175; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
176; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
177; GFX12-WGP-NEXT:    ds_load_b32 v1, v1
178; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
179; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
180; GFX12-WGP-NEXT:    s_endpgm
181;
182; GFX12-CU-LABEL: local_nontemporal_load_0:
183; GFX12-CU:       ; %bb.0: ; %entry
184; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
185; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
186; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
187; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
188; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
189; GFX12-CU-NEXT:    ds_load_b32 v1, v1
190; GFX12-CU-NEXT:    s_wait_dscnt 0x0
191; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
192; GFX12-CU-NEXT:    s_endpgm
193    ptr addrspace(3) %in, ptr addrspace(1) %out) {
194entry:
  ; Access under test: i32 load from addrspace(3) carrying !nontemporal
  ; (metadata node !0 is defined outside this block).
195  %val = load i32, ptr addrspace(3) %in, align 4, !nontemporal !0
  ; Ordinary store (no !nontemporal) of the loaded value to addrspace(1).
196  store i32 %val, ptr addrspace(1) %out
197  ret void
198}
199
; Test kernel: like the scalar-address load test, but the addrspace(3) address
; is %in indexed by the work-item id x (getelementptr of i32 elements), so the
; !nontemporal load uses a per-lane address. The loaded i32 is then stored to
; %out (addrspace(1)).
; The autogenerated check lines pin the llc -O0 output per check prefix. They
; show the index being scaled by a shift of 2 and added to the base
; (v_lshlrev_b32 + v_add_i32, or a single v_lshl_add_u32) before the DS read,
; and the DS read itself carries no cache-policy modifier.
; NOTE(review): the gfx90a, gfx940, gfx11 and gfx12 checks first AND v0 with
; 0x3ff; presumably this extracts the x component from a packed work-item id
; register -- confirm against the backend documentation.
200define amdgpu_kernel void @local_nontemporal_load_1(
201; GFX6-LABEL: local_nontemporal_load_1:
202; GFX6:       ; %bb.0: ; %entry
203; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
204; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
205; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr8
206; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
207; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
208; GFX6-NEXT:    s_mov_b32 s11, s5
209; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
210; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
211; GFX6-NEXT:    s_mov_b32 s10, -1
212; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
213; GFX6-NEXT:    s_mov_b32 s5, s11
214; GFX6-NEXT:    s_mov_b32 s6, s10
215; GFX6-NEXT:    s_mov_b32 s7, s9
216; GFX6-NEXT:    s_mov_b32 s9, 2
217; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s9, v0
218; GFX6-NEXT:    v_add_i32_e64 v0, s[8:9], s8, v0
219; GFX6-NEXT:    s_mov_b32 m0, -1
220; GFX6-NEXT:    ds_read_b32 v0, v0
221; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
222; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
223; GFX6-NEXT:    s_endpgm
224;
225; GFX7-LABEL: local_nontemporal_load_1:
226; GFX7:       ; %bb.0: ; %entry
227; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
228; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
229; GFX7-NEXT:    s_mov_b32 s7, 2
230; GFX7-NEXT:    v_lshlrev_b32_e64 v0, s7, v0
231; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
232; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s6, v0
233; GFX7-NEXT:    s_mov_b32 m0, -1
234; GFX7-NEXT:    ds_read_b32 v2, v0
235; GFX7-NEXT:    v_mov_b32_e32 v0, s4
236; GFX7-NEXT:    v_mov_b32_e32 v1, s5
237; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
238; GFX7-NEXT:    flat_store_dword v[0:1], v2
239; GFX7-NEXT:    s_endpgm
240;
241; GFX10-WGP-LABEL: local_nontemporal_load_1:
242; GFX10-WGP:       ; %bb.0: ; %entry
243; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
244; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x0
245; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
246; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
247; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
248; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
249; GFX10-WGP-NEXT:    v_lshl_add_u32 v1, v1, s6, s7
250; GFX10-WGP-NEXT:    ds_read_b32 v1, v1
251; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
252; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
253; GFX10-WGP-NEXT:    s_endpgm
254;
255; GFX10-CU-LABEL: local_nontemporal_load_1:
256; GFX10-CU:       ; %bb.0: ; %entry
257; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
258; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x0
259; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
260; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
261; GFX10-CU-NEXT:    s_mov_b32 s6, 2
262; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
263; GFX10-CU-NEXT:    v_lshl_add_u32 v1, v1, s6, s7
264; GFX10-CU-NEXT:    ds_read_b32 v1, v1
265; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
266; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
267; GFX10-CU-NEXT:    s_endpgm
268;
269; SKIP-CACHE-INV-LABEL: local_nontemporal_load_1:
270; SKIP-CACHE-INV:       ; %bb.0: ; %entry
271; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
272; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
273; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
274; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
275; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
276; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
277; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
278; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
279; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
280; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
281; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
282; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
283; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 2
284; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s5, v0
285; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v0, s[4:5], s4, v0
286; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
287; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
288; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
289; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
290; SKIP-CACHE-INV-NEXT:    s_endpgm
291;
292; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_1:
293; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
294; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
295; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x0
296; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
297; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
298; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
299; GFX90A-NOTTGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s6
300; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 2
301; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
302; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
303; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s6, v2
304; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v1
305; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
306; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
307; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
308;
309; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_1:
310; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
311; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
312; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x0
313; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
314; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
315; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
316; GFX90A-TGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s6
317; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 2
318; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
319; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
320; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s6, v2
321; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v1
322; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
324; GFX90A-TGSPLIT-NEXT:    s_endpgm
325;
326; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_1:
327; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
328; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
329; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x0
330; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
331; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
332; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s2, 0x3ff
333; GFX940-NOTTGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s2
334; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s2, 2
335; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
336; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
337; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s2, v2
338; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v1
339; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
340; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
341; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
342;
343; GFX940-TGSPLIT-LABEL: local_nontemporal_load_1:
344; GFX940-TGSPLIT:       ; %bb.0: ; %entry
345; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
346; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x0
347; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
348; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
349; GFX940-TGSPLIT-NEXT:    s_mov_b32 s2, 0x3ff
350; GFX940-TGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s2
351; GFX940-TGSPLIT-NEXT:    s_mov_b32 s2, 2
352; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
353; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
354; GFX940-TGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s2, v2
355; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v1
356; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
357; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
358; GFX940-TGSPLIT-NEXT:    s_endpgm
359;
360; GFX11-WGP-LABEL: local_nontemporal_load_1:
361; GFX11-WGP:       ; %bb.0: ; %entry
362; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v0
363; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x0
364; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
365; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
366; GFX11-WGP-NEXT:    s_mov_b32 s2, 0x3ff
367; GFX11-WGP-NEXT:    v_and_b32_e64 v1, v1, s2
368; GFX11-WGP-NEXT:    s_mov_b32 s2, 2
369; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
370; GFX11-WGP-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
371; GFX11-WGP-NEXT:    ds_load_b32 v1, v1
372; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
373; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
374; GFX11-WGP-NEXT:    s_endpgm
375;
376; GFX11-CU-LABEL: local_nontemporal_load_1:
377; GFX11-CU:       ; %bb.0: ; %entry
378; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v0
379; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x0
380; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
381; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
382; GFX11-CU-NEXT:    s_mov_b32 s2, 0x3ff
383; GFX11-CU-NEXT:    v_and_b32_e64 v1, v1, s2
384; GFX11-CU-NEXT:    s_mov_b32 s2, 2
385; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
386; GFX11-CU-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
387; GFX11-CU-NEXT:    ds_load_b32 v1, v1
388; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
389; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
390; GFX11-CU-NEXT:    s_endpgm
391;
392; GFX12-WGP-LABEL: local_nontemporal_load_1:
393; GFX12-WGP:       ; %bb.0: ; %entry
394; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v0
395; GFX12-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x0
396; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
397; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
398; GFX12-WGP-NEXT:    s_mov_b32 s2, 0x3ff
399; GFX12-WGP-NEXT:    s_wait_alu 0xfffe
400; GFX12-WGP-NEXT:    v_and_b32_e64 v1, v1, s2
401; GFX12-WGP-NEXT:    s_mov_b32 s2, 2
402; GFX12-WGP-NEXT:    s_wait_alu 0xfffe
403; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
404; GFX12-WGP-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
405; GFX12-WGP-NEXT:    ds_load_b32 v1, v1
406; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
407; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
408; GFX12-WGP-NEXT:    s_endpgm
409;
410; GFX12-CU-LABEL: local_nontemporal_load_1:
411; GFX12-CU:       ; %bb.0: ; %entry
412; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v0
413; GFX12-CU-NEXT:    s_load_b32 s3, s[4:5], 0x0
414; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
415; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
416; GFX12-CU-NEXT:    s_mov_b32 s2, 0x3ff
417; GFX12-CU-NEXT:    s_wait_alu 0xfffe
418; GFX12-CU-NEXT:    v_and_b32_e64 v1, v1, s2
419; GFX12-CU-NEXT:    s_mov_b32 s2, 2
420; GFX12-CU-NEXT:    s_wait_alu 0xfffe
421; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
422; GFX12-CU-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
423; GFX12-CU-NEXT:    ds_load_b32 v1, v1
424; GFX12-CU-NEXT:    s_wait_dscnt 0x0
425; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
426; GFX12-CU-NEXT:    s_endpgm
427    ptr addrspace(3) %in, ptr addrspace(1) %out) {
428entry:
  ; Per-lane index: the x component of the work-item id.
429  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; Address of the %tid-th i32 element starting at %in (addrspace(3)).
430  %val.gep = getelementptr inbounds i32, ptr addrspace(3) %in, i32 %tid
  ; Access under test: !nontemporal i32 load from the computed address
  ; (metadata node !0 is defined outside this block).
431  %val = load i32, ptr addrspace(3) %val.gep, align 4, !nontemporal !0
  ; Ordinary store (no !nontemporal) of the loaded value to addrspace(1).
432  store i32 %val, ptr addrspace(1) %out
433  ret void
434}
435
; Test kernel: loads an i32 from %in (addrspace(1)) with an ordinary load, then
; stores it to %out (addrspace(3)) with !nontemporal metadata attached to the
; store. Note the parameter order is (%in, %out) with %in in addrspace(1) here,
; the reverse of the load tests.
; The autogenerated check lines pin the llc -O0 output per check prefix. In
; every configuration checked here the value is fetched with a scalar load
; (s_load_dword / s_load_b32) and written with ds_write_b32 / ds_store_b32,
; and the DS write carries no cache-policy modifier.
436define amdgpu_kernel void @local_nontemporal_store_0(
437; GFX6-LABEL: local_nontemporal_store_0:
438; GFX6:       ; %bb.0: ; %entry
439; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
440; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
441; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
442; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
443; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
444; GFX6-NEXT:    s_mov_b32 m0, -1
445; GFX6-NEXT:    v_mov_b32_e32 v0, s5
446; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
447; GFX6-NEXT:    v_mov_b32_e32 v1, s4
448; GFX6-NEXT:    ds_write_b32 v0, v1
449; GFX6-NEXT:    s_endpgm
450;
451; GFX7-LABEL: local_nontemporal_store_0:
452; GFX7:       ; %bb.0: ; %entry
453; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
454; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
455; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
456; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
457; GFX7-NEXT:    s_mov_b32 m0, -1
458; GFX7-NEXT:    v_mov_b32_e32 v0, s5
459; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
460; GFX7-NEXT:    v_mov_b32_e32 v1, s4
461; GFX7-NEXT:    ds_write_b32 v0, v1
462; GFX7-NEXT:    s_endpgm
463;
464; GFX10-WGP-LABEL: local_nontemporal_store_0:
465; GFX10-WGP:       ; %bb.0: ; %entry
466; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
467; GFX10-WGP-NEXT:    s_load_dword s5, s[8:9], 0x8
468; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
469; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
470; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
471; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
472; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
473; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
474; GFX10-WGP-NEXT:    s_endpgm
475;
476; GFX10-CU-LABEL: local_nontemporal_store_0:
477; GFX10-CU:       ; %bb.0: ; %entry
478; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
479; GFX10-CU-NEXT:    s_load_dword s5, s[8:9], 0x8
480; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
481; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
482; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
483; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
484; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
485; GFX10-CU-NEXT:    ds_write_b32 v0, v1
486; GFX10-CU-NEXT:    s_endpgm
487;
488; SKIP-CACHE-INV-LABEL: local_nontemporal_store_0:
489; SKIP-CACHE-INV:       ; %bb.0: ; %entry
490; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
491; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
492; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
493; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
494; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
495; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
496; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
497; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
498; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
499; SKIP-CACHE-INV-NEXT:    s_endpgm
500;
501; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_0:
502; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
503; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
504; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
505; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
506; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
507; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
508; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
509; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
510; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
511; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
512;
513; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_0:
514; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
515; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
516; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[8:9], 0x8
517; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
518; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
519; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
520; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
521; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
522; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
523; GFX90A-TGSPLIT-NEXT:    s_endpgm
524;
525; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_0:
526; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
527; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
528; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
529; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
530; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
531; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
532; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
533; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
534; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
535; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
536;
537; GFX940-TGSPLIT-LABEL: local_nontemporal_store_0:
538; GFX940-TGSPLIT:       ; %bb.0: ; %entry
539; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
540; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[4:5], 0x8
541; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
542; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[2:3], 0x0
543; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
544; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
545; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
546; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
547; GFX940-TGSPLIT-NEXT:    s_endpgm
548;
549; GFX11-WGP-LABEL: local_nontemporal_store_0:
550; GFX11-WGP:       ; %bb.0: ; %entry
551; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
552; GFX11-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
553; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
554; GFX11-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
555; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
556; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
557; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
558; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
559; GFX11-WGP-NEXT:    s_endpgm
560;
561; GFX11-CU-LABEL: local_nontemporal_store_0:
562; GFX11-CU:       ; %bb.0: ; %entry
563; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
564; GFX11-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
565; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
566; GFX11-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
567; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
568; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
569; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
570; GFX11-CU-NEXT:    ds_store_b32 v0, v1
571; GFX11-CU-NEXT:    s_endpgm
572;
573; GFX12-WGP-LABEL: local_nontemporal_store_0:
574; GFX12-WGP:       ; %bb.0: ; %entry
575; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
576; GFX12-WGP-NEXT:    s_load_b32 s1, s[4:5], 0x8
577; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
578; GFX12-WGP-NEXT:    s_load_b32 s0, s[2:3], 0x0
579; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
580; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
581; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
582; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
583; GFX12-WGP-NEXT:    s_endpgm
584;
585; GFX12-CU-LABEL: local_nontemporal_store_0:
586; GFX12-CU:       ; %bb.0: ; %entry
587; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
588; GFX12-CU-NEXT:    s_load_b32 s1, s[4:5], 0x8
589; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
590; GFX12-CU-NEXT:    s_load_b32 s0, s[2:3], 0x0
591; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
592; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
593; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
594; GFX12-CU-NEXT:    ds_store_b32 v0, v1
595; GFX12-CU-NEXT:    s_endpgm
596    ptr addrspace(1) %in, ptr addrspace(3) %out) {
597entry:
  ; Ordinary load (no !nontemporal) of the value to be stored, from addrspace(1).
598  %val = load i32, ptr addrspace(1) %in, align 4
  ; Access under test: i32 store to addrspace(3) carrying !nontemporal
  ; (metadata node !0 is defined outside this block).
599  store i32 %val, ptr addrspace(3) %out, !nontemporal !0
600  ret void
601}
602
603define amdgpu_kernel void @local_nontemporal_store_1(
604; GFX6-LABEL: local_nontemporal_store_1:
605; GFX6:       ; %bb.0: ; %entry
606; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
607; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr5
608; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
609; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
610; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
611; GFX6-NEXT:    s_mov_b32 s6, 2
612; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s6, v0
613; GFX6-NEXT:    v_add_i32_e64 v0, s[6:7], s5, v0
614; GFX6-NEXT:    s_mov_b32 m0, -1
615; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
616; GFX6-NEXT:    v_mov_b32_e32 v1, s4
617; GFX6-NEXT:    ds_write_b32 v0, v1
618; GFX6-NEXT:    s_endpgm
619;
620; GFX7-LABEL: local_nontemporal_store_1:
621; GFX7:       ; %bb.0: ; %entry
622; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
623; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
624; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
625; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
626; GFX7-NEXT:    s_mov_b32 s6, 2
627; GFX7-NEXT:    v_lshlrev_b32_e64 v0, s6, v0
628; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s5, v0
629; GFX7-NEXT:    s_mov_b32 m0, -1
630; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
631; GFX7-NEXT:    v_mov_b32_e32 v1, s4
632; GFX7-NEXT:    ds_write_b32 v0, v1
633; GFX7-NEXT:    s_endpgm
634;
635; GFX10-WGP-LABEL: local_nontemporal_store_1:
636; GFX10-WGP:       ; %bb.0: ; %entry
637; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
638; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
639; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
640; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x0
641; GFX10-WGP-NEXT:    s_mov_b32 s5, 2
642; GFX10-WGP-NEXT:    v_lshl_add_u32 v0, v0, s5, s6
643; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
644; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
645; GFX10-WGP-NEXT:    ds_write_b32 v0, v1
646; GFX10-WGP-NEXT:    s_endpgm
647;
648; GFX10-CU-LABEL: local_nontemporal_store_1:
649; GFX10-CU:       ; %bb.0: ; %entry
650; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
651; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
652; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
653; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x0
654; GFX10-CU-NEXT:    s_mov_b32 s5, 2
655; GFX10-CU-NEXT:    v_lshl_add_u32 v0, v0, s5, s6
656; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
657; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
658; GFX10-CU-NEXT:    ds_write_b32 v0, v1
659; GFX10-CU-NEXT:    s_endpgm
660;
661; SKIP-CACHE-INV-LABEL: local_nontemporal_store_1:
662; SKIP-CACHE-INV:       ; %bb.0: ; %entry
663; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
664; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
665; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
666; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
667; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, 2
668; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s2, v0
669; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v0, s[2:3], s1, v0
670; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
671; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
672; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
673; SKIP-CACHE-INV-NEXT:    ds_write_b32 v0, v1
674; SKIP-CACHE-INV-NEXT:    s_endpgm
675;
676; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_1:
677; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
678; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
679; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
680; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
681; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x0
682; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s5, 0x3ff
683; GFX90A-NOTTGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s5
684; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s5, 2
685; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
686; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, s5, v1
687; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
688; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
689; GFX90A-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
690; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
691;
692; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_1:
693; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
694; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
695; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
696; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
697; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x0
698; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s5, 0x3ff
699; GFX90A-TGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s5
700; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s5, 2
701; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
702; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, s5, v1
703; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
704; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
705; GFX90A-TGSPLIT-NEXT:    ds_write_b32 v0, v1
706; GFX90A-TGSPLIT-NEXT:    s_endpgm
707;
708; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_1:
709; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
710; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
711; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
712; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
713; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
714; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 0x3ff
715; GFX940-NOTTGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s1
716; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 2
717; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
718; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, s1, v1
719; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
720; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
721; GFX940-NOTTGSPLIT-NEXT:    ds_write_b32 v0, v1
722; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
723;
724; GFX940-TGSPLIT-LABEL: local_nontemporal_store_1:
725; GFX940-TGSPLIT:       ; %bb.0: ; %entry
726; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
727; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
728; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
729; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
730; GFX940-TGSPLIT-NEXT:    s_mov_b32 s1, 0x3ff
731; GFX940-TGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s1
732; GFX940-TGSPLIT-NEXT:    s_mov_b32 s1, 2
733; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
734; GFX940-TGSPLIT-NEXT:    v_lshl_add_u32 v0, v0, s1, v1
735; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
736; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s0
737; GFX940-TGSPLIT-NEXT:    ds_write_b32 v0, v1
738; GFX940-TGSPLIT-NEXT:    s_endpgm
739;
740; GFX11-WGP-LABEL: local_nontemporal_store_1:
741; GFX11-WGP:       ; %bb.0: ; %entry
742; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
743; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
744; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
745; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
746; GFX11-WGP-NEXT:    s_mov_b32 s1, 0x3ff
747; GFX11-WGP-NEXT:    v_and_b32_e64 v0, v0, s1
748; GFX11-WGP-NEXT:    s_mov_b32 s1, 2
749; GFX11-WGP-NEXT:    v_lshl_add_u32 v0, v0, s1, s2
750; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
751; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s0
752; GFX11-WGP-NEXT:    ds_store_b32 v0, v1
753; GFX11-WGP-NEXT:    s_endpgm
754;
755; GFX11-CU-LABEL: local_nontemporal_store_1:
756; GFX11-CU:       ; %bb.0: ; %entry
757; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
758; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
759; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
760; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
761; GFX11-CU-NEXT:    s_mov_b32 s1, 0x3ff
762; GFX11-CU-NEXT:    v_and_b32_e64 v0, v0, s1
763; GFX11-CU-NEXT:    s_mov_b32 s1, 2
764; GFX11-CU-NEXT:    v_lshl_add_u32 v0, v0, s1, s2
765; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
766; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s0
767; GFX11-CU-NEXT:    ds_store_b32 v0, v1
768; GFX11-CU-NEXT:    s_endpgm
769;
770; GFX12-WGP-LABEL: local_nontemporal_store_1:
771; GFX12-WGP:       ; %bb.0: ; %entry
772; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
773; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
774; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
775; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
776; GFX12-WGP-NEXT:    s_mov_b32 s1, 0x3ff
777; GFX12-WGP-NEXT:    s_wait_alu 0xfffe
778; GFX12-WGP-NEXT:    v_and_b32_e64 v0, v0, s1
779; GFX12-WGP-NEXT:    s_mov_b32 s1, 2
780; GFX12-WGP-NEXT:    s_wait_alu 0xfffe
781; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, s1, s2
782; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
783; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
784; GFX12-WGP-NEXT:    ds_store_b32 v0, v1
785; GFX12-WGP-NEXT:    s_endpgm
786;
787; GFX12-CU-LABEL: local_nontemporal_store_1:
788; GFX12-CU:       ; %bb.0: ; %entry
789; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
790; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
791; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
792; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
793; GFX12-CU-NEXT:    s_mov_b32 s1, 0x3ff
794; GFX12-CU-NEXT:    s_wait_alu 0xfffe
795; GFX12-CU-NEXT:    v_and_b32_e64 v0, v0, s1
796; GFX12-CU-NEXT:    s_mov_b32 s1, 2
797; GFX12-CU-NEXT:    s_wait_alu 0xfffe
798; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, s1, s2
799; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
800; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
801; GFX12-CU-NEXT:    ds_store_b32 v0, v1
802; GFX12-CU-NEXT:    s_endpgm
803    ptr addrspace(1) %in, ptr addrspace(3) %out) {
804entry:
805  %tid = call i32 @llvm.amdgcn.workitem.id.x()
806  %val = load i32, ptr addrspace(1) %in, align 4
807  %out.gep = getelementptr inbounds i32, ptr addrspace(3) %out, i32 %tid
808  store i32 %val, ptr addrspace(3) %out.gep, !nontemporal !0
809  ret void
810}
811
; Kernel that performs a volatile i32 load from %in (addrspace(3)) carrying
; !nontemporal !0, then stores the loaded value to %out (addrspace(1)).
; The check lines between the `define` line and the parameter list are
; autogenerated (see the header of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
812define amdgpu_kernel void @local_nontemporal_volatile_load(
813; GFX6-LABEL: local_nontemporal_volatile_load:
814; GFX6:       ; %bb.0: ; %entry
815; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
816; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
817; GFX6-NEXT:    ; kill: def $sgpr6 killed $sgpr8
818; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
819; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
820; GFX6-NEXT:    s_mov_b32 s11, s5
821; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
822; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
823; GFX6-NEXT:    s_mov_b32 s10, -1
824; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
825; GFX6-NEXT:    s_mov_b32 s5, s11
826; GFX6-NEXT:    s_mov_b32 s6, s10
827; GFX6-NEXT:    s_mov_b32 s7, s9
828; GFX6-NEXT:    s_mov_b32 m0, -1
829; GFX6-NEXT:    v_mov_b32_e32 v0, s8
830; GFX6-NEXT:    ds_read_b32 v0, v0
831; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
832; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
833; GFX6-NEXT:    s_endpgm
834;
835; GFX7-LABEL: local_nontemporal_volatile_load:
836; GFX7:       ; %bb.0: ; %entry
837; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
838; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
839; GFX7-NEXT:    s_mov_b32 m0, -1
840; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
841; GFX7-NEXT:    v_mov_b32_e32 v0, s6
842; GFX7-NEXT:    ds_read_b32 v2, v0
843; GFX7-NEXT:    v_mov_b32_e32 v0, s4
844; GFX7-NEXT:    v_mov_b32_e32 v1, s5
845; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
846; GFX7-NEXT:    flat_store_dword v[0:1], v2
847; GFX7-NEXT:    s_endpgm
848;
849; GFX10-WGP-LABEL: local_nontemporal_volatile_load:
850; GFX10-WGP:       ; %bb.0: ; %entry
851; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
852; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
853; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
854; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
855; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
856; GFX10-WGP-NEXT:    ds_read_b32 v1, v1
857; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
858; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
859; GFX10-WGP-NEXT:    s_endpgm
860;
861; GFX10-CU-LABEL: local_nontemporal_volatile_load:
862; GFX10-CU:       ; %bb.0: ; %entry
863; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
864; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
865; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
866; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
867; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
868; GFX10-CU-NEXT:    ds_read_b32 v1, v1
869; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
870; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
871; GFX10-CU-NEXT:    s_endpgm
872;
873; SKIP-CACHE-INV-LABEL: local_nontemporal_volatile_load:
874; SKIP-CACHE-INV:       ; %bb.0: ; %entry
875; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
876; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
877; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
878; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
879; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
880; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
881; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
882; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
883; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
884; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
885; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
886; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
887; SKIP-CACHE-INV-NEXT:    s_mov_b32 m0, -1
888; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
889; SKIP-CACHE-INV-NEXT:    ds_read_b32 v0, v0
890; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
891; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
892; SKIP-CACHE-INV-NEXT:    s_endpgm
893;
894; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load:
895; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
896; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
897; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
898; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
899; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
900; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
901; GFX90A-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v1
902; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
903; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
904; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
905;
906; GFX90A-TGSPLIT-LABEL: local_nontemporal_volatile_load:
907; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
908; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
909; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
910; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
911; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
912; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
913; GFX90A-TGSPLIT-NEXT:    ds_read_b32 v1, v1
914; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
915; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
916; GFX90A-TGSPLIT-NEXT:    s_endpgm
917;
918; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load:
919; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
920; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
921; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
922; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
923; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
924; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
925; GFX940-NOTTGSPLIT-NEXT:    ds_read_b32 v1, v1
926; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
927; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
928; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
929;
930; GFX940-TGSPLIT-LABEL: local_nontemporal_volatile_load:
931; GFX940-TGSPLIT:       ; %bb.0: ; %entry
932; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
933; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
934; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
935; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
936; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
937; GFX940-TGSPLIT-NEXT:    ds_read_b32 v1, v1
938; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
939; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
940; GFX940-TGSPLIT-NEXT:    s_endpgm
941;
942; GFX11-WGP-LABEL: local_nontemporal_volatile_load:
943; GFX11-WGP:       ; %bb.0: ; %entry
944; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
945; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
946; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
947; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
948; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, s2
949; GFX11-WGP-NEXT:    ds_load_b32 v1, v1
950; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
951; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
952; GFX11-WGP-NEXT:    s_endpgm
953;
954; GFX11-CU-LABEL: local_nontemporal_volatile_load:
955; GFX11-CU:       ; %bb.0: ; %entry
956; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
957; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
958; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
959; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
960; GFX11-CU-NEXT:    v_mov_b32_e32 v1, s2
961; GFX11-CU-NEXT:    ds_load_b32 v1, v1
962; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
963; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
964; GFX11-CU-NEXT:    s_endpgm
965;
966; GFX12-WGP-LABEL: local_nontemporal_volatile_load:
967; GFX12-WGP:       ; %bb.0: ; %entry
968; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
969; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
970; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
971; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
972; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s2
973; GFX12-WGP-NEXT:    ds_load_b32 v1, v1
974; GFX12-WGP-NEXT:    s_wait_dscnt 0x0
975; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
976; GFX12-WGP-NEXT:    s_endpgm
977;
978; GFX12-CU-LABEL: local_nontemporal_volatile_load:
979; GFX12-CU:       ; %bb.0: ; %entry
980; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
981; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
982; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
983; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
984; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s2
985; GFX12-CU-NEXT:    ds_load_b32 v1, v1
986; GFX12-CU-NEXT:    s_wait_dscnt 0x0
987; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
988; GFX12-CU-NEXT:    s_endpgm
989    ptr addrspace(3) %in, ptr addrspace(1) %out) {
990entry:
  ; Load under test: marked both `volatile` and !nontemporal, reading from the
  ; addrspace(3) pointer %in.
991  %val = load volatile i32, ptr addrspace(3) %in, align 4, !nontemporal !0
  ; Plain store (neither volatile nor !nontemporal) of the loaded value to the
  ; addrspace(1) pointer %out.
992  store i32 %val, ptr addrspace(1) %out
993  ret void
994}
995
; Metadata node referenced as !0 by the !nontemporal attachments on the loads
; and stores in the kernels above.
996!0 = !{i32 1}
; Declaration of the intrinsic called to obtain %tid in the store test above.
997declare i32 @llvm.amdgcn.workitem.id.x()
998