xref: /llvm-project/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Test matrix: every line below compiles this file with llc at -O0 for one
; -mcpu / -mattr combination and verifies the output with its own FileCheck
; prefix. All lines use the amdhsa triple except the SKIP-CACHE-INV line, which
; uses amdpal together with -amdgcn-skip-cache-invalidations.
; The FileCheck option is spelled --check-prefixes on every line for consistency.
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90A-NOTTGSPLIT %s
8; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck --check-prefixes=GFX90A-TGSPLIT %s
9; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck --check-prefixes=GFX940-NOTTGSPLIT %s
10; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck --check-prefixes=GFX940-TGSPLIT %s
11; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
12; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
13; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
14; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
15
; private_nontemporal_load_0: kernel that reads one i32 through the
; addrspace(5) pointer %in, with !nontemporal metadata on the load, and writes
; the value through the addrspace(1) pointer %out with an ordinary store.
;
; The assertions embedded in the signature are autogenerated (see the note at
; the top of the file) and pin the complete -O0 output for each check prefix.
; The instruction of interest is the load from %in. Its expected cache-policy
; operands are:
;   - "glc slc" for the GFX6, GFX7, SKIP-CACHE-INV and GFX90A prefixes
;   - "slc" for the GFX10 prefixes
;   - "nt" for the GFX940 prefixes
;   - "slc dlc" for the GFX11 prefixes
;   - "th:TH_LOAD_NT" for the GFX12 prefixes
; Regenerate the assertions with utils/update_llc_test_checks.py instead of
; editing them by hand.
16define amdgpu_kernel void @private_nontemporal_load_0(
17; GFX6-LABEL: private_nontemporal_load_0:
18; GFX6:       ; %bb.0: ; %entry
19; GFX6-NEXT:    s_add_u32 s0, s0, s15
20; GFX6-NEXT:    s_addc_u32 s1, s1, 0
21; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
22; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
23; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
24; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
25; GFX6-NEXT:    s_mov_b32 s11, s5
26; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
27; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
28; GFX6-NEXT:    s_mov_b32 s10, -1
29; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
30; GFX6-NEXT:    s_mov_b32 s5, s11
31; GFX6-NEXT:    s_mov_b32 s6, s10
32; GFX6-NEXT:    s_mov_b32 s7, s9
33; GFX6-NEXT:    v_mov_b32_e32 v0, s8
34; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc slc
35; GFX6-NEXT:    s_waitcnt vmcnt(0)
36; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
37; GFX6-NEXT:    s_endpgm
38;
39; GFX7-LABEL: private_nontemporal_load_0:
40; GFX7:       ; %bb.0: ; %entry
41; GFX7-NEXT:    s_add_u32 s0, s0, s15
42; GFX7-NEXT:    s_addc_u32 s1, s1, 0
43; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
44; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
45; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX7-NEXT:    v_mov_b32_e32 v0, s6
47; GFX7-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen glc slc
48; GFX7-NEXT:    v_mov_b32_e32 v0, s4
49; GFX7-NEXT:    v_mov_b32_e32 v1, s5
50; GFX7-NEXT:    s_waitcnt vmcnt(0)
51; GFX7-NEXT:    flat_store_dword v[0:1], v2
52; GFX7-NEXT:    s_endpgm
53;
54; GFX10-WGP-LABEL: private_nontemporal_load_0:
55; GFX10-WGP:       ; %bb.0: ; %entry
56; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
57; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
58; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
59; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
60; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
61; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
62; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
63; GFX10-WGP-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen slc
64; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
65; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
66; GFX10-WGP-NEXT:    s_endpgm
67;
68; GFX10-CU-LABEL: private_nontemporal_load_0:
69; GFX10-CU:       ; %bb.0: ; %entry
70; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
71; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
72; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
73; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
74; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
75; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
76; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
77; GFX10-CU-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen slc
78; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
79; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
80; GFX10-CU-NEXT:    s_endpgm
81;
82; SKIP-CACHE-INV-LABEL: private_nontemporal_load_0:
83; SKIP-CACHE-INV:       ; %bb.0: ; %entry
84; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
85; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
86; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
87; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
88; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
89; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
90; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
91; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
92; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
93; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
94; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
95; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
96; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
97; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
98; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
99; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
100; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
101; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
102; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
103; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc slc
104; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
105; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
106; SKIP-CACHE-INV-NEXT:    s_endpgm
107;
108; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
109; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
110; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
111; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
112; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
113; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
114; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
115; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
116; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
117; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
118; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
119; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
120; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
121;
122; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
123; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
124; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
125; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
126; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
127; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
128; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
129; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
130; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
131; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
132; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
133; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
134; GFX90A-TGSPLIT-NEXT:    s_endpgm
135;
136; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
137; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
138; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
139; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
140; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
141; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
142; GFX940-NOTTGSPLIT-NEXT:    scratch_load_dword v1, off, s2 nt
143; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
144; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
145; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
146;
147; GFX940-TGSPLIT-LABEL: private_nontemporal_load_0:
148; GFX940-TGSPLIT:       ; %bb.0: ; %entry
149; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
150; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
151; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
152; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
153; GFX940-TGSPLIT-NEXT:    scratch_load_dword v1, off, s2 nt
154; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
155; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
156; GFX940-TGSPLIT-NEXT:    s_endpgm
157;
158; GFX11-WGP-LABEL: private_nontemporal_load_0:
159; GFX11-WGP:       ; %bb.0: ; %entry
160; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
161; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
162; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
163; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
164; GFX11-WGP-NEXT:    scratch_load_b32 v1, off, s2 slc dlc
165; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
166; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
167; GFX11-WGP-NEXT:    s_endpgm
168;
169; GFX11-CU-LABEL: private_nontemporal_load_0:
170; GFX11-CU:       ; %bb.0: ; %entry
171; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
172; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
173; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
174; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
175; GFX11-CU-NEXT:    scratch_load_b32 v1, off, s2 slc dlc
176; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
177; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
178; GFX11-CU-NEXT:    s_endpgm
179;
180; GFX12-WGP-LABEL: private_nontemporal_load_0:
181; GFX12-WGP:       ; %bb.0: ; %entry
182; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
183; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
184; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
185; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
186; GFX12-WGP-NEXT:    scratch_load_b32 v1, off, s2 th:TH_LOAD_NT
187; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
188; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
189; GFX12-WGP-NEXT:    s_endpgm
190;
191; GFX12-CU-LABEL: private_nontemporal_load_0:
192; GFX12-CU:       ; %bb.0: ; %entry
193; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
194; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
195; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
196; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
197; GFX12-CU-NEXT:    scratch_load_b32 v1, off, s2 th:TH_LOAD_NT
198; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
199; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
200; GFX12-CU-NEXT:    s_endpgm
201    ptr addrspace(5) %in, ptr addrspace(1) %out) {
202entry:
  ; Access under test: the !nontemporal load directly from %in.
203  %val = load i32, ptr addrspace(5) %in, align 4, !nontemporal !0
204  store i32 %val, ptr addrspace(1) %out
205  ret void
206}
207
; private_nontemporal_load_1: same shape as private_nontemporal_load_0, but the
; !nontemporal load reads the i32 element of the addrspace(5) pointer %in that
; is indexed by llvm.amdgcn.workitem.id.x (via getelementptr), so the address
; is computed per work-item. The loaded value is then written through the
; addrspace(1) pointer %out with an ordinary store.
;
; The assertions embedded in the signature are autogenerated (see the note at
; the top of the file) and pin the complete -O0 output for each check prefix.
; In every variant the expected code shifts v0 left by 2 (the i32 element size)
; as part of the address computation; the GFX90A, GFX940, GFX11 and GFX12
; variants additionally mask v0 with 0x3ff first. v0 is presumably the register
; holding the workitem id -- that is not stated anywhere in this file.
; The load from %in is expected to carry the same cache-policy operands as in
; private_nontemporal_load_0: "glc slc", "slc", "nt", "slc dlc" or
; "th:TH_LOAD_NT" depending on the target.
; Regenerate the assertions with utils/update_llc_test_checks.py instead of
; editing them by hand.
208define amdgpu_kernel void @private_nontemporal_load_1(
209; GFX6-LABEL: private_nontemporal_load_1:
210; GFX6:       ; %bb.0: ; %entry
211; GFX6-NEXT:    s_add_u32 s0, s0, s15
212; GFX6-NEXT:    s_addc_u32 s1, s1, 0
213; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
214; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
215; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
216; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
217; GFX6-NEXT:    s_mov_b32 s11, s5
218; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
219; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
220; GFX6-NEXT:    s_mov_b32 s10, -1
221; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
222; GFX6-NEXT:    s_mov_b32 s5, s11
223; GFX6-NEXT:    s_mov_b32 s6, s10
224; GFX6-NEXT:    s_mov_b32 s7, s9
225; GFX6-NEXT:    s_mov_b32 s9, 2
226; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s9, v0
227; GFX6-NEXT:    v_add_i32_e64 v0, s[8:9], s8, v0
228; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc slc
229; GFX6-NEXT:    s_waitcnt vmcnt(0)
230; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
231; GFX6-NEXT:    s_endpgm
232;
233; GFX7-LABEL: private_nontemporal_load_1:
234; GFX7:       ; %bb.0: ; %entry
235; GFX7-NEXT:    s_add_u32 s0, s0, s15
236; GFX7-NEXT:    s_addc_u32 s1, s1, 0
237; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
238; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
239; GFX7-NEXT:    s_mov_b32 s7, 2
240; GFX7-NEXT:    v_lshlrev_b32_e64 v0, s7, v0
241; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
242; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s6, v0
243; GFX7-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen glc slc
244; GFX7-NEXT:    v_mov_b32_e32 v0, s4
245; GFX7-NEXT:    v_mov_b32_e32 v1, s5
246; GFX7-NEXT:    s_waitcnt vmcnt(0)
247; GFX7-NEXT:    flat_store_dword v[0:1], v2
248; GFX7-NEXT:    s_endpgm
249;
250; GFX10-WGP-LABEL: private_nontemporal_load_1:
251; GFX10-WGP:       ; %bb.0: ; %entry
252; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
253; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
254; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
255; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x0
256; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
257; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
258; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
259; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
260; GFX10-WGP-NEXT:    v_lshl_add_u32 v1, v1, s6, s7
261; GFX10-WGP-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen slc
262; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
263; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
264; GFX10-WGP-NEXT:    s_endpgm
265;
266; GFX10-CU-LABEL: private_nontemporal_load_1:
267; GFX10-CU:       ; %bb.0: ; %entry
268; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
269; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
270; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
271; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x0
272; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
273; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
274; GFX10-CU-NEXT:    s_mov_b32 s6, 2
275; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
276; GFX10-CU-NEXT:    v_lshl_add_u32 v1, v1, s6, s7
277; GFX10-CU-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen slc
278; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
279; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
280; GFX10-CU-NEXT:    s_endpgm
281;
282; SKIP-CACHE-INV-LABEL: private_nontemporal_load_1:
283; SKIP-CACHE-INV:       ; %bb.0: ; %entry
284; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
285; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
286; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
287; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
288; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
289; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
290; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
291; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
292; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
293; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
294; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
295; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
296; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
297; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
298; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
299; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
300; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
301; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
302; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 2
303; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s5, v0
304; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v0, s[4:5], s4, v0
305; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc slc
306; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
307; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
308; SKIP-CACHE-INV-NEXT:    s_endpgm
309;
310; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
311; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
312; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
313; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
314; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
315; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x0
316; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
317; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
318; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
319; GFX90A-NOTTGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s6
320; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 2
321; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
322; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
323; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s6, v2
324; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
325; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
326; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
327; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
328;
329; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
330; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
331; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
332; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
333; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
334; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x0
335; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
336; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
337; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
338; GFX90A-TGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s6
339; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 2
340; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
341; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
342; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s6, v2
343; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
344; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
345; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
346; GFX90A-TGSPLIT-NEXT:    s_endpgm
347;
348; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
349; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
350; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
351; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x0
352; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
353; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
354; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s2, 0x3ff
355; GFX940-NOTTGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s2
356; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s2, 2
357; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
358; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
359; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s2, v2
360; GFX940-NOTTGSPLIT-NEXT:    scratch_load_dword v1, v1, off nt
361; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
362; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
363; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
364;
365; GFX940-TGSPLIT-LABEL: private_nontemporal_load_1:
366; GFX940-TGSPLIT:       ; %bb.0: ; %entry
367; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
368; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x0
369; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
370; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
371; GFX940-TGSPLIT-NEXT:    s_mov_b32 s2, 0x3ff
372; GFX940-TGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s2
373; GFX940-TGSPLIT-NEXT:    s_mov_b32 s2, 2
374; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
375; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
376; GFX940-TGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s2, v2
377; GFX940-TGSPLIT-NEXT:    scratch_load_dword v1, v1, off nt
378; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
379; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
380; GFX940-TGSPLIT-NEXT:    s_endpgm
381;
382; GFX11-WGP-LABEL: private_nontemporal_load_1:
383; GFX11-WGP:       ; %bb.0: ; %entry
384; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v0
385; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x0
386; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
387; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
388; GFX11-WGP-NEXT:    s_mov_b32 s2, 0x3ff
389; GFX11-WGP-NEXT:    v_and_b32_e64 v1, v1, s2
390; GFX11-WGP-NEXT:    s_mov_b32 s2, 2
391; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
392; GFX11-WGP-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
393; GFX11-WGP-NEXT:    scratch_load_b32 v1, v1, off slc dlc
394; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
395; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
396; GFX11-WGP-NEXT:    s_endpgm
397;
398; GFX11-CU-LABEL: private_nontemporal_load_1:
399; GFX11-CU:       ; %bb.0: ; %entry
400; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v0
401; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x0
402; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
403; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
404; GFX11-CU-NEXT:    s_mov_b32 s2, 0x3ff
405; GFX11-CU-NEXT:    v_and_b32_e64 v1, v1, s2
406; GFX11-CU-NEXT:    s_mov_b32 s2, 2
407; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
408; GFX11-CU-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
409; GFX11-CU-NEXT:    scratch_load_b32 v1, v1, off slc dlc
410; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
411; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
412; GFX11-CU-NEXT:    s_endpgm
413;
414; GFX12-WGP-LABEL: private_nontemporal_load_1:
415; GFX12-WGP:       ; %bb.0: ; %entry
416; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v0
417; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
418; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
419; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
420; GFX12-WGP-NEXT:    s_mov_b32 s3, 0x3ff
421; GFX12-WGP-NEXT:    s_wait_alu 0xfffe
422; GFX12-WGP-NEXT:    v_and_b32_e64 v1, v1, s3
423; GFX12-WGP-NEXT:    s_mov_b32 s3, 2
424; GFX12-WGP-NEXT:    s_wait_alu 0xfffe
425; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v1, s3, v1
426; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
427; GFX12-WGP-NEXT:    scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT
428; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
429; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
430; GFX12-WGP-NEXT:    s_endpgm
431;
432; GFX12-CU-LABEL: private_nontemporal_load_1:
433; GFX12-CU:       ; %bb.0: ; %entry
434; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v0
435; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
436; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
437; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
438; GFX12-CU-NEXT:    s_mov_b32 s3, 0x3ff
439; GFX12-CU-NEXT:    s_wait_alu 0xfffe
440; GFX12-CU-NEXT:    v_and_b32_e64 v1, v1, s3
441; GFX12-CU-NEXT:    s_mov_b32 s3, 2
442; GFX12-CU-NEXT:    s_wait_alu 0xfffe
443; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v1, s3, v1
444; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
445; GFX12-CU-NEXT:    scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT
446; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
447; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
448; GFX12-CU-NEXT:    s_endpgm
449    ptr addrspace(5) %in, ptr addrspace(1) %out) {
450entry:
  ; Per-work-item element address: %in indexed by the workitem id in x.
451  %tid = call i32 @llvm.amdgcn.workitem.id.x()
452  %val.gep = getelementptr inbounds i32, ptr addrspace(5) %in, i32 %tid
  ; Access under test: the !nontemporal load from the computed address.
453  %val = load i32, ptr addrspace(5) %val.gep, align 4, !nontemporal !0
454  store i32 %val, ptr addrspace(1) %out
455  ret void
456}
457
; private_nontemporal_store_0: kernel that reads one i32 through the
; addrspace(1) pointer %in with an ordinary load and writes it through the
; addrspace(5) pointer %out, with !nontemporal metadata on the store.
;
; The assertions embedded in the signature are autogenerated (see the note at
; the top of the file) and pin the complete -O0 output for each check prefix.
; The instruction of interest is the store to %out. Its expected cache-policy
; operands are:
;   - "glc slc" for the GFX6, GFX7, GFX10, SKIP-CACHE-INV and GFX90A prefixes
;   - "sc0 nt sc1" for the GFX940 prefixes
;   - "glc slc dlc" for the GFX11 prefixes
;   - "th:TH_STORE_NT" for the GFX12 prefixes
; In every variant the value read from %in is fetched with a scalar load
; (s_load_dword / s_load_b32) and carries no cache-policy operands.
; Regenerate the assertions with utils/update_llc_test_checks.py instead of
; editing them by hand.
458define amdgpu_kernel void @private_nontemporal_store_0(
459; GFX6-LABEL: private_nontemporal_store_0:
460; GFX6:       ; %bb.0: ; %entry
461; GFX6-NEXT:    s_add_u32 s0, s0, s15
462; GFX6-NEXT:    s_addc_u32 s1, s1, 0
463; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
464; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x2
465; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
466; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
467; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
468; GFX6-NEXT:    v_mov_b32_e32 v0, s5
469; GFX6-NEXT:    v_mov_b32_e32 v1, s4
470; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
471; GFX6-NEXT:    s_endpgm
472;
473; GFX7-LABEL: private_nontemporal_store_0:
474; GFX7:       ; %bb.0: ; %entry
475; GFX7-NEXT:    s_add_u32 s0, s0, s15
476; GFX7-NEXT:    s_addc_u32 s1, s1, 0
477; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
478; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
479; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
480; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
481; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
482; GFX7-NEXT:    v_mov_b32_e32 v0, s5
483; GFX7-NEXT:    v_mov_b32_e32 v1, s4
484; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
485; GFX7-NEXT:    s_endpgm
486;
487; GFX10-WGP-LABEL: private_nontemporal_store_0:
488; GFX10-WGP:       ; %bb.0: ; %entry
489; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
490; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
491; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
492; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
493; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
494; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
495; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
496; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
497; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
498; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
499; GFX10-WGP-NEXT:    s_endpgm
500;
501; GFX10-CU-LABEL: private_nontemporal_store_0:
502; GFX10-CU:       ; %bb.0: ; %entry
503; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
504; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
505; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
506; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
507; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
508; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
509; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
510; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
511; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
512; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
513; GFX10-CU-NEXT:    s_endpgm
514;
515; SKIP-CACHE-INV-LABEL: private_nontemporal_store_0:
516; SKIP-CACHE-INV:       ; %bb.0: ; %entry
517; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
518; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
519; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
520; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
521; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
522; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
523; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
524; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
525; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
526; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
527; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
528; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
529; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
530; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen glc slc
531; SKIP-CACHE-INV-NEXT:    s_endpgm
532;
533; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
534; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
535; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
536; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
537; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
538; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
539; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
540; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
541; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
542; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
543; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
544; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
545; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
546;
547; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
548; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
549; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
550; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
551; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
552; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
553; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
554; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
555; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
556; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
557; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
558; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
559; GFX90A-TGSPLIT-NEXT:    s_endpgm
560;
561; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
562; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
563; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
564; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
565; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
566; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
567; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
568; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
569; GFX940-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0 sc0 nt sc1
570; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
571;
572; GFX940-TGSPLIT-LABEL: private_nontemporal_store_0:
573; GFX940-TGSPLIT:       ; %bb.0: ; %entry
574; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
575; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
576; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
577; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
578; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
579; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
580; GFX940-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0 sc0 nt sc1
581; GFX940-TGSPLIT-NEXT:    s_endpgm
582;
583; GFX11-WGP-LABEL: private_nontemporal_store_0:
584; GFX11-WGP:       ; %bb.0: ; %entry
585; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
586; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
587; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
588; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
589; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
590; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
591; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0 glc slc dlc
592; GFX11-WGP-NEXT:    s_endpgm
593;
594; GFX11-CU-LABEL: private_nontemporal_store_0:
595; GFX11-CU:       ; %bb.0: ; %entry
596; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
597; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
598; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
599; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
600; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
601; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
602; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0 glc slc dlc
603; GFX11-CU-NEXT:    s_endpgm
604;
605; GFX12-WGP-LABEL: private_nontemporal_store_0:
606; GFX12-WGP:       ; %bb.0: ; %entry
607; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
608; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
609; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
610; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
611; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
612; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
613; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT
614; GFX12-WGP-NEXT:    s_endpgm
615;
616; GFX12-CU-LABEL: private_nontemporal_store_0:
617; GFX12-CU:       ; %bb.0: ; %entry
618; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
619; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
620; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
621; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
622; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
623; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
624; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT
625; GFX12-CU-NEXT:    s_endpgm
626    ptr addrspace(1) %in, ptr addrspace(5) %out) {
627entry:
628  %val = load i32, ptr addrspace(1) %in, align 4
  ; Access under test: the !nontemporal store directly to %out.
629  store i32 %val, ptr addrspace(5) %out, !nontemporal !0
630  ret void
631}
632
; Test: !nontemporal store to private memory at a per-work-item address.
; The kernel loads an i32 through %in (addrspace(1)) and stores it, tagged
; !nontemporal, to element workitem.id.x of %out (addrspace(5)).
; The check blocks below pin the full -O0 instruction sequence for each RUN
; configuration; the line of interest in each is the final store and the cache
; modifiers it carries (`glc slc` on the buffer_store forms, `sc0 nt sc1` for
; the GFX940 prefixes, `glc slc dlc` for GFX11, `th:TH_STORE_NT` for GFX12).
; These assertions are autogenerated (see the NOTE at the top of the file):
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
633define amdgpu_kernel void @private_nontemporal_store_1(
634; GFX6-LABEL: private_nontemporal_store_1:
635; GFX6:       ; %bb.0: ; %entry
636; GFX6-NEXT:    s_add_u32 s0, s0, s15
637; GFX6-NEXT:    s_addc_u32 s1, s1, 0
638; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
639; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
640; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
641; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
642; GFX6-NEXT:    s_mov_b32 s6, 2
643; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s6, v0
644; GFX6-NEXT:    v_add_i32_e64 v1, s[6:7], s5, v0
645; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
646; GFX6-NEXT:    v_mov_b32_e32 v0, s4
647; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
648; GFX6-NEXT:    s_endpgm
649;
650; GFX7-LABEL: private_nontemporal_store_1:
651; GFX7:       ; %bb.0: ; %entry
652; GFX7-NEXT:    s_add_u32 s0, s0, s15
653; GFX7-NEXT:    s_addc_u32 s1, s1, 0
654; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
655; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
656; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
657; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
658; GFX7-NEXT:    s_mov_b32 s6, 2
659; GFX7-NEXT:    v_lshlrev_b32_e64 v0, s6, v0
660; GFX7-NEXT:    v_add_i32_e64 v1, s[6:7], s5, v0
661; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
662; GFX7-NEXT:    v_mov_b32_e32 v0, s4
663; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
664; GFX7-NEXT:    s_endpgm
665;
666; GFX10-WGP-LABEL: private_nontemporal_store_1:
667; GFX10-WGP:       ; %bb.0: ; %entry
668; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
669; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
670; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
671; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
672; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
673; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x0
674; GFX10-WGP-NEXT:    s_mov_b32 s5, 2
675; GFX10-WGP-NEXT:    v_lshl_add_u32 v1, v0, s5, s6
676; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
678; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
679; GFX10-WGP-NEXT:    s_endpgm
680;
681; GFX10-CU-LABEL: private_nontemporal_store_1:
682; GFX10-CU:       ; %bb.0: ; %entry
683; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
684; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
685; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
686; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
687; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
688; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x0
689; GFX10-CU-NEXT:    s_mov_b32 s5, 2
690; GFX10-CU-NEXT:    v_lshl_add_u32 v1, v0, s5, s6
691; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
692; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
693; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
694; GFX10-CU-NEXT:    s_endpgm
695;
696; SKIP-CACHE-INV-LABEL: private_nontemporal_store_1:
697; SKIP-CACHE-INV:       ; %bb.0: ; %entry
698; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
699; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
700; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
701; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
702; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
703; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
704; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
705; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
706; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
707; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
708; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, 2
709; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s2, v0
710; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v1, s[2:3], s1, v0
711; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
712; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
713; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen glc slc
714; SKIP-CACHE-INV-NEXT:    s_endpgm
715;
716; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
717; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
718; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
719; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
720; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
721; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
722; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
723; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x0
724; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s5, 0x3ff
725; GFX90A-NOTTGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s5
726; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s5, 2
727; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
728; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v1, v0, s5, v1
729; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
730; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
731; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
732; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
733;
734; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
735; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
736; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
737; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
738; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
739; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
740; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
741; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x0
742; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s5, 0x3ff
743; GFX90A-TGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s5
744; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s5, 2
745; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
746; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v1, v0, s5, v1
747; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
748; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
749; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
750; GFX90A-TGSPLIT-NEXT:    s_endpgm
751;
752; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
753; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
754; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
755; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
756; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
757; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
758; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 0x3ff
759; GFX940-NOTTGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s1
760; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 2
761; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
762; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v1, v0, s1, v1
763; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
764; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
765; GFX940-NOTTGSPLIT-NEXT:    scratch_store_dword v1, v0, off sc0 nt sc1
766; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
767;
768; GFX940-TGSPLIT-LABEL: private_nontemporal_store_1:
769; GFX940-TGSPLIT:       ; %bb.0: ; %entry
770; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
771; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
772; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
773; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
774; GFX940-TGSPLIT-NEXT:    s_mov_b32 s1, 0x3ff
775; GFX940-TGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s1
776; GFX940-TGSPLIT-NEXT:    s_mov_b32 s1, 2
777; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
778; GFX940-TGSPLIT-NEXT:    v_lshl_add_u32 v1, v0, s1, v1
779; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
780; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
781; GFX940-TGSPLIT-NEXT:    scratch_store_dword v1, v0, off sc0 nt sc1
782; GFX940-TGSPLIT-NEXT:    s_endpgm
783;
784; GFX11-WGP-LABEL: private_nontemporal_store_1:
785; GFX11-WGP:       ; %bb.0: ; %entry
786; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
787; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
788; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
789; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
790; GFX11-WGP-NEXT:    s_mov_b32 s1, 0x3ff
791; GFX11-WGP-NEXT:    v_and_b32_e64 v0, v0, s1
792; GFX11-WGP-NEXT:    s_mov_b32 s1, 2
793; GFX11-WGP-NEXT:    v_lshl_add_u32 v1, v0, s1, s2
794; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
795; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
796; GFX11-WGP-NEXT:    scratch_store_b32 v1, v0, off glc slc dlc
797; GFX11-WGP-NEXT:    s_endpgm
798;
799; GFX11-CU-LABEL: private_nontemporal_store_1:
800; GFX11-CU:       ; %bb.0: ; %entry
801; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
802; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
803; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
804; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
805; GFX11-CU-NEXT:    s_mov_b32 s1, 0x3ff
806; GFX11-CU-NEXT:    v_and_b32_e64 v0, v0, s1
807; GFX11-CU-NEXT:    s_mov_b32 s1, 2
808; GFX11-CU-NEXT:    v_lshl_add_u32 v1, v0, s1, s2
809; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
810; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
811; GFX11-CU-NEXT:    scratch_store_b32 v1, v0, off glc slc dlc
812; GFX11-CU-NEXT:    s_endpgm
813;
814; GFX12-WGP-LABEL: private_nontemporal_store_1:
815; GFX12-WGP:       ; %bb.0: ; %entry
816; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
817; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
818; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
819; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
820; GFX12-WGP-NEXT:    s_mov_b32 s2, 0x3ff
821; GFX12-WGP-NEXT:    s_wait_alu 0xfffe
822; GFX12-WGP-NEXT:    v_and_b32_e64 v0, v0, s2
823; GFX12-WGP-NEXT:    s_mov_b32 s2, 2
824; GFX12-WGP-NEXT:    s_wait_alu 0xfffe
825; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
826; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
827; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
828; GFX12-WGP-NEXT:    scratch_store_b32 v1, v0, s0 th:TH_STORE_NT
829; GFX12-WGP-NEXT:    s_endpgm
830;
831; GFX12-CU-LABEL: private_nontemporal_store_1:
832; GFX12-CU:       ; %bb.0: ; %entry
833; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
834; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
835; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
836; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
837; GFX12-CU-NEXT:    s_mov_b32 s2, 0x3ff
838; GFX12-CU-NEXT:    s_wait_alu 0xfffe
839; GFX12-CU-NEXT:    v_and_b32_e64 v0, v0, s2
840; GFX12-CU-NEXT:    s_mov_b32 s2, 2
841; GFX12-CU-NEXT:    s_wait_alu 0xfffe
842; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
843; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
844; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
845; GFX12-CU-NEXT:    scratch_store_b32 v1, v0, s0 th:TH_STORE_NT
846; GFX12-CU-NEXT:    s_endpgm
847    ptr addrspace(1) %in, ptr addrspace(5) %out) {
848entry:
; The destination is indexed by the work-item id in x, so the store address is
; computed per work-item rather than being a uniform kernel argument.
849  %tid = call i32 @llvm.amdgcn.workitem.id.x()
850  %val = load i32, ptr addrspace(1) %in, align 4
851  %out.gep = getelementptr inbounds i32, ptr addrspace(5) %out, i32 %tid
; Access under test: only this store carries the !nontemporal attachment.
852  store i32 %val, ptr addrspace(5) %out.gep, !nontemporal !0
853  ret void
854}
855
; Test: a load from private memory that is both volatile and !nontemporal.
; The kernel loads an i32 from %in (addrspace(5)) with `load volatile` plus a
; !nontemporal attachment, then writes the value to %out (addrspace(1)) with an
; ordinary store.
; The check blocks below pin the full -O0 instruction sequence for each RUN
; configuration; the lines of interest are the private load (buffer_load_dword
; or scratch_load_*), the cache modifiers it carries, and the wait that
; separates it from the following store.
; These assertions are autogenerated (see the NOTE at the top of the file):
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
856define amdgpu_kernel void @private_nontemporal_volatile_load(
857; GFX6-LABEL: private_nontemporal_volatile_load:
858; GFX6:       ; %bb.0: ; %entry
859; GFX6-NEXT:    s_add_u32 s0, s0, s15
860; GFX6-NEXT:    s_addc_u32 s1, s1, 0
861; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
862; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
863; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
864; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
865; GFX6-NEXT:    s_mov_b32 s11, s5
866; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
867; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
868; GFX6-NEXT:    s_mov_b32 s10, -1
869; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
870; GFX6-NEXT:    s_mov_b32 s5, s11
871; GFX6-NEXT:    s_mov_b32 s6, s10
872; GFX6-NEXT:    s_mov_b32 s7, s9
873; GFX6-NEXT:    v_mov_b32_e32 v0, s8
874; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc
875; GFX6-NEXT:    s_waitcnt vmcnt(0)
876; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
877; GFX6-NEXT:    s_endpgm
878;
879; GFX7-LABEL: private_nontemporal_volatile_load:
880; GFX7:       ; %bb.0: ; %entry
881; GFX7-NEXT:    s_add_u32 s0, s0, s15
882; GFX7-NEXT:    s_addc_u32 s1, s1, 0
883; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
884; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
885; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
886; GFX7-NEXT:    v_mov_b32_e32 v0, s6
887; GFX7-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen glc
888; GFX7-NEXT:    s_waitcnt vmcnt(0)
889; GFX7-NEXT:    v_mov_b32_e32 v0, s4
890; GFX7-NEXT:    v_mov_b32_e32 v1, s5
891; GFX7-NEXT:    flat_store_dword v[0:1], v2
892; GFX7-NEXT:    s_endpgm
893;
894; GFX10-WGP-LABEL: private_nontemporal_volatile_load:
895; GFX10-WGP:       ; %bb.0: ; %entry
896; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
897; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
898; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
899; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
900; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
901; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
902; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
903; GFX10-WGP-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
904; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
905; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
906; GFX10-WGP-NEXT:    s_endpgm
907;
908; GFX10-CU-LABEL: private_nontemporal_volatile_load:
909; GFX10-CU:       ; %bb.0: ; %entry
910; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
911; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
912; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
913; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
914; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
915; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
916; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
917; GFX10-CU-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
918; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
919; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
920; GFX10-CU-NEXT:    s_endpgm
921;
922; SKIP-CACHE-INV-LABEL: private_nontemporal_volatile_load:
923; SKIP-CACHE-INV:       ; %bb.0: ; %entry
924; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
925; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
926; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
927; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
928; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
929; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
930; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
931; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
932; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
933; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
934; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
935; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
936; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
937; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
938; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
939; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
940; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
941; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
942; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
943; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc
944; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
945; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
946; SKIP-CACHE-INV-NEXT:    s_endpgm
947;
948; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load:
949; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
950; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
951; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
952; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
953; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
954; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
955; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
956; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
957; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc
958; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
959; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
960; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
961;
962; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load:
963; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
964; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
965; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
966; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
967; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
968; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
969; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
970; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
971; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc
972; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
973; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
974; GFX90A-TGSPLIT-NEXT:    s_endpgm
975;
976; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load:
977; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
978; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
979; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
980; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
981; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
982; GFX940-NOTTGSPLIT-NEXT:    scratch_load_dword v1, off, s2 sc0 sc1
983; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
984; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
985; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
986;
987; GFX940-TGSPLIT-LABEL: private_nontemporal_volatile_load:
988; GFX940-TGSPLIT:       ; %bb.0: ; %entry
989; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
990; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
991; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
992; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
993; GFX940-TGSPLIT-NEXT:    scratch_load_dword v1, off, s2 sc0 sc1
994; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
995; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
996; GFX940-TGSPLIT-NEXT:    s_endpgm
997;
998; GFX11-WGP-LABEL: private_nontemporal_volatile_load:
999; GFX11-WGP:       ; %bb.0: ; %entry
1000; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
1001; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
1002; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
1003; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
1004; GFX11-WGP-NEXT:    scratch_load_b32 v1, off, s2 glc dlc
1005; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
1006; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
1007; GFX11-WGP-NEXT:    s_endpgm
1008;
1009; GFX11-CU-LABEL: private_nontemporal_volatile_load:
1010; GFX11-CU:       ; %bb.0: ; %entry
1011; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
1012; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
1013; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
1014; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
1015; GFX11-CU-NEXT:    scratch_load_b32 v1, off, s2 glc dlc
1016; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
1017; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
1018; GFX11-CU-NEXT:    s_endpgm
1019;
1020; GFX12-WGP-LABEL: private_nontemporal_volatile_load:
1021; GFX12-WGP:       ; %bb.0: ; %entry
1022; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
1023; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
1024; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
1025; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
1026; GFX12-WGP-NEXT:    scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS
1027; GFX12-WGP-NEXT:    s_wait_bvhcnt 0x0
1028; GFX12-WGP-NEXT:    s_wait_samplecnt 0x0
1029; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
1030; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
1031; GFX12-WGP-NEXT:    s_endpgm
1032;
1033; GFX12-CU-LABEL: private_nontemporal_volatile_load:
1034; GFX12-CU:       ; %bb.0: ; %entry
1035; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
1036; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
1037; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
1038; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
1039; GFX12-CU-NEXT:    scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS
1040; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
1041; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
1042; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
1043; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
1044; GFX12-CU-NEXT:    s_endpgm
1045    ptr addrspace(5) %in, ptr addrspace(1) %out) {
1046entry:
; Access under test: the load is marked volatile and also tagged !nontemporal.
; The store that follows has neither marker.
1047  %val = load volatile i32, ptr addrspace(5) %in, align 4, !nontemporal !0
1048  store i32 %val, ptr addrspace(1) %out
1049  ret void
1050}
1051
; Metadata node referenced by the `!nontemporal !0` attachments on the loads
; and stores in this file.
1052!0 = !{i32 1}
; Work-item id (x) intrinsic, called by the kernels that index their private
; pointer per work-item (e.g. @private_nontemporal_store_1).
1053declare i32 @llvm.amdgcn.workitem.id.x()
1054