xref: /llvm-project/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll (revision cc3aab580b680e8566e9f7a1ff9feff895ecfc49)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9,GFX9-SDAG %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9,GFX9-GISEL %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX940,GFX940-SDAG %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX940,GFX940-GISEL %s
6; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10,GFX10-SDAG %s
7; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL %s
8; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG %s
9; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL %s
10; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s
11; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s
12
; Test case: a kernel that loads one i32 through %in and stores it through
; %out. Both pointers are ptr addrspace(7) and both accesses carry
; !nontemporal !0 (the !0 node is defined outside this function).
;
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing by hand. Modifiers
; they currently expect on the buffer load / buffer store, per -mcpu value:
;   gfx900   load "glc slc"         store "glc slc"
;   gfx940   load "nt"              store "sc0 nt sc1"
;   gfx1010  load "slc"             store "glc slc"
;   gfx1100  load "slc dlc"         store "glc slc dlc"
;   gfx1200  load "th:TH_LOAD_NT"   store "th:TH_STORE_NT"
; For gfx940, gfx1100 and gfx1200 the SelectionDAG output additionally emits
; scratch_store/scratch_load instructions and wraps each buffer access in a
; v_readfirstlane_b32 / s_and_saveexec loop; the GlobalISel output does not.
13define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, ptr addrspace(7) %out) {
14; GFX9-SDAG-LABEL: buffer_nontemporal_load_store:
15; GFX9-SDAG:       ; %bb.0: ; %entry
16; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
17; GFX9-SDAG-NEXT:    s_load_dword s11, s[8:9], 0x10
18; GFX9-SDAG-NEXT:    s_mov_b32 s10, 0
19; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s15
20; GFX9-SDAG-NEXT:    s_mov_b32 s15, s10
21; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
22; GFX9-SDAG-NEXT:    s_mov_b32 s14, s7
23; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
24; GFX9-SDAG-NEXT:    s_mov_b32 s12, s5
25; GFX9-SDAG-NEXT:    s_or_b64 s[14:15], s[14:15], s[10:11]
26; GFX9-SDAG-NEXT:    s_mov_b32 s13, s6
27; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
28; GFX9-SDAG-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc slc
29; GFX9-SDAG-NEXT:    s_load_dword s11, s[8:9], 0x30
30; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x20
31; GFX9-SDAG-NEXT:    s_mov_b32 s9, s10
32; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
33; GFX9-SDAG-NEXT:    s_mov_b32 s8, s7
34; GFX9-SDAG-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
35; GFX9-SDAG-NEXT:    s_mov_b32 s8, s5
36; GFX9-SDAG-NEXT:    s_mov_b32 s9, s6
37; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s4
38; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
39; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
40; GFX9-SDAG-NEXT:    s_endpgm
41;
42; GFX9-GISEL-LABEL: buffer_nontemporal_load_store:
43; GFX9-GISEL:       ; %bb.0: ; %entry
44; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
45; GFX9-GISEL-NEXT:    s_load_dword s7, s[8:9], 0x10
46; GFX9-GISEL-NEXT:    s_mov_b32 s11, 0
47; GFX9-GISEL-NEXT:    s_mov_b32 s4, s11
48; GFX9-GISEL-NEXT:    s_mov_b32 s6, s11
49; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
50; GFX9-GISEL-NEXT:    s_mov_b32 s10, s1
51; GFX9-GISEL-NEXT:    s_mov_b32 s5, s2
52; GFX9-GISEL-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
53; GFX9-GISEL-NEXT:    s_mov_b32 s10, s3
54; GFX9-GISEL-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
55; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
56; GFX9-GISEL-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen glc slc
57; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
58; GFX9-GISEL-NEXT:    s_load_dword s7, s[8:9], 0x30
59; GFX9-GISEL-NEXT:    s_mov_b32 s4, s11
60; GFX9-GISEL-NEXT:    s_mov_b32 s6, s11
61; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
62; GFX9-GISEL-NEXT:    s_mov_b32 s10, s1
63; GFX9-GISEL-NEXT:    s_mov_b32 s5, s2
64; GFX9-GISEL-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
65; GFX9-GISEL-NEXT:    s_mov_b32 s10, s3
66; GFX9-GISEL-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
67; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s0
68; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
69; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
70; GFX9-GISEL-NEXT:    s_endpgm
71;
72; GFX940-SDAG-LABEL: buffer_nontemporal_load_store:
73; GFX940-SDAG:       ; %bb.0: ; %entry
74; GFX940-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
75; GFX940-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x10
76; GFX940-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
77; GFX940-SDAG-NEXT:    s_load_dword s7, s[4:5], 0x30
78; GFX940-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
79; GFX940-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
80; GFX940-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
81; GFX940-SDAG-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1
82; GFX940-SDAG-NEXT:    scratch_load_dwordx2 v[10:11], off, off offset:40
83; GFX940-SDAG-NEXT:    scratch_load_dword v4, off, off offset:36
84; GFX940-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
85; GFX940-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
86; GFX940-SDAG-NEXT:    scratch_store_dwordx4 off, v[0:3], off sc0 sc1
87; GFX940-SDAG-NEXT:    scratch_load_dwordx2 v[12:13], off, off offset:8
88; GFX940-SDAG-NEXT:    s_nop 0
89; GFX940-SDAG-NEXT:    scratch_load_dword v0, off, off offset:4
90; GFX940-SDAG-NEXT:    v_mov_b32_e32 v7, s6
91; GFX940-SDAG-NEXT:    v_mov_b32_e32 v3, s7
92; GFX940-SDAG-NEXT:    v_mov_b32_e32 v9, s0
93; GFX940-SDAG-NEXT:    s_mov_b64 s[2:3], exec
94; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(4)
95; GFX940-SDAG-NEXT:    v_mov_b32_e32 v6, v11
96; GFX940-SDAG-NEXT:    v_mov_b32_e32 v5, v10
97; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(1)
98; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, v13
99; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, v12
100; GFX940-SDAG-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
101; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
102; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
103; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s6, v6
104; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s7, v7
105; GFX940-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
106; GFX940-SDAG-NEXT:    s_nop 0
107; GFX940-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
108; GFX940-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
109; GFX940-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
110; GFX940-SDAG-NEXT:    buffer_load_dword v8, v9, s[4:7], 0 offen nt
111; GFX940-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
112; GFX940-SDAG-NEXT:    ; implicit-def: $vgpr9
113; GFX940-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
114; GFX940-SDAG-NEXT:    s_cbranch_execnz .LBB0_1
115; GFX940-SDAG-NEXT:  ; %bb.2:
116; GFX940-SDAG-NEXT:    s_mov_b64 exec, s[2:3]
117; GFX940-SDAG-NEXT:    v_mov_b32_e32 v4, s8
118; GFX940-SDAG-NEXT:    s_mov_b64 s[0:1], exec
119; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
120; GFX940-SDAG-NEXT:  .LBB0_3: ; =>This Inner Loop Header: Depth=1
121; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
122; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
123; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
124; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
125; GFX940-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
126; GFX940-SDAG-NEXT:    s_nop 0
127; GFX940-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
128; GFX940-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
129; GFX940-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
130; GFX940-SDAG-NEXT:    buffer_store_dword v8, v4, s[4:7], 0 offen sc0 nt sc1
131; GFX940-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
132; GFX940-SDAG-NEXT:    ; implicit-def: $vgpr8
133; GFX940-SDAG-NEXT:    ; implicit-def: $vgpr4
134; GFX940-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
135; GFX940-SDAG-NEXT:    s_cbranch_execnz .LBB0_3
136; GFX940-SDAG-NEXT:  ; %bb.4:
137; GFX940-SDAG-NEXT:    s_endpgm
138;
139; GFX940-GISEL-LABEL: buffer_nontemporal_load_store:
140; GFX940-GISEL:       ; %bb.0: ; %entry
141; GFX940-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
142; GFX940-GISEL-NEXT:    s_load_dword s11, s[4:5], 0x10
143; GFX940-GISEL-NEXT:    s_mov_b32 s7, 0
144; GFX940-GISEL-NEXT:    s_mov_b32 s8, s7
145; GFX940-GISEL-NEXT:    s_mov_b32 s10, s7
146; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
147; GFX940-GISEL-NEXT:    s_mov_b32 s6, s1
148; GFX940-GISEL-NEXT:    s_mov_b32 s9, s2
149; GFX940-GISEL-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
150; GFX940-GISEL-NEXT:    s_mov_b32 s6, s3
151; GFX940-GISEL-NEXT:    s_or_b64 s[10:11], s[6:7], s[10:11]
152; GFX940-GISEL-NEXT:    v_mov_b32_e32 v0, s0
153; GFX940-GISEL-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen nt
154; GFX940-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
155; GFX940-GISEL-NEXT:    s_load_dword s9, s[4:5], 0x30
156; GFX940-GISEL-NEXT:    s_mov_b32 s4, s7
157; GFX940-GISEL-NEXT:    s_mov_b32 s8, s7
158; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
159; GFX940-GISEL-NEXT:    s_mov_b32 s6, s1
160; GFX940-GISEL-NEXT:    s_mov_b32 s5, s2
161; GFX940-GISEL-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
162; GFX940-GISEL-NEXT:    s_mov_b32 s6, s3
163; GFX940-GISEL-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
164; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, s0
165; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
166; GFX940-GISEL-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen sc0 nt sc1
167; GFX940-GISEL-NEXT:    s_endpgm
168;
169; GFX10-SDAG-LABEL: buffer_nontemporal_load_store:
170; GFX10-SDAG:       ; %bb.0: ; %entry
171; GFX10-SDAG-NEXT:    s_clause 0x1
172; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
173; GFX10-SDAG-NEXT:    s_load_dword s11, s[8:9], 0x10
174; GFX10-SDAG-NEXT:    s_mov_b32 s10, 0
175; GFX10-SDAG-NEXT:    s_add_u32 s0, s0, s15
176; GFX10-SDAG-NEXT:    s_mov_b32 s13, s10
177; GFX10-SDAG-NEXT:    s_addc_u32 s1, s1, 0
178; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
179; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s4
180; GFX10-SDAG-NEXT:    s_mov_b32 s12, s7
181; GFX10-SDAG-NEXT:    s_or_b64 s[14:15], s[12:13], s[10:11]
182; GFX10-SDAG-NEXT:    s_mov_b32 s12, s5
183; GFX10-SDAG-NEXT:    s_mov_b32 s13, s6
184; GFX10-SDAG-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen slc
185; GFX10-SDAG-NEXT:    s_clause 0x1
186; GFX10-SDAG-NEXT:    s_load_dword s11, s[8:9], 0x30
187; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x20
188; GFX10-SDAG-NEXT:    s_mov_b32 s9, s10
189; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
190; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s4
191; GFX10-SDAG-NEXT:    s_mov_b32 s8, s7
192; GFX10-SDAG-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
193; GFX10-SDAG-NEXT:    s_mov_b32 s8, s5
194; GFX10-SDAG-NEXT:    s_mov_b32 s9, s6
195; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
196; GFX10-SDAG-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
197; GFX10-SDAG-NEXT:    s_endpgm
198;
199; GFX10-GISEL-LABEL: buffer_nontemporal_load_store:
200; GFX10-GISEL:       ; %bb.0: ; %entry
201; GFX10-GISEL-NEXT:    s_clause 0x1
202; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
203; GFX10-GISEL-NEXT:    s_load_dword s5, s[8:9], 0x10
204; GFX10-GISEL-NEXT:    s_mov_b32 s7, 0
205; GFX10-GISEL-NEXT:    s_mov_b32 s10, s7
206; GFX10-GISEL-NEXT:    s_mov_b32 s4, s7
207; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
208; GFX10-GISEL-NEXT:    s_mov_b32 s6, s1
209; GFX10-GISEL-NEXT:    s_mov_b32 s11, s2
210; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
211; GFX10-GISEL-NEXT:    s_or_b64 s[0:1], s[6:7], s[10:11]
212; GFX10-GISEL-NEXT:    s_mov_b32 s6, s3
213; GFX10-GISEL-NEXT:    s_or_b64 s[2:3], s[6:7], s[4:5]
214; GFX10-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen slc
215; GFX10-GISEL-NEXT:    s_clause 0x1
216; GFX10-GISEL-NEXT:    s_waitcnt_depctr 0xffe3
217; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
218; GFX10-GISEL-NEXT:    s_load_dword s11, s[8:9], 0x30
219; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
220; GFX10-GISEL-NEXT:    s_mov_b32 s6, s1
221; GFX10-GISEL-NEXT:    s_mov_b32 s5, s2
222; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s0
223; GFX10-GISEL-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
224; GFX10-GISEL-NEXT:    s_mov_b32 s6, s3
225; GFX10-GISEL-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
226; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
227; GFX10-GISEL-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen glc slc
228; GFX10-GISEL-NEXT:    s_endpgm
229;
230; GFX11-SDAG-LABEL: buffer_nontemporal_load_store:
231; GFX11-SDAG:       ; %bb.0: ; %entry
232; GFX11-SDAG-NEXT:    s_clause 0x2
233; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
234; GFX11-SDAG-NEXT:    s_load_b128 s[8:11], s[4:5], 0x20
235; GFX11-SDAG-NEXT:    s_load_b32 s6, s[4:5], 0x10
236; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
237; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
238; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
239; GFX11-SDAG-NEXT:    v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
240; GFX11-SDAG-NEXT:    v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
241; GFX11-SDAG-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
242; GFX11-SDAG-NEXT:    s_clause 0x1
243; GFX11-SDAG-NEXT:    scratch_load_b64 v[5:6], off, off offset:40
244; GFX11-SDAG-NEXT:    scratch_load_b32 v4, off, off offset:36
245; GFX11-SDAG-NEXT:    s_load_b32 s1, s[4:5], 0x30
246; GFX11-SDAG-NEXT:    scratch_store_b128 off, v[7:10], off
247; GFX11-SDAG-NEXT:    s_clause 0x1
248; GFX11-SDAG-NEXT:    scratch_load_b64 v[1:2], off, off offset:8
249; GFX11-SDAG-NEXT:    scratch_load_b32 v0, off, off offset:4
250; GFX11-SDAG-NEXT:    v_mov_b32_e32 v7, s6
251; GFX11-SDAG-NEXT:    v_mov_b32_e32 v9, s0
252; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
253; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, s1
254; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
255; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
256; GFX11-SDAG-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
257; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
258; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
259; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s6, v6
260; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s7, v7
261; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
262; GFX11-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
263; GFX11-SDAG-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
264; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
265; GFX11-SDAG-NEXT:    s_and_b32 s0, vcc_lo, s0
266; GFX11-SDAG-NEXT:    s_and_saveexec_b32 s0, s0
267; GFX11-SDAG-NEXT:    buffer_load_b32 v8, v9, s[4:7], 0 offen slc dlc
268; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
269; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr9
270; GFX11-SDAG-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
271; GFX11-SDAG-NEXT:    s_cbranch_execnz .LBB0_1
272; GFX11-SDAG-NEXT:  ; %bb.2:
273; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s1
274; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, s8
275; GFX11-SDAG-NEXT:    s_mov_b32 s0, exec_lo
276; GFX11-SDAG-NEXT:  .LBB0_3: ; =>This Inner Loop Header: Depth=1
277; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
278; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
279; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
280; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
281; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
282; GFX11-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
283; GFX11-SDAG-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
284; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
285; GFX11-SDAG-NEXT:    s_and_b32 s0, vcc_lo, s0
286; GFX11-SDAG-NEXT:    s_and_saveexec_b32 s0, s0
287; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
288; GFX11-SDAG-NEXT:    buffer_store_b32 v8, v4, s[4:7], 0 offen glc slc dlc
289; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
290; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr8
291; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr4
292; GFX11-SDAG-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
293; GFX11-SDAG-NEXT:    s_cbranch_execnz .LBB0_3
294; GFX11-SDAG-NEXT:  ; %bb.4:
295; GFX11-SDAG-NEXT:    s_endpgm
296;
297; GFX11-GISEL-LABEL: buffer_nontemporal_load_store:
298; GFX11-GISEL:       ; %bb.0: ; %entry
299; GFX11-GISEL-NEXT:    s_clause 0x1
300; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
301; GFX11-GISEL-NEXT:    s_load_b32 s7, s[4:5], 0x10
302; GFX11-GISEL-NEXT:    s_mov_b32 s9, 0
303; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
304; GFX11-GISEL-NEXT:    s_mov_b32 s10, s9
305; GFX11-GISEL-NEXT:    s_mov_b32 s6, s9
306; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
307; GFX11-GISEL-NEXT:    s_mov_b32 s8, s1
308; GFX11-GISEL-NEXT:    s_mov_b32 s11, s2
309; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s0
310; GFX11-GISEL-NEXT:    s_or_b64 s[0:1], s[8:9], s[10:11]
311; GFX11-GISEL-NEXT:    s_mov_b32 s8, s3
312; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
313; GFX11-GISEL-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
314; GFX11-GISEL-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen slc dlc
315; GFX11-GISEL-NEXT:    s_clause 0x1
316; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x20
317; GFX11-GISEL-NEXT:    s_load_b32 s7, s[4:5], 0x30
318; GFX11-GISEL-NEXT:    s_mov_b32 s4, s9
319; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
320; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, s0
321; GFX11-GISEL-NEXT:    s_mov_b32 s8, s1
322; GFX11-GISEL-NEXT:    s_mov_b32 s5, s2
323; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
324; GFX11-GISEL-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
325; GFX11-GISEL-NEXT:    s_mov_b32 s8, s3
326; GFX11-GISEL-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
327; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
328; GFX11-GISEL-NEXT:    buffer_store_b32 v0, v1, s[4:7], 0 offen glc slc dlc
329; GFX11-GISEL-NEXT:    s_endpgm
330;
331; GFX12-SDAG-LABEL: buffer_nontemporal_load_store:
332; GFX12-SDAG:       ; %bb.0: ; %entry
333; GFX12-SDAG-NEXT:    s_clause 0x2
334; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
335; GFX12-SDAG-NEXT:    s_load_b128 s[8:11], s[4:5], 0x20
336; GFX12-SDAG-NEXT:    s_load_b32 s6, s[4:5], 0x10
337; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
338; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
339; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
340; GFX12-SDAG-NEXT:    v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
341; GFX12-SDAG-NEXT:    v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
342; GFX12-SDAG-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
343; GFX12-SDAG-NEXT:    s_clause 0x1
344; GFX12-SDAG-NEXT:    scratch_load_b64 v[5:6], off, off offset:40
345; GFX12-SDAG-NEXT:    scratch_load_b32 v4, off, off offset:36
346; GFX12-SDAG-NEXT:    s_load_b32 s1, s[4:5], 0x30
347; GFX12-SDAG-NEXT:    scratch_store_b128 off, v[7:10], off
348; GFX12-SDAG-NEXT:    s_clause 0x1
349; GFX12-SDAG-NEXT:    scratch_load_b64 v[1:2], off, off offset:8
350; GFX12-SDAG-NEXT:    scratch_load_b32 v0, off, off offset:4
351; GFX12-SDAG-NEXT:    v_mov_b32_e32 v7, s6
352; GFX12-SDAG-NEXT:    v_mov_b32_e32 v9, s0
353; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
354; GFX12-SDAG-NEXT:    v_mov_b32_e32 v3, s1
355; GFX12-SDAG-NEXT:    s_mov_b32 s1, exec_lo
356; GFX12-SDAG-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
357; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x2
358; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
359; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
360; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s6, v6
361; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s7, v7
362; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
363; GFX12-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
364; GFX12-SDAG-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
365; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
366; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
367; GFX12-SDAG-NEXT:    s_and_b32 s0, vcc_lo, s0
368; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
369; GFX12-SDAG-NEXT:    s_and_saveexec_b32 s0, s0
370; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
371; GFX12-SDAG-NEXT:    buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_NT
372; GFX12-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
373; GFX12-SDAG-NEXT:    ; implicit-def: $vgpr9
374; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
375; GFX12-SDAG-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
376; GFX12-SDAG-NEXT:    s_cbranch_execnz .LBB0_1
377; GFX12-SDAG-NEXT:  ; %bb.2:
378; GFX12-SDAG-NEXT:    s_mov_b32 exec_lo, s1
379; GFX12-SDAG-NEXT:    v_mov_b32_e32 v4, s8
380; GFX12-SDAG-NEXT:    s_mov_b32 s0, exec_lo
381; GFX12-SDAG-NEXT:  .LBB0_3: ; =>This Inner Loop Header: Depth=1
382; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x1
383; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
384; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
385; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
386; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
387; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
388; GFX12-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
389; GFX12-SDAG-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
390; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
391; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
392; GFX12-SDAG-NEXT:    s_and_b32 s0, vcc_lo, s0
393; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
394; GFX12-SDAG-NEXT:    s_and_saveexec_b32 s0, s0
395; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
396; GFX12-SDAG-NEXT:    buffer_store_b32 v8, v4, s[4:7], null offen th:TH_STORE_NT
397; GFX12-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
398; GFX12-SDAG-NEXT:    ; implicit-def: $vgpr8
399; GFX12-SDAG-NEXT:    ; implicit-def: $vgpr4
400; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
401; GFX12-SDAG-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
402; GFX12-SDAG-NEXT:    s_cbranch_execnz .LBB0_3
403; GFX12-SDAG-NEXT:  ; %bb.4:
404; GFX12-SDAG-NEXT:    s_endpgm
405;
406; GFX12-GISEL-LABEL: buffer_nontemporal_load_store:
407; GFX12-GISEL:       ; %bb.0: ; %entry
408; GFX12-GISEL-NEXT:    s_clause 0x1
409; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
410; GFX12-GISEL-NEXT:    s_load_b32 s7, s[4:5], 0x10
411; GFX12-GISEL-NEXT:    s_mov_b32 s9, 0
412; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
413; GFX12-GISEL-NEXT:    s_mov_b32 s10, s9
414; GFX12-GISEL-NEXT:    s_mov_b32 s6, s9
415; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
416; GFX12-GISEL-NEXT:    s_mov_b32 s8, s1
417; GFX12-GISEL-NEXT:    s_mov_b32 s11, s2
418; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
419; GFX12-GISEL-NEXT:    s_or_b64 s[0:1], s[8:9], s[10:11]
420; GFX12-GISEL-NEXT:    s_mov_b32 s8, s3
421; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
422; GFX12-GISEL-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
423; GFX12-GISEL-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT
424; GFX12-GISEL-NEXT:    s_clause 0x1
425; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x20
426; GFX12-GISEL-NEXT:    s_load_b32 s7, s[4:5], 0x30
427; GFX12-GISEL-NEXT:    s_mov_b32 s4, s9
428; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
429; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
430; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s0
431; GFX12-GISEL-NEXT:    s_mov_b32 s8, s1
432; GFX12-GISEL-NEXT:    s_mov_b32 s5, s2
433; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
434; GFX12-GISEL-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
435; GFX12-GISEL-NEXT:    s_mov_b32 s8, s3
436; GFX12-GISEL-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
437; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
438; GFX12-GISEL-NEXT:    buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT
439; GFX12-GISEL-NEXT:    s_endpgm
440entry:
  ; Nontemporal i32 load through the %in pointer.
441  %val = load i32, ptr addrspace(7) %in, !nontemporal !0
  ; Nontemporal store of the loaded value through the %out pointer.
442  store i32 %val, ptr addrspace(7) %out, !nontemporal !0
443  ret void
444}
445
446define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrspace(7) %in, ptr addrspace(7) %out) {
447; GFX9-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
448; GFX9-SDAG:       ; %bb.0: ; %entry
449; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
450; GFX9-SDAG-NEXT:    s_load_dword s11, s[8:9], 0x10
451; GFX9-SDAG-NEXT:    s_mov_b32 s10, 0
452; GFX9-SDAG-NEXT:    s_add_u32 s0, s0, s15
453; GFX9-SDAG-NEXT:    s_mov_b32 s15, s10
454; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
455; GFX9-SDAG-NEXT:    s_mov_b32 s14, s7
456; GFX9-SDAG-NEXT:    s_addc_u32 s1, s1, 0
457; GFX9-SDAG-NEXT:    s_mov_b32 s12, s5
458; GFX9-SDAG-NEXT:    s_or_b64 s[14:15], s[14:15], s[10:11]
459; GFX9-SDAG-NEXT:    s_mov_b32 s13, s6
460; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, s4
461; GFX9-SDAG-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc
462; GFX9-SDAG-NEXT:    s_load_dword s11, s[8:9], 0x30
463; GFX9-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x20
464; GFX9-SDAG-NEXT:    s_mov_b32 s9, s10
465; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
466; GFX9-SDAG-NEXT:    s_mov_b32 s8, s7
467; GFX9-SDAG-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
468; GFX9-SDAG-NEXT:    s_mov_b32 s8, s5
469; GFX9-SDAG-NEXT:    s_mov_b32 s9, s6
470; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, s4
471; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
472; GFX9-SDAG-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen
473; GFX9-SDAG-NEXT:    s_endpgm
474;
475; GFX9-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
476; GFX9-GISEL:       ; %bb.0: ; %entry
477; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
478; GFX9-GISEL-NEXT:    s_load_dword s7, s[8:9], 0x10
479; GFX9-GISEL-NEXT:    s_mov_b32 s11, 0
480; GFX9-GISEL-NEXT:    s_mov_b32 s4, s11
481; GFX9-GISEL-NEXT:    s_mov_b32 s6, s11
482; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
483; GFX9-GISEL-NEXT:    s_mov_b32 s10, s1
484; GFX9-GISEL-NEXT:    s_mov_b32 s5, s2
485; GFX9-GISEL-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
486; GFX9-GISEL-NEXT:    s_mov_b32 s10, s3
487; GFX9-GISEL-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
488; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s0
489; GFX9-GISEL-NEXT:    buffer_load_dword v0, v0, s[4:7], 0 offen glc
490; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
491; GFX9-GISEL-NEXT:    s_load_dword s7, s[8:9], 0x30
492; GFX9-GISEL-NEXT:    s_mov_b32 s4, s11
493; GFX9-GISEL-NEXT:    s_mov_b32 s6, s11
494; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
495; GFX9-GISEL-NEXT:    s_mov_b32 s10, s1
496; GFX9-GISEL-NEXT:    s_mov_b32 s5, s2
497; GFX9-GISEL-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
498; GFX9-GISEL-NEXT:    s_mov_b32 s10, s3
499; GFX9-GISEL-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
500; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s0
501; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
502; GFX9-GISEL-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
503; GFX9-GISEL-NEXT:    s_endpgm
504;
505; GFX940-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
506; GFX940-SDAG:       ; %bb.0: ; %entry
507; GFX940-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
508; GFX940-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x10
509; GFX940-SDAG-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
510; GFX940-SDAG-NEXT:    s_load_dword s7, s[4:5], 0x30
511; GFX940-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
512; GFX940-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
513; GFX940-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
514; GFX940-SDAG-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1
515; GFX940-SDAG-NEXT:    scratch_load_dwordx2 v[10:11], off, off offset:40
516; GFX940-SDAG-NEXT:    scratch_load_dword v4, off, off offset:36
517; GFX940-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
518; GFX940-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
519; GFX940-SDAG-NEXT:    scratch_store_dwordx4 off, v[0:3], off sc0 sc1
520; GFX940-SDAG-NEXT:    scratch_load_dwordx2 v[12:13], off, off offset:8
521; GFX940-SDAG-NEXT:    s_nop 0
522; GFX940-SDAG-NEXT:    scratch_load_dword v0, off, off offset:4
523; GFX940-SDAG-NEXT:    v_mov_b32_e32 v7, s6
524; GFX940-SDAG-NEXT:    v_mov_b32_e32 v3, s7
525; GFX940-SDAG-NEXT:    v_mov_b32_e32 v9, s0
526; GFX940-SDAG-NEXT:    s_mov_b64 s[2:3], exec
527; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(4)
528; GFX940-SDAG-NEXT:    v_mov_b32_e32 v6, v11
529; GFX940-SDAG-NEXT:    v_mov_b32_e32 v5, v10
530; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(1)
531; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, v13
532; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, v12
533; GFX940-SDAG-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
534; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
535; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
536; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s6, v6
537; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s7, v7
538; GFX940-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5]
539; GFX940-SDAG-NEXT:    s_nop 0
540; GFX940-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7]
541; GFX940-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
542; GFX940-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
543; GFX940-SDAG-NEXT:    buffer_load_dword v8, v9, s[4:7], 0 offen sc0 sc1
544; GFX940-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
545; GFX940-SDAG-NEXT:    ; implicit-def: $vgpr9
546; GFX940-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
547; GFX940-SDAG-NEXT:    s_cbranch_execnz .LBB1_1
548; GFX940-SDAG-NEXT:  ; %bb.2:
549; GFX940-SDAG-NEXT:    s_mov_b64 exec, s[2:3]
550; GFX940-SDAG-NEXT:    v_mov_b32_e32 v4, s8
551; GFX940-SDAG-NEXT:    s_mov_b64 s[0:1], exec
552; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
553; GFX940-SDAG-NEXT:  .LBB1_3: ; =>This Inner Loop Header: Depth=1
554; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
555; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
556; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
557; GFX940-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
558; GFX940-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
559; GFX940-SDAG-NEXT:    s_nop 0
560; GFX940-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
561; GFX940-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
562; GFX940-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
563; GFX940-SDAG-NEXT:    buffer_store_dword v8, v4, s[4:7], 0 offen sc0 sc1
564; GFX940-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
565; GFX940-SDAG-NEXT:    ; implicit-def: $vgpr8
566; GFX940-SDAG-NEXT:    ; implicit-def: $vgpr4
567; GFX940-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
568; GFX940-SDAG-NEXT:    s_cbranch_execnz .LBB1_3
569; GFX940-SDAG-NEXT:  ; %bb.4:
570; GFX940-SDAG-NEXT:    s_endpgm
571;
572; GFX940-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
573; GFX940-GISEL:       ; %bb.0: ; %entry
574; GFX940-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
575; GFX940-GISEL-NEXT:    s_load_dword s11, s[4:5], 0x10
576; GFX940-GISEL-NEXT:    s_mov_b32 s7, 0
577; GFX940-GISEL-NEXT:    s_mov_b32 s8, s7
578; GFX940-GISEL-NEXT:    s_mov_b32 s10, s7
579; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
580; GFX940-GISEL-NEXT:    s_mov_b32 s6, s1
581; GFX940-GISEL-NEXT:    s_mov_b32 s9, s2
582; GFX940-GISEL-NEXT:    s_or_b64 s[8:9], s[6:7], s[8:9]
583; GFX940-GISEL-NEXT:    s_mov_b32 s6, s3
584; GFX940-GISEL-NEXT:    s_or_b64 s[10:11], s[6:7], s[10:11]
585; GFX940-GISEL-NEXT:    v_mov_b32_e32 v0, s0
586; GFX940-GISEL-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1
587; GFX940-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
588; GFX940-GISEL-NEXT:    s_load_dword s9, s[4:5], 0x30
589; GFX940-GISEL-NEXT:    s_mov_b32 s4, s7
590; GFX940-GISEL-NEXT:    s_mov_b32 s8, s7
591; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
592; GFX940-GISEL-NEXT:    s_mov_b32 s6, s1
593; GFX940-GISEL-NEXT:    s_mov_b32 s5, s2
594; GFX940-GISEL-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
595; GFX940-GISEL-NEXT:    s_mov_b32 s6, s3
596; GFX940-GISEL-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
597; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, s0
598; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
599; GFX940-GISEL-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1
600; GFX940-GISEL-NEXT:    s_endpgm
601;
602; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
603; GFX10-SDAG:       ; %bb.0: ; %entry
604; GFX10-SDAG-NEXT:    s_clause 0x1
605; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
606; GFX10-SDAG-NEXT:    s_load_dword s11, s[8:9], 0x10
607; GFX10-SDAG-NEXT:    s_mov_b32 s10, 0
608; GFX10-SDAG-NEXT:    s_add_u32 s0, s0, s15
609; GFX10-SDAG-NEXT:    s_mov_b32 s13, s10
610; GFX10-SDAG-NEXT:    s_addc_u32 s1, s1, 0
611; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
612; GFX10-SDAG-NEXT:    v_mov_b32_e32 v0, s4
613; GFX10-SDAG-NEXT:    s_mov_b32 s12, s7
614; GFX10-SDAG-NEXT:    s_or_b64 s[14:15], s[12:13], s[10:11]
615; GFX10-SDAG-NEXT:    s_mov_b32 s12, s5
616; GFX10-SDAG-NEXT:    s_mov_b32 s13, s6
617; GFX10-SDAG-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc dlc
618; GFX10-SDAG-NEXT:    s_clause 0x1
619; GFX10-SDAG-NEXT:    s_load_dword s11, s[8:9], 0x30
620; GFX10-SDAG-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x20
621; GFX10-SDAG-NEXT:    s_mov_b32 s9, s10
622; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
623; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, s4
624; GFX10-SDAG-NEXT:    s_mov_b32 s8, s7
625; GFX10-SDAG-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
626; GFX10-SDAG-NEXT:    s_mov_b32 s8, s5
627; GFX10-SDAG-NEXT:    s_mov_b32 s9, s6
628; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
629; GFX10-SDAG-NEXT:    buffer_store_dword v0, v1, s[8:11], 0 offen
630; GFX10-SDAG-NEXT:    s_endpgm
631;
632; GFX10-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
633; GFX10-GISEL:       ; %bb.0: ; %entry
634; GFX10-GISEL-NEXT:    s_clause 0x1
635; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
636; GFX10-GISEL-NEXT:    s_load_dword s5, s[8:9], 0x10
637; GFX10-GISEL-NEXT:    s_mov_b32 s7, 0
638; GFX10-GISEL-NEXT:    s_mov_b32 s10, s7
639; GFX10-GISEL-NEXT:    s_mov_b32 s4, s7
640; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
641; GFX10-GISEL-NEXT:    s_mov_b32 s6, s1
642; GFX10-GISEL-NEXT:    s_mov_b32 s11, s2
643; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
644; GFX10-GISEL-NEXT:    s_or_b64 s[0:1], s[6:7], s[10:11]
645; GFX10-GISEL-NEXT:    s_mov_b32 s6, s3
646; GFX10-GISEL-NEXT:    s_or_b64 s[2:3], s[6:7], s[4:5]
647; GFX10-GISEL-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc dlc
648; GFX10-GISEL-NEXT:    s_clause 0x1
649; GFX10-GISEL-NEXT:    s_waitcnt_depctr 0xffe3
650; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x20
651; GFX10-GISEL-NEXT:    s_load_dword s11, s[8:9], 0x30
652; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
653; GFX10-GISEL-NEXT:    s_mov_b32 s6, s1
654; GFX10-GISEL-NEXT:    s_mov_b32 s5, s2
655; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s0
656; GFX10-GISEL-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
657; GFX10-GISEL-NEXT:    s_mov_b32 s6, s3
658; GFX10-GISEL-NEXT:    s_or_b64 s[6:7], s[6:7], s[10:11]
659; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
660; GFX10-GISEL-NEXT:    buffer_store_dword v0, v1, s[4:7], 0 offen
661; GFX10-GISEL-NEXT:    s_endpgm
662;
663; GFX11-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
664; GFX11-SDAG:       ; %bb.0: ; %entry
665; GFX11-SDAG-NEXT:    s_clause 0x2
666; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
667; GFX11-SDAG-NEXT:    s_load_b128 s[8:11], s[4:5], 0x20
668; GFX11-SDAG-NEXT:    s_load_b32 s6, s[4:5], 0x10
669; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
670; GFX11-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
671; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
672; GFX11-SDAG-NEXT:    v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
673; GFX11-SDAG-NEXT:    v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
674; GFX11-SDAG-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
675; GFX11-SDAG-NEXT:    s_clause 0x1
676; GFX11-SDAG-NEXT:    scratch_load_b64 v[5:6], off, off offset:40
677; GFX11-SDAG-NEXT:    scratch_load_b32 v4, off, off offset:36
678; GFX11-SDAG-NEXT:    s_load_b32 s1, s[4:5], 0x30
679; GFX11-SDAG-NEXT:    scratch_store_b128 off, v[7:10], off
680; GFX11-SDAG-NEXT:    s_clause 0x1
681; GFX11-SDAG-NEXT:    scratch_load_b64 v[1:2], off, off offset:8
682; GFX11-SDAG-NEXT:    scratch_load_b32 v0, off, off offset:4
683; GFX11-SDAG-NEXT:    v_mov_b32_e32 v7, s6
684; GFX11-SDAG-NEXT:    v_mov_b32_e32 v9, s0
685; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
686; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, s1
687; GFX11-SDAG-NEXT:    s_mov_b32 s1, exec_lo
688; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
689; GFX11-SDAG-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
690; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
691; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
692; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s6, v6
693; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s7, v7
694; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
695; GFX11-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
696; GFX11-SDAG-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
697; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
698; GFX11-SDAG-NEXT:    s_and_b32 s0, vcc_lo, s0
699; GFX11-SDAG-NEXT:    s_and_saveexec_b32 s0, s0
700; GFX11-SDAG-NEXT:    buffer_load_b32 v8, v9, s[4:7], 0 offen glc dlc
701; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
702; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr9
703; GFX11-SDAG-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
704; GFX11-SDAG-NEXT:    s_cbranch_execnz .LBB1_1
705; GFX11-SDAG-NEXT:  ; %bb.2:
706; GFX11-SDAG-NEXT:    s_mov_b32 exec_lo, s1
707; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, s8
708; GFX11-SDAG-NEXT:    s_mov_b32 s0, exec_lo
709; GFX11-SDAG-NEXT:  .LBB1_3: ; =>This Inner Loop Header: Depth=1
710; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
711; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
712; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
713; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
714; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
715; GFX11-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
716; GFX11-SDAG-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
717; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
718; GFX11-SDAG-NEXT:    s_and_b32 s0, vcc_lo, s0
719; GFX11-SDAG-NEXT:    s_and_saveexec_b32 s0, s0
720; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
721; GFX11-SDAG-NEXT:    buffer_store_b32 v8, v4, s[4:7], 0 offen dlc
722; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
723; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr8
724; GFX11-SDAG-NEXT:    ; implicit-def: $vgpr4
725; GFX11-SDAG-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
726; GFX11-SDAG-NEXT:    s_cbranch_execnz .LBB1_3
727; GFX11-SDAG-NEXT:  ; %bb.4:
728; GFX11-SDAG-NEXT:    s_endpgm
729;
730; GFX11-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
731; GFX11-GISEL:       ; %bb.0: ; %entry
732; GFX11-GISEL-NEXT:    s_clause 0x1
733; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
734; GFX11-GISEL-NEXT:    s_load_b32 s7, s[4:5], 0x10
735; GFX11-GISEL-NEXT:    s_mov_b32 s9, 0
736; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
737; GFX11-GISEL-NEXT:    s_mov_b32 s10, s9
738; GFX11-GISEL-NEXT:    s_mov_b32 s6, s9
739; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
740; GFX11-GISEL-NEXT:    s_mov_b32 s8, s1
741; GFX11-GISEL-NEXT:    s_mov_b32 s11, s2
742; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s0
743; GFX11-GISEL-NEXT:    s_or_b64 s[0:1], s[8:9], s[10:11]
744; GFX11-GISEL-NEXT:    s_mov_b32 s8, s3
745; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
746; GFX11-GISEL-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
747; GFX11-GISEL-NEXT:    buffer_load_b32 v0, v0, s[0:3], 0 offen glc dlc
748; GFX11-GISEL-NEXT:    s_clause 0x1
749; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x20
750; GFX11-GISEL-NEXT:    s_load_b32 s7, s[4:5], 0x30
751; GFX11-GISEL-NEXT:    s_mov_b32 s4, s9
752; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
753; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, s0
754; GFX11-GISEL-NEXT:    s_mov_b32 s8, s1
755; GFX11-GISEL-NEXT:    s_mov_b32 s5, s2
756; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
757; GFX11-GISEL-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
758; GFX11-GISEL-NEXT:    s_mov_b32 s8, s3
759; GFX11-GISEL-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
760; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
761; GFX11-GISEL-NEXT:    buffer_store_b32 v0, v1, s[4:7], 0 offen dlc
762; GFX11-GISEL-NEXT:    s_endpgm
763;
764; GFX12-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store:
765; GFX12-SDAG:       ; %bb.0: ; %entry
766; GFX12-SDAG-NEXT:    s_clause 0x2
767; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
768; GFX12-SDAG-NEXT:    s_load_b128 s[8:11], s[4:5], 0x20
769; GFX12-SDAG-NEXT:    s_load_b32 s6, s[4:5], 0x10
770; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
771; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
772; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
773; GFX12-SDAG-NEXT:    v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9
774; GFX12-SDAG-NEXT:    v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11
775; GFX12-SDAG-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
776; GFX12-SDAG-NEXT:    s_clause 0x1
777; GFX12-SDAG-NEXT:    scratch_load_b64 v[5:6], off, off offset:40
778; GFX12-SDAG-NEXT:    scratch_load_b32 v4, off, off offset:36
779; GFX12-SDAG-NEXT:    s_load_b32 s1, s[4:5], 0x30
780; GFX12-SDAG-NEXT:    scratch_store_b128 off, v[7:10], off
781; GFX12-SDAG-NEXT:    s_clause 0x1
782; GFX12-SDAG-NEXT:    scratch_load_b64 v[1:2], off, off offset:8
783; GFX12-SDAG-NEXT:    scratch_load_b32 v0, off, off offset:4
784; GFX12-SDAG-NEXT:    v_mov_b32_e32 v7, s6
785; GFX12-SDAG-NEXT:    v_mov_b32_e32 v9, s0
786; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
787; GFX12-SDAG-NEXT:    v_mov_b32_e32 v3, s1
788; GFX12-SDAG-NEXT:    s_mov_b32 s1, exec_lo
789; GFX12-SDAG-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
790; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x2
791; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
792; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
793; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s6, v6
794; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s7, v7
795; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
796; GFX12-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5]
797; GFX12-SDAG-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[6:7]
798; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
799; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
800; GFX12-SDAG-NEXT:    s_and_b32 s0, vcc_lo, s0
801; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
802; GFX12-SDAG-NEXT:    s_and_saveexec_b32 s0, s0
803; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
804; GFX12-SDAG-NEXT:    buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_NT scope:SCOPE_SYS
805; GFX12-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
806; GFX12-SDAG-NEXT:    ; implicit-def: $vgpr9
807; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
808; GFX12-SDAG-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
809; GFX12-SDAG-NEXT:    s_cbranch_execnz .LBB1_1
810; GFX12-SDAG-NEXT:  ; %bb.2:
811; GFX12-SDAG-NEXT:    s_mov_b32 exec_lo, s1
812; GFX12-SDAG-NEXT:    v_mov_b32_e32 v4, s8
813; GFX12-SDAG-NEXT:    s_mov_b32 s0, exec_lo
814; GFX12-SDAG-NEXT:  .LBB1_3: ; =>This Inner Loop Header: Depth=1
815; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x1
816; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
817; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
818; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
819; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
820; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
821; GFX12-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
822; GFX12-SDAG-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
823; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
824; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
825; GFX12-SDAG-NEXT:    s_and_b32 s0, vcc_lo, s0
826; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
827; GFX12-SDAG-NEXT:    s_and_saveexec_b32 s0, s0
828; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
829; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
830; GFX12-SDAG-NEXT:    buffer_store_b32 v8, v4, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
831; GFX12-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
832; GFX12-SDAG-NEXT:    ; implicit-def: $vgpr8
833; GFX12-SDAG-NEXT:    ; implicit-def: $vgpr4
834; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
835; GFX12-SDAG-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
836; GFX12-SDAG-NEXT:    s_cbranch_execnz .LBB1_3
837; GFX12-SDAG-NEXT:  ; %bb.4:
838; GFX12-SDAG-NEXT:    s_endpgm
839;
840; GFX12-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store:
841; GFX12-GISEL:       ; %bb.0: ; %entry
842; GFX12-GISEL-NEXT:    s_clause 0x1
843; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
844; GFX12-GISEL-NEXT:    s_load_b32 s7, s[4:5], 0x10
845; GFX12-GISEL-NEXT:    s_mov_b32 s9, 0
846; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
847; GFX12-GISEL-NEXT:    s_mov_b32 s10, s9
848; GFX12-GISEL-NEXT:    s_mov_b32 s6, s9
849; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
850; GFX12-GISEL-NEXT:    s_mov_b32 s8, s1
851; GFX12-GISEL-NEXT:    s_mov_b32 s11, s2
852; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
853; GFX12-GISEL-NEXT:    s_or_b64 s[0:1], s[8:9], s[10:11]
854; GFX12-GISEL-NEXT:    s_mov_b32 s8, s3
855; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
856; GFX12-GISEL-NEXT:    s_or_b64 s[2:3], s[8:9], s[6:7]
857; GFX12-GISEL-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT scope:SCOPE_SYS
858; GFX12-GISEL-NEXT:    s_clause 0x1
859; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x20
860; GFX12-GISEL-NEXT:    s_load_b32 s7, s[4:5], 0x30
861; GFX12-GISEL-NEXT:    s_mov_b32 s4, s9
862; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
863; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
864; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s0
865; GFX12-GISEL-NEXT:    s_mov_b32 s8, s1
866; GFX12-GISEL-NEXT:    s_mov_b32 s5, s2
867; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
868; GFX12-GISEL-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
869; GFX12-GISEL-NEXT:    s_mov_b32 s8, s3
870; GFX12-GISEL-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
871; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
872; GFX12-GISEL-NEXT:    buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS
873; GFX12-GISEL-NEXT:    s_endpgm
874entry:
875  %val = load volatile i32, ptr addrspace(7) %in, !nontemporal !0
876  store volatile i32 %val, ptr addrspace(7) %out, !nontemporal !0
877  ret void
878}
879
; Metadata node referenced by the `!nontemporal !0` attachments on the volatile load and store above; it wraps the single constant i32 1.
880!0 = !{i32 1}
881;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
882; GFX10: {{.*}}
883; GFX11: {{.*}}
884; GFX12: {{.*}}
885; GFX9: {{.*}}
886; GFX940: {{.*}}
887