xref: /llvm-project/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2; RUN: llc -O0 -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-O0 %s
3; RUN: llc -mtriple=amdgcn- -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-O3 %s
4
5; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
6
; no_cfg: whole-wave-mode test in a single basic block (no control flow).
; Loads a <2 x float> from the buffer resource %tmp14 and, for each of the two
; 32-bit elements, runs set.inactive(x, 0) -> update.dpp -> add -> wwm. The two
; wwm results are compared and the reduced result is stored back to %tmp14 at
; offset 4. In the expected assembly below, each whole-wave computation sits
; between an "s_or_saveexec_b64 ..., -1" and an "s_mov_b64 exec, ..." restore.
7define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) {
8; GFX9-O0-LABEL: no_cfg:
9; GFX9-O0:       ; %bb.0:
10; GFX9-O0-NEXT:    s_mov_b32 s6, s2
11; GFX9-O0-NEXT:    s_mov_b32 s4, s0
12; GFX9-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
13; GFX9-O0-NEXT:    s_mov_b32 s7, s3
14; GFX9-O0-NEXT:    s_mov_b32 s8, s7
15; GFX9-O0-NEXT:    s_mov_b32 s9, s6
16; GFX9-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
17; GFX9-O0-NEXT:    s_mov_b32 s5, s1
18; GFX9-O0-NEXT:    s_mov_b32 s10, s5
19; GFX9-O0-NEXT:    s_mov_b32 s0, s4
20; GFX9-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
21; GFX9-O0-NEXT:    s_mov_b32 s1, s10
22; GFX9-O0-NEXT:    s_mov_b32 s2, s9
23; GFX9-O0-NEXT:    s_mov_b32 s3, s8
24; GFX9-O0-NEXT:    s_mov_b32 s4, 0
25; GFX9-O0-NEXT:    buffer_load_dwordx2 v[5:6], off, s[0:3], s4
26; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
27; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v5
28; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
29; GFX9-O0-NEXT:    ; implicit-def: $sgpr6_sgpr7
30; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
31; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
32; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[6:7]
33; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
34; GFX9-O0-NEXT:    ; implicit-def: $sgpr6_sgpr7
35; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
36; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s4
37; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[6:7]
38; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s4
39; GFX9-O0-NEXT:    s_nop 1
40; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
41; GFX9-O0-NEXT:    v_add_u32_e64 v0, v0, v2
42; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
43; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
44; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
45; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
46; GFX9-O0-NEXT:    s_nop 1
47; GFX9-O0-NEXT:    v_mov_b32_dpp v0, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
48; GFX9-O0-NEXT:    v_add_u32_e64 v0, v1, v0
49; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
50; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
51; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v3, v4
52; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[6:7]
53; GFX9-O0-NEXT:    s_mov_b32 s5, 1
54; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s5, v3
55; GFX9-O0-NEXT:    s_mov_b32 s5, 2
56; GFX9-O0-NEXT:    v_and_b32_e64 v3, v3, s5
57; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s4 offset:4
58; GFX9-O0-NEXT:    s_endpgm
59;
60; GFX9-O3-LABEL: no_cfg:
61; GFX9-O3:       ; %bb.0:
62; GFX9-O3-NEXT:    buffer_load_dwordx2 v[4:5], off, s[0:3], 0
63; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
64; GFX9-O3-NEXT:    v_mov_b32_e32 v0, 0
65; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
66; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
67; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[4:5]
68; GFX9-O3-NEXT:    v_cndmask_b32_e64 v3, 0, v5, s[4:5]
69; GFX9-O3-NEXT:    s_nop 0
70; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
71; GFX9-O3-NEXT:    v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
72; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
73; GFX9-O3-NEXT:    v_add_u32_e32 v0, v3, v0
74; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
75; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v1
76; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v0
77; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
78; GFX9-O3-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
79; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
80; GFX9-O3-NEXT:    v_and_b32_e32 v4, 2, v4
81; GFX9-O3-NEXT:    buffer_store_dword v4, off, s[0:3], 0 offset:4
82; GFX9-O3-NEXT:    s_endpgm
  ; Fetch both 32-bit inputs with one two-element buffer load and split them.
83  %tmp100 = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %tmp14, i32 0, i32 0, i32 0)
84  %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
85  %tmp102 = extractelement <2 x i32> %tmp101, i32 0
86  %tmp103 = extractelement <2 x i32> %tmp101, i32 1
  ; set.inactive with 0 as the second operand; the expected assembly selects
  ; between 0 and the loaded value using the saved exec mask (v_cndmask).
87  %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0)
88  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0)
89
90
  ; First element: the update.dpp operands 323 / 12 / 15 show up in the expected
  ; assembly as "row_bcast:31 row_mask:0xc bank_mask:0xf".
91  %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false)
92  %tmp121 = add i32 %tmp105, %tmp120
93  %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121)
94
  ; Second element: identical sequence applied to %tmp107.
95  %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false)
96  %tmp136 = add i32 %tmp107, %tmp135
97  %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136)
98
  ; Compare the two wwm results; sext + shl 1 + and 2 turn the i1 into 0 or 2,
  ; which is then stored (as float bits) at offset 4 of the same buffer.
99  %tmp138 = icmp eq i32 %tmp122, %tmp137
100  %tmp139 = sext i1 %tmp138 to i32
101  %tmp140 = shl nsw i32 %tmp139, 1
102  %tmp141 = and i32 %tmp140, 2
103  %tmp145 = bitcast i32 %tmp141 to float
104  call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %tmp145, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0)
105  ret void
106}
107
; cfg: the same whole-wave computation as @no_cfg, split across basic blocks.
; %entry computes the wwm result for element 0 and branches on %arg == 0;
; %if computes the wwm result for element 1; %merge joins the two paths with a
; phi (0 when %if was skipped), compares, and stores at offset 4 of %tmp14.
; The unoptimized expected output additionally spills VGPRs to scratch
; ("Folded Spill"/"Folded Reload") and SGPRs to lanes of v5
; (v_writelane_b32 / v_readlane_b32) across the branch.
108define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
109; GFX9-O0-LABEL: cfg:
110; GFX9-O0:       ; %bb.0: ; %entry
111; GFX9-O0-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
112; GFX9-O0-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
113; GFX9-O0-NEXT:    s_mov_b32 s18, -1
114; GFX9-O0-NEXT:    s_mov_b32 s19, 0xe00000
115; GFX9-O0-NEXT:    s_add_u32 s16, s16, s4
116; GFX9-O0-NEXT:    s_addc_u32 s17, s17, 0
117; GFX9-O0-NEXT:    ; implicit-def: $vgpr5 : SGPR spill to VGPR lane
118; GFX9-O0-NEXT:    v_writelane_b32 v5, s3, 0
119; GFX9-O0-NEXT:    s_mov_b32 s4, s1
120; GFX9-O0-NEXT:    v_readlane_b32 s1, v5, 0
121; GFX9-O0-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
122; GFX9-O0-NEXT:    s_mov_b32 s3, s1
123; GFX9-O0-NEXT:    s_mov_b32 s8, s3
124; GFX9-O0-NEXT:    s_mov_b32 s9, s2
125; GFX9-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
126; GFX9-O0-NEXT:    s_mov_b32 s1, s4
127; GFX9-O0-NEXT:    s_mov_b32 s10, s1
128; GFX9-O0-NEXT:    s_mov_b32 s4, s0
129; GFX9-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
130; GFX9-O0-NEXT:    s_mov_b32 s5, s10
131; GFX9-O0-NEXT:    s_mov_b32 s6, s9
132; GFX9-O0-NEXT:    s_mov_b32 s7, s8
133; GFX9-O0-NEXT:    v_writelane_b32 v5, s2, 1
134; GFX9-O0-NEXT:    v_writelane_b32 v5, s3, 2
135; GFX9-O0-NEXT:    v_writelane_b32 v5, s0, 3
136; GFX9-O0-NEXT:    v_writelane_b32 v5, s1, 4
137; GFX9-O0-NEXT:    s_mov_b32 s0, 0
138; GFX9-O0-NEXT:    s_nop 2
139; GFX9-O0-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], s0
140; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
141; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
142; GFX9-O0-NEXT:    s_nop 0
143; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
144; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
145; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
146; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
147; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s0
148; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[2:3]
149; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s0
150; GFX9-O0-NEXT:    s_nop 1
151; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
152; GFX9-O0-NEXT:    v_add_u32_e64 v1, v1, v2
153; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
154; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
155; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
156; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s0
157; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s0
158; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
159; GFX9-O0-NEXT:    s_mov_b64 s[0:1], exec
160; GFX9-O0-NEXT:    v_writelane_b32 v5, s0, 5
161; GFX9-O0-NEXT:    v_writelane_b32 v5, s1, 6
162; GFX9-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
163; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[16:19], 0 ; 4-byte Folded Spill
164; GFX9-O0-NEXT:    s_mov_b64 exec, s[12:13]
165; GFX9-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
166; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
167; GFX9-O0-NEXT:    s_cbranch_execz .LBB1_2
168; GFX9-O0-NEXT:  ; %bb.1: ; %if
169; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
170; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload
171; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
172; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
173; GFX9-O0-NEXT:    s_or_saveexec_b64 s[0:1], -1
174; GFX9-O0-NEXT:    v_mov_b32_e32 v1, 0
175; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
176; GFX9-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
177; GFX9-O0-NEXT:    s_or_saveexec_b64 s[0:1], -1
178; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
179; GFX9-O0-NEXT:    s_nop 1
180; GFX9-O0-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
181; GFX9-O0-NEXT:    v_add_u32_e64 v1, v2, v1
182; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
183; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
184; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
185; GFX9-O0-NEXT:  .LBB1_2: ; %merge
186; GFX9-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
187; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[16:19], 0 ; 4-byte Folded Reload
188; GFX9-O0-NEXT:    s_mov_b64 exec, s[12:13]
189; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
190; GFX9-O0-NEXT:    v_readlane_b32 s4, v5, 5
191; GFX9-O0-NEXT:    v_readlane_b32 s5, v5, 6
192; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
193; GFX9-O0-NEXT:    v_readlane_b32 s2, v5, 1
194; GFX9-O0-NEXT:    v_readlane_b32 s3, v5, 2
195; GFX9-O0-NEXT:    v_readlane_b32 s0, v5, 3
196; GFX9-O0-NEXT:    v_readlane_b32 s1, v5, 4
197; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
198; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
199; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
200; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v3
201; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
202; GFX9-O0-NEXT:    s_mov_b32 s4, 1
203; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s4, v0
204; GFX9-O0-NEXT:    s_mov_b32 s4, 2
205; GFX9-O0-NEXT:    v_and_b32_e64 v0, v0, s4
206; GFX9-O0-NEXT:    s_mov_b32 s6, s1
207; GFX9-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
208; GFX9-O0-NEXT:    s_mov_b32 s4, s3
209; GFX9-O0-NEXT:    s_mov_b32 s5, s2
210; GFX9-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
211; GFX9-O0-NEXT:    s_mov_b32 s1, s6
212; GFX9-O0-NEXT:    s_mov_b32 s2, s5
213; GFX9-O0-NEXT:    s_mov_b32 s3, s4
214; GFX9-O0-NEXT:    s_mov_b32 s4, 0
215; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s4 offset:4
216; GFX9-O0-NEXT:    s_endpgm
217;
218; GFX9-O3-LABEL: cfg:
219; GFX9-O3:       ; %bb.0: ; %entry
220; GFX9-O3-NEXT:    buffer_load_dwordx2 v[3:4], off, s[0:3], 0
221; GFX9-O3-NEXT:    v_mov_b32_e32 v5, 0
222; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
223; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
224; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
225; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[4:5]
226; GFX9-O3-NEXT:    s_nop 1
227; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
228; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
229; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
230; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
231; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
232; GFX9-O3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
233; GFX9-O3-NEXT:  ; %bb.1: ; %if
234; GFX9-O3-NEXT:    s_or_saveexec_b64 s[6:7], -1
235; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
236; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[6:7]
237; GFX9-O3-NEXT:    s_nop 1
238; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
239; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
240; GFX9-O3-NEXT:    s_mov_b64 exec, s[6:7]
241; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v1
242; GFX9-O3-NEXT:  ; %bb.2: ; %merge
243; GFX9-O3-NEXT:    s_or_b64 exec, exec, s[4:5]
244; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
245; GFX9-O3-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
246; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
247; GFX9-O3-NEXT:    v_and_b32_e32 v0, 2, v0
248; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
249; GFX9-O3-NEXT:    s_endpgm
250entry:
  ; Load both elements up front; only element 0 is consumed in this block.
251  %tmp100 = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %tmp14, i32 0, i32 0, i32 0)
252  %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
253  %tmp102 = extractelement <2 x i32> %tmp101, i32 0
254  %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0)
255
  ; Whole-wave computation for element 0, done before the branch.
256  %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false)
257  %tmp121 = add i32 %tmp105, %tmp120
258  %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121)
259
  ; Branch on the (non-inreg) argument, which the expected assembly reads from v0.
260  %cond = icmp eq i32 %arg, 0
261  br i1 %cond, label %if, label %merge
262if:
  ; Whole-wave computation for element 1, reached only when %arg == 0. The
  ; expected assembly opens a second saveexec/restore pair inside this block.
263  %tmp103 = extractelement <2 x i32> %tmp101, i32 1
264  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0)
265
266  %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false)
267  %tmp136 = add i32 %tmp107, %tmp135
268  %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136)
269  br label %merge
270
271merge:
  ; Join: 0 when coming straight from %entry, otherwise the %if result. The
  ; comparison is then reduced to 0 or 2 and stored at offset 4.
272  %merge_value = phi i32 [ 0, %entry ], [%tmp137, %if ]
273  %tmp138 = icmp eq i32 %tmp122, %merge_value
274  %tmp139 = sext i1 %tmp138 to i32
275  %tmp140 = shl nsw i32 %tmp139, 1
276  %tmp141 = and i32 %tmp140, 2
277  %tmp145 = bitcast i32 %tmp141 to float
278  call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %tmp145, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0)
279  ret void
280}
281
; called: hidden, noinline i32 helper used as the callee of @call.
; Returns (a + a) * a - (a + a). Both run configurations expect the same
; three-instruction add / mul / sub sequence followed by s_setpc_b64 s[30:31].
282define hidden i32 @called(i32 %a) noinline {
283; GFX9-O0-LABEL: called:
284; GFX9-O0:       ; %bb.0:
285; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
286; GFX9-O0-NEXT:    v_add_u32_e64 v1, v0, v0
287; GFX9-O0-NEXT:    v_mul_lo_u32 v0, v1, v0
288; GFX9-O0-NEXT:    v_sub_u32_e64 v0, v0, v1
289; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
290;
291; GFX9-O3-LABEL: called:
292; GFX9-O3:       ; %bb.0:
293; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
294; GFX9-O3-NEXT:    v_add_u32_e32 v1, v0, v0
295; GFX9-O3-NEXT:    v_mul_lo_u32 v0, v1, v0
296; GFX9-O3-NEXT:    v_sub_u32_e32 v0, v0, v1
297; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
  ; %add is reused as both the multiplicand and the subtrahend.
298  %add = add i32 %a, %a
299  %mul = mul i32 %add, %a
300  %sub = sub i32 %mul, %add
301  ret i32 %sub
302}
303
; call: kernel that performs a function call inside a whole-wave computation.
; Computes wwm(called(x) + x) with x = set.inactive(%arg, 0) and stores the i32
; result at offset 4 of the buffer %tmp14. In the expected assembly the
; s_swappc_b64 to @called (addressed via called@rel32) and the following add
; sit between the "s_or_saveexec_b64 ..., -1" and the exec restore.
304define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
305; GFX9-O0-LABEL: call:
306; GFX9-O0:       ; %bb.0:
307; GFX9-O0-NEXT:    s_mov_b32 s32, 0
308; GFX9-O0-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
309; GFX9-O0-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
310; GFX9-O0-NEXT:    s_mov_b32 s26, -1
311; GFX9-O0-NEXT:    s_mov_b32 s27, 0xe00000
312; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
313; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
314; GFX9-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
315; GFX9-O0-NEXT:    ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
316; GFX9-O0-NEXT:    v_writelane_b32 v3, s12, 0
317; GFX9-O0-NEXT:    v_writelane_b32 v3, s13, 1
318; GFX9-O0-NEXT:    s_mov_b32 s14, s10
319; GFX9-O0-NEXT:    s_mov_b32 s13, s9
320; GFX9-O0-NEXT:    s_mov_b32 s12, s8
321; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]
322; GFX9-O0-NEXT:    v_writelane_b32 v3, s4, 2
323; GFX9-O0-NEXT:    v_writelane_b32 v3, s5, 3
324; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
325; GFX9-O0-NEXT:    v_readlane_b32 s2, v3, 0
326; GFX9-O0-NEXT:    v_readlane_b32 s3, v3, 1
327; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
328; GFX9-O0-NEXT:    v_readlane_b32 s0, v3, 2
329; GFX9-O0-NEXT:    v_readlane_b32 s1, v3, 3
330; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
331; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
332; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v0
333; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
334; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
335; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
336; GFX9-O0-NEXT:    s_load_dword s2, s[0:1], 0x34
337; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
338; GFX9-O0-NEXT:    s_mov_b32 s3, s9
339; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
340; GFX9-O0-NEXT:    s_mov_b32 s9, s17
341; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
342; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
343; GFX9-O0-NEXT:    s_mov_b32 s17, s9
344; GFX9-O0-NEXT:    s_mov_b32 s18, s8
345; GFX9-O0-NEXT:    s_mov_b32 s19, s3
346; GFX9-O0-NEXT:    v_writelane_b32 v3, s16, 4
347; GFX9-O0-NEXT:    v_writelane_b32 v3, s17, 5
348; GFX9-O0-NEXT:    v_writelane_b32 v3, s18, 6
349; GFX9-O0-NEXT:    v_writelane_b32 v3, s19, 7
350; GFX9-O0-NEXT:    s_mov_b32 s8, 0
351; GFX9-O0-NEXT:    v_writelane_b32 v3, s8, 8
352; GFX9-O0-NEXT:    ; implicit-def: $sgpr16_sgpr17
353; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s2
354; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
355; GFX9-O0-NEXT:    v_writelane_b32 v3, s2, 9
356; GFX9-O0-NEXT:    v_writelane_b32 v3, s3, 10
357; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s8
358; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v0, s[2:3]
359; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 56
360; GFX9-O0-NEXT:    s_mov_b32 s2, s0
361; GFX9-O0-NEXT:    s_mov_b32 s0, s1
362; GFX9-O0-NEXT:    s_mov_b32 s3, s8
363; GFX9-O0-NEXT:    s_mov_b32 s1, s9
364; GFX9-O0-NEXT:    s_add_u32 s8, s2, s3
365; GFX9-O0-NEXT:    s_addc_u32 s0, s0, s1
366; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
367; GFX9-O0-NEXT:    s_mov_b32 s9, s0
368; GFX9-O0-NEXT:    s_getpc_b64 s[16:17]
369; GFX9-O0-NEXT:    s_add_u32 s16, s16, called@rel32@lo+4
370; GFX9-O0-NEXT:    s_addc_u32 s17, s17, called@rel32@hi+12
371; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[24:25]
372; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[26:27]
373; GFX9-O0-NEXT:    s_mov_b32 s15, 20
374; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v4, s15, v4
375; GFX9-O0-NEXT:    s_mov_b32 s15, 10
376; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v5, s15, v5
377; GFX9-O0-NEXT:    v_or3_b32 v4, v6, v5, v4
378; GFX9-O0-NEXT:    ; implicit-def: $sgpr15
379; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v4
380; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
381; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
382; GFX9-O0-NEXT:    v_readlane_b32 s0, v3, 4
383; GFX9-O0-NEXT:    v_readlane_b32 s1, v3, 5
384; GFX9-O0-NEXT:    v_readlane_b32 s2, v3, 6
385; GFX9-O0-NEXT:    v_readlane_b32 s3, v3, 7
386; GFX9-O0-NEXT:    v_readlane_b32 s6, v3, 9
387; GFX9-O0-NEXT:    v_readlane_b32 s7, v3, 10
388; GFX9-O0-NEXT:    v_readlane_b32 s4, v3, 8
389; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
390; GFX9-O0-NEXT:    v_add_u32_e64 v3, v3, v7
391; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
392; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
393; GFX9-O0-NEXT:    s_nop 0
394; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s4 offset:4
395; GFX9-O0-NEXT:    s_endpgm
396;
397; GFX9-O3-LABEL: call:
398; GFX9-O3:       ; %bb.0:
399; GFX9-O3-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
400; GFX9-O3-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
401; GFX9-O3-NEXT:    s_mov_b32 s26, -1
402; GFX9-O3-NEXT:    s_mov_b32 s27, 0xe00000
403; GFX9-O3-NEXT:    s_add_u32 s24, s24, s11
404; GFX9-O3-NEXT:    s_mov_b32 s32, 0
405; GFX9-O3-NEXT:    s_addc_u32 s25, s25, 0
406; GFX9-O3-NEXT:    s_or_saveexec_b64 s[16:17], -1
407; GFX9-O3-NEXT:    s_mov_b32 s14, s10
408; GFX9-O3-NEXT:    s_mov_b32 s13, s9
409; GFX9-O3-NEXT:    s_mov_b32 s12, s8
410; GFX9-O3-NEXT:    s_mov_b64 s[10:11], s[6:7]
411; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v2
412; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v1
413; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v0
414; GFX9-O3-NEXT:    s_mov_b64 exec, s[16:17]
415; GFX9-O3-NEXT:    s_load_dword s6, s[4:5], 0x34
416; GFX9-O3-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x24
417; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
418; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s6
419; GFX9-O3-NEXT:    s_or_saveexec_b64 s[20:21], -1
420; GFX9-O3-NEXT:    s_add_u32 s8, s4, 56
421; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v3, 20, v3
422; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v4, 10, v4
423; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s[20:21]
424; GFX9-O3-NEXT:    s_addc_u32 s9, s5, 0
425; GFX9-O3-NEXT:    v_or3_b32 v3, v5, v4, v3
426; GFX9-O3-NEXT:    s_mov_b64 s[4:5], s[0:1]
427; GFX9-O3-NEXT:    s_mov_b64 s[6:7], s[2:3]
428; GFX9-O3-NEXT:    s_mov_b64 s[0:1], s[24:25]
429; GFX9-O3-NEXT:    v_mov_b32_e32 v31, v3
430; GFX9-O3-NEXT:    s_mov_b64 s[2:3], s[26:27]
431; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
432; GFX9-O3-NEXT:    s_getpc_b64 s[22:23]
433; GFX9-O3-NEXT:    s_add_u32 s22, s22, called@rel32@lo+4
434; GFX9-O3-NEXT:    s_addc_u32 s23, s23, called@rel32@hi+12
435; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[22:23]
436; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v0
437; GFX9-O3-NEXT:    v_add_u32_e32 v3, v3, v6
438; GFX9-O3-NEXT:    s_mov_b64 exec, s[20:21]
439; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v3
440; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:4
441; GFX9-O3-NEXT:    s_endpgm
442
443
  ; %tmp107 is both the argument passed to @called and an operand of the add
  ; after the call, so it has to stay live across the call.
444  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
445  %tmp134 = call i32 @called(i32 %tmp107)
446  %tmp136 = add i32 %tmp134, %tmp107
447  %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136)
  ; Store the wwm result at offset 4 of the buffer.
448  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %tmp137, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0)
449  ret void
450}
451
; called_i64: noinline i64 counterpart of @called; returns (a + a) * a - (a + a).
; Unlike @called it is not marked hidden. The optimized expected output
; builds the 64-bit operations from 32-bit pieces: v_add_co/v_addc_co for the
; add, two v_mul_lo_u32 plus v_mad_u64_u32 and v_add3_u32 for the multiply, and
; v_sub_co/v_subb_co for the subtract.
452define i64 @called_i64(i64 %a) noinline {
453; GFX9-O0-LABEL: called_i64:
454; GFX9-O0:       ; %bb.0:
455; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
456; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
457; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
458; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
459; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
460; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
461; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
462; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
463; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v2
464; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
465; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
466; GFX9-O0-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v5
467; GFX9-O0-NEXT:    v_addc_co_u32_e64 v0, s[4:5], v0, v1, s[4:5]
468; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
469; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
470; GFX9-O0-NEXT:    s_mov_b32 s4, 32
471; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
472; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
473; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
474; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
475; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
476; GFX9-O0-NEXT:    v_mul_lo_u32 v1, v0, v1
477; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
478; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], s4, v[4:5]
479; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v6
480; GFX9-O0-NEXT:    v_mul_lo_u32 v2, v2, v3
481; GFX9-O0-NEXT:    v_mad_u64_u32 v[6:7], s[6:7], v0, v3, 0
482; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
483; GFX9-O0-NEXT:    v_add3_u32 v0, v0, v1, v2
484; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
485; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
486; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
487; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
488; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
489; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
490; GFX9-O0-NEXT:    v_lshlrev_b64 v[1:2], s4, v[0:1]
491; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
492; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
493; GFX9-O0-NEXT:    s_mov_b32 s5, 0
494; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
495; GFX9-O0-NEXT:    v_mov_b32_e32 v0, 0
496; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
497; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v0
498; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
499; GFX9-O0-NEXT:    v_or_b32_e64 v0, v0, v3
500; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v1
501; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
502; GFX9-O0-NEXT:    v_or_b32_e64 v6, v1, v2
503; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
504; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v0
505; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
506; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
507; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
508; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
509; GFX9-O0-NEXT:    v_sub_co_u32_e64 v1, s[6:7], v1, v3
510; GFX9-O0-NEXT:    v_subb_co_u32_e64 v0, s[6:7], v0, v2, s[6:7]
511; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
512; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
513; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
514; GFX9-O0-NEXT:    v_lshrrev_b64 v[1:2], s4, v[1:2]
515; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
516; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
517;
518; GFX9-O3-LABEL: called_i64:
519; GFX9-O3:       ; %bb.0:
520; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
521; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v0
522; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v1, vcc
523; GFX9-O3-NEXT:    v_mul_lo_u32 v4, v3, v0
524; GFX9-O3-NEXT:    v_mul_lo_u32 v5, v2, v1
525; GFX9-O3-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0
526; GFX9-O3-NEXT:    v_add3_u32 v1, v1, v5, v4
527; GFX9-O3-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
528; GFX9-O3-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
529; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
  ; Same add / mul / sub shape as @called, on 64-bit values.
530  %add = add i64 %a, %a
531  %mul = mul i64 %add, %a
532  %sub = sub i64 %mul, %add
533  ret i64 %sub
534}
535
536define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) {
537; GFX9-O0-LABEL: call_i64:
538; GFX9-O0:       ; %bb.0:
539; GFX9-O0-NEXT:    s_mov_b32 s32, 0
540; GFX9-O0-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
541; GFX9-O0-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
542; GFX9-O0-NEXT:    s_mov_b32 s26, -1
543; GFX9-O0-NEXT:    s_mov_b32 s27, 0xe00000
544; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
545; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
546; GFX9-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
547; GFX9-O0-NEXT:    ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
548; GFX9-O0-NEXT:    v_writelane_b32 v8, s12, 0
549; GFX9-O0-NEXT:    v_writelane_b32 v8, s13, 1
550; GFX9-O0-NEXT:    s_mov_b32 s14, s10
551; GFX9-O0-NEXT:    s_mov_b32 s13, s9
552; GFX9-O0-NEXT:    s_mov_b32 s12, s8
553; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]
554; GFX9-O0-NEXT:    v_writelane_b32 v8, s4, 2
555; GFX9-O0-NEXT:    v_writelane_b32 v8, s5, 3
556; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
557; GFX9-O0-NEXT:    v_readlane_b32 s2, v8, 0
558; GFX9-O0-NEXT:    v_readlane_b32 s3, v8, 1
559; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
560; GFX9-O0-NEXT:    v_readlane_b32 s0, v8, 2
561; GFX9-O0-NEXT:    v_readlane_b32 s1, v8, 3
562; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
563; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
564; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
565; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
566; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
567; GFX9-O0-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x2c
568; GFX9-O0-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
569; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
570; GFX9-O0-NEXT:    s_mov_b32 s8, s19
571; GFX9-O0-NEXT:    s_mov_b32 s9, s18
572; GFX9-O0-NEXT:    s_mov_b32 s15, s17
573; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
574; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
575; GFX9-O0-NEXT:    s_mov_b32 s17, s15
576; GFX9-O0-NEXT:    s_mov_b32 s18, s9
577; GFX9-O0-NEXT:    s_mov_b32 s19, s8
578; GFX9-O0-NEXT:    v_writelane_b32 v8, s16, 4
579; GFX9-O0-NEXT:    v_writelane_b32 v8, s17, 5
580; GFX9-O0-NEXT:    v_writelane_b32 v8, s18, 6
581; GFX9-O0-NEXT:    v_writelane_b32 v8, s19, 7
582; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 0
583; GFX9-O0-NEXT:    s_mov_b32 s15, s9
584; GFX9-O0-NEXT:    s_mov_b32 s16, s3
585; GFX9-O0-NEXT:    ; implicit-def: $sgpr18_sgpr19
586; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s16
587; GFX9-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
588; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s15
589; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v0, s[16:17]
590; GFX9-O0-NEXT:    s_mov_b64 exec, s[16:17]
591; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
592; GFX9-O0-NEXT:    ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3
593; GFX9-O0-NEXT:    ; implicit-def: $sgpr16_sgpr17
594; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s2
595; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
596; GFX9-O0-NEXT:    v_writelane_b32 v8, s2, 8
597; GFX9-O0-NEXT:    v_writelane_b32 v8, s3, 9
598; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s8
599; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v0, s[2:3]
600; GFX9-O0-NEXT:    ; implicit-def: $sgpr2
601; GFX9-O0-NEXT:    ; implicit-def: $sgpr2
602; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
603; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v6
604; GFX9-O0-NEXT:    s_mov_b32 s2, 32
605; GFX9-O0-NEXT:    v_lshrrev_b64 v[11:12], s2, v[9:10]
606; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
607; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 60
608; GFX9-O0-NEXT:    s_mov_b32 s2, s0
609; GFX9-O0-NEXT:    s_mov_b32 s0, s1
610; GFX9-O0-NEXT:    s_mov_b32 s3, s8
611; GFX9-O0-NEXT:    s_mov_b32 s1, s9
612; GFX9-O0-NEXT:    s_add_u32 s8, s2, s3
613; GFX9-O0-NEXT:    s_addc_u32 s0, s0, s1
614; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
615; GFX9-O0-NEXT:    s_mov_b32 s9, s0
616; GFX9-O0-NEXT:    s_getpc_b64 s[0:1]
617; GFX9-O0-NEXT:    s_add_u32 s0, s0, called_i64@gotpcrel32@lo+4
618; GFX9-O0-NEXT:    s_addc_u32 s1, s1, called_i64@gotpcrel32@hi+12
619; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
620; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[24:25]
621; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[26:27]
622; GFX9-O0-NEXT:    s_mov_b32 s15, 20
623; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s15, v3
624; GFX9-O0-NEXT:    s_mov_b32 s15, 10
625; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v4, s15, v4
626; GFX9-O0-NEXT:    v_or3_b32 v3, v5, v4, v3
627; GFX9-O0-NEXT:    ; implicit-def: $sgpr15
628; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v3
629; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
630; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
631; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
632; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
633; GFX9-O0-NEXT:    v_readlane_b32 s0, v8, 4
634; GFX9-O0-NEXT:    v_readlane_b32 s1, v8, 5
635; GFX9-O0-NEXT:    v_readlane_b32 s2, v8, 6
636; GFX9-O0-NEXT:    v_readlane_b32 s3, v8, 7
637; GFX9-O0-NEXT:    v_readlane_b32 s4, v8, 8
638; GFX9-O0-NEXT:    v_readlane_b32 s5, v8, 9
639; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
640; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
641; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
642; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
643; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
644; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v10
645; GFX9-O0-NEXT:    v_add_co_u32_e64 v3, s[6:7], v3, v5
646; GFX9-O0-NEXT:    v_addc_co_u32_e64 v4, s[6:7], v4, v6, s[6:7]
647; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
648; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
649; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
650; GFX9-O0-NEXT:    s_mov_b32 s4, 0
651; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], s4 offset:4
652; GFX9-O0-NEXT:    s_endpgm
653;
654; GFX9-O3-LABEL: call_i64:
655; GFX9-O3:       ; %bb.0:
656; GFX9-O3-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
657; GFX9-O3-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
658; GFX9-O3-NEXT:    s_mov_b32 s26, -1
659; GFX9-O3-NEXT:    s_mov_b32 s27, 0xe00000
660; GFX9-O3-NEXT:    s_add_u32 s24, s24, s11
661; GFX9-O3-NEXT:    s_mov_b32 s32, 0
662; GFX9-O3-NEXT:    s_addc_u32 s25, s25, 0
663; GFX9-O3-NEXT:    s_or_saveexec_b64 s[16:17], -1
664; GFX9-O3-NEXT:    s_mov_b32 s14, s10
665; GFX9-O3-NEXT:    s_mov_b32 s13, s9
666; GFX9-O3-NEXT:    s_mov_b32 s12, s8
667; GFX9-O3-NEXT:    s_mov_b64 s[10:11], s[6:7]
668; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v2
669; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v1
670; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v0
671; GFX9-O3-NEXT:    s_mov_b64 exec, s[16:17]
672; GFX9-O3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
673; GFX9-O3-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x24
674; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
675; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s7
676; GFX9-O3-NEXT:    s_or_saveexec_b64 s[8:9], -1
677; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s[8:9]
678; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
679; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s6
680; GFX9-O3-NEXT:    s_or_saveexec_b64 s[20:21], -1
681; GFX9-O3-NEXT:    s_add_u32 s8, s4, 60
682; GFX9-O3-NEXT:    s_addc_u32 s9, s5, 0
683; GFX9-O3-NEXT:    s_getpc_b64 s[4:5]
684; GFX9-O3-NEXT:    s_add_u32 s4, s4, called_i64@gotpcrel32@lo+4
685; GFX9-O3-NEXT:    s_addc_u32 s5, s5, called_i64@gotpcrel32@hi+12
686; GFX9-O3-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x0
687; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v3, 20, v3
688; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v4, 10, v4
689; GFX9-O3-NEXT:    v_cndmask_b32_e64 v7, 0, v0, s[20:21]
690; GFX9-O3-NEXT:    v_or3_b32 v3, v5, v4, v3
691; GFX9-O3-NEXT:    s_mov_b64 s[4:5], s[0:1]
692; GFX9-O3-NEXT:    s_mov_b64 s[6:7], s[2:3]
693; GFX9-O3-NEXT:    s_mov_b64 s[0:1], s[24:25]
694; GFX9-O3-NEXT:    v_mov_b32_e32 v31, v3
695; GFX9-O3-NEXT:    s_mov_b64 s[2:3], s[26:27]
696; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v7
697; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v6
698; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
699; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[22:23]
700; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v0
701; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v1
702; GFX9-O3-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
703; GFX9-O3-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
704; GFX9-O3-NEXT:    s_mov_b64 exec, s[20:21]
705; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v3
706; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v4
707; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0 offset:4
708; GFX9-O3-NEXT:    s_endpgm
709
710
711
712  %tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)
713  %tmp134 = call i64 @called_i64(i64 %tmp107)
714  %tmp136 = add i64 %tmp134, %tmp107
715  %tmp137 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp136)
716  %tmp138 = bitcast i64 %tmp137 to <2 x i32>
717  call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %tmp138, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0)
718  ret void
719}
720
; Loads a <4 x i32> and a <2 x i32> from %desc at offsets (%index << 5) and
; (%index << 5) | 16, reinterprets the data as three i64 values, and passes
; each one through set.inactive (inactive lanes receive 9223372036854775807,
; i.e. 0x7fffffffffffffff) followed by the deprecated llvm.amdgcn.wwm
; intrinsic. The three results are written back to the same two offsets.
; In the checked output the inactive value shows up as a -1 low half and a
; 0x7fffffff high half selected with v_cndmask under an all-ones exec mask.
721define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
722; GFX9-O0-LABEL: _amdgpu_cs_main:
723; GFX9-O0:       ; %bb.0:
724; GFX9-O0-NEXT:    s_mov_b32 s4, s3
725; GFX9-O0-NEXT:    s_mov_b32 s5, s2
726; GFX9-O0-NEXT:    s_mov_b32 s6, s1
727; GFX9-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
728; GFX9-O0-NEXT:    s_mov_b32 s1, s6
729; GFX9-O0-NEXT:    s_mov_b32 s2, s5
730; GFX9-O0-NEXT:    s_mov_b32 s3, s4
731; GFX9-O0-NEXT:    ; kill: def $sgpr4_sgpr5_sgpr6_sgpr7 killed $sgpr0_sgpr1_sgpr2_sgpr3
732; GFX9-O0-NEXT:    s_mov_b32 s4, 5
733; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s4, v0
734; GFX9-O0-NEXT:    s_mov_b32 s4, 0
735; GFX9-O0-NEXT:    buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen
736; GFX9-O0-NEXT:    buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
737; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
738; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
739; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
740; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
741; GFX9-O0-NEXT:    s_mov_b32 s5, 0x7fffffff
742; GFX9-O0-NEXT:    s_mov_b32 s10, -1
743; GFX9-O0-NEXT:    ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
744; GFX9-O0-NEXT:    s_mov_b32 s11, s5
745; GFX9-O0-NEXT:    s_mov_b32 s8, s11
746; GFX9-O0-NEXT:    ; implicit-def: $sgpr12_sgpr13
747; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
748; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
749; GFX9-O0-NEXT:    s_mov_b32 s5, s10
750; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
751; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
752; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[6:7]
753; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
754; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
755; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
756; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
757; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v2
758; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v3
759; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v13
760; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
761; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
762; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
763; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
764; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
765; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
766; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
767; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[6:7]
768; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
769; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
770; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
771; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
772; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v2
773; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v3
774; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
775; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
776; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
777; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
778; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
779; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
780; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
781; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
782; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
783; GFX9-O0-NEXT:    ; implicit-def: $sgpr8_sgpr9
784; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
785; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[6:7]
786; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
787; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
788; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
789; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
790; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
791; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v3
792; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v10
793; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
794; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
795; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v7
796; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
797; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
798; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
799; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
800; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
801; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
802; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v11
803; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v10
804; GFX9-O0-NEXT:    buffer_store_dwordx4 v[6:9], v0, s[0:3], s4 offen
805; GFX9-O0-NEXT:    buffer_store_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
806; GFX9-O0-NEXT:    s_endpgm
807;
808; GFX9-O3-LABEL: _amdgpu_cs_main:
809; GFX9-O3:       ; %bb.0:
810; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
811; GFX9-O3-NEXT:    buffer_load_dwordx4 v[8:11], v0, s[0:3], 0 offen
812; GFX9-O3-NEXT:    buffer_load_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
813; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
814; GFX9-O3-NEXT:    v_bfrev_b32_e32 v1, -2
815; GFX9-O3-NEXT:    s_waitcnt vmcnt(1)
816; GFX9-O3-NEXT:    v_cndmask_b32_e64 v3, v1, v9, s[4:5]
817; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, -1, v8, s[4:5]
818; GFX9-O3-NEXT:    v_cndmask_b32_e64 v5, v1, v11, s[4:5]
819; GFX9-O3-NEXT:    v_cndmask_b32_e64 v4, -1, v10, s[4:5]
820; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
821; GFX9-O3-NEXT:    v_cndmask_b32_e64 v7, v1, v13, s[4:5]
822; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, -1, v12, s[4:5]
823; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
824; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v2
825; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v4
826; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
827; GFX9-O3-NEXT:    v_mov_b32_e32 v11, v5
828; GFX9-O3-NEXT:    v_mov_b32_e32 v12, v6
829; GFX9-O3-NEXT:    v_mov_b32_e32 v13, v7
830; GFX9-O3-NEXT:    buffer_store_dwordx4 v[8:11], v0, s[0:3], 0 offen
831; GFX9-O3-NEXT:    buffer_store_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
832; GFX9-O3-NEXT:    s_endpgm
; Offset of the 16-byte load/store is %index * 32.
833  %tmp17 = shl i32 %index, 5
834  %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0)
835  %.i0.upto1.bc = bitcast <4 x i32> %tmp18 to <2 x i64>
; The 8-byte load/store sits 16 past the first one.
836  %tmp19 = or i32 %tmp17, 16
837  %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %tmp19, i32 0)
; First i64: element 0 of the <4 x i32> load viewed as <2 x i64>.
838  %.i0.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 0
839  %tmp22 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i0.upto1.extract, i64 9223372036854775807)
840  %tmp97 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp22)
; Second i64: element 1 of the same vector.
841  %.i1.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 1
842  %tmp99 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i1.upto1.extract, i64 9223372036854775807)
843  %tmp174 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp99)
; Third i64: the whole <2 x i32> load.
844  %.i25 = bitcast <2 x i32> %tmp20 to i64
845  %tmp176 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i25, i64 9223372036854775807)
846  %tmp251 = tail call i64 @llvm.amdgcn.wwm.i64(i64 %tmp176)
; Repack the three i64 results as float vectors for the stores.
847  %.cast = bitcast i64 %tmp97 to <2 x float>
848  %.cast6 = bitcast i64 %tmp174 to <2 x float>
849  %.cast7 = bitcast i64 %tmp251 to <2 x float>
850  %tmp254 = shufflevector <2 x float> %.cast, <2 x float> %.cast6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Reuse the <4 x i32> descriptor as a ptr addrspace(8) buffer resource.
851  %desc.int = bitcast <4 x i32> %desc to i128
852  %desc.ptr = inttoptr i128 %desc.int to ptr addrspace(8)
853  tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %tmp254, ptr addrspace(8) %desc.ptr, i32 %tmp17, i32 0, i32 0)
854  tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %.cast7, ptr addrspace(8) %desc.ptr, i32 %tmp19, i32 0, i32 0)
855  ret void
856}
857
858
; Straight-line (single basic block) strict.wwm test. Loads two dwords from
; %tmp14, and for each one: sets inactive lanes to 0, combines the value with a
; DPP-moved copy of itself via add, and wraps the sum in
; llvm.amdgcn.strict.wwm. The two results are compared and an encoding of the
; comparison (2 if equal, otherwise 0) is stored at offset 4.
; The checked output places each DPP move and add between an
; s_or_saveexec_b64 ..., -1 and the s_mov_b64 that restores exec.
859define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
860; GFX9-O0-LABEL: strict_wwm_no_cfg:
861; GFX9-O0:       ; %bb.0:
862; GFX9-O0-NEXT:    s_mov_b32 s6, s2
863; GFX9-O0-NEXT:    s_mov_b32 s4, s0
864; GFX9-O0-NEXT:    ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
865; GFX9-O0-NEXT:    s_mov_b32 s7, s3
866; GFX9-O0-NEXT:    s_mov_b32 s8, s7
867; GFX9-O0-NEXT:    s_mov_b32 s9, s6
868; GFX9-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
869; GFX9-O0-NEXT:    s_mov_b32 s5, s1
870; GFX9-O0-NEXT:    s_mov_b32 s10, s5
871; GFX9-O0-NEXT:    s_mov_b32 s0, s4
872; GFX9-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
873; GFX9-O0-NEXT:    s_mov_b32 s1, s10
874; GFX9-O0-NEXT:    s_mov_b32 s2, s9
875; GFX9-O0-NEXT:    s_mov_b32 s3, s8
876; GFX9-O0-NEXT:    s_mov_b32 s4, 0
877; GFX9-O0-NEXT:    buffer_load_dwordx2 v[5:6], off, s[0:3], s4
878; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
879; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v5
880; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v6
881; GFX9-O0-NEXT:    ; implicit-def: $sgpr6_sgpr7
882; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
883; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
884; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[6:7]
885; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
886; GFX9-O0-NEXT:    ; implicit-def: $sgpr6_sgpr7
887; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
888; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s4
889; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[6:7]
890; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s4
891; GFX9-O0-NEXT:    s_nop 1
892; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
893; GFX9-O0-NEXT:    v_add_u32_e64 v0, v0, v2
894; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
895; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
896; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
897; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s4
898; GFX9-O0-NEXT:    s_nop 1
899; GFX9-O0-NEXT:    v_mov_b32_dpp v0, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
900; GFX9-O0-NEXT:    v_add_u32_e64 v0, v1, v0
901; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
902; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v0
903; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v3, v4
904; GFX9-O0-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[6:7]
905; GFX9-O0-NEXT:    s_mov_b32 s5, 1
906; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s5, v3
907; GFX9-O0-NEXT:    s_mov_b32 s5, 2
908; GFX9-O0-NEXT:    v_and_b32_e64 v3, v3, s5
909; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s4 offset:4
910; GFX9-O0-NEXT:    s_endpgm
911;
912; GFX9-O3-LABEL: strict_wwm_no_cfg:
913; GFX9-O3:       ; %bb.0:
914; GFX9-O3-NEXT:    buffer_load_dwordx2 v[4:5], off, s[0:3], 0
915; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
916; GFX9-O3-NEXT:    v_mov_b32_e32 v0, 0
917; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
918; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
919; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[4:5]
920; GFX9-O3-NEXT:    v_cndmask_b32_e64 v3, 0, v5, s[4:5]
921; GFX9-O3-NEXT:    s_nop 0
922; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
923; GFX9-O3-NEXT:    v_mov_b32_dpp v0, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
924; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
925; GFX9-O3-NEXT:    v_add_u32_e32 v0, v3, v0
926; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
927; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v1
928; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v0
929; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
930; GFX9-O3-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
931; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v4, 1, v4
932; GFX9-O3-NEXT:    v_and_b32_e32 v4, 2, v4
933; GFX9-O3-NEXT:    buffer_store_dword v4, off, s[0:3], 0 offset:4
934; GFX9-O3-NEXT:    s_endpgm
; Load both input dwords at offset 0 and split them.
935  %tmp100 = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %tmp14, i32 0, i32 0, i32 0)
936  %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
937  %tmp102 = extractelement <2 x i32> %tmp101, i32 0
938  %tmp103 = extractelement <2 x i32> %tmp101, i32 1
; Inactive lanes contribute 0 to the sums below.
939  %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0)
940  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0)
941
942
; First value: dpp_ctrl 323 with row_mask 12 and bank_mask 15 is emitted as
; "row_bcast:31 row_mask:0xc bank_mask:0xf" in the checked output above.
943  %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false)
944  %tmp121 = add i32 %tmp105, %tmp120
945  %tmp122 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp121)
946
; Second value: same sequence on the other dword.
947  %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false)
948  %tmp136 = add i32 %tmp107, %tmp135
949  %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136)
950
; sext gives 0 or -1; after shl 1 and the mask with 2 the stored word is 2 when
; the two results are equal and 0 otherwise.
951  %tmp138 = icmp eq i32 %tmp122, %tmp137
952  %tmp139 = sext i1 %tmp138 to i32
953  %tmp140 = shl nsw i32 %tmp139, 1
954  %tmp141 = and i32 %tmp140, 2
955  %tmp145 = bitcast i32 %tmp141 to float
956  call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %tmp145, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0)
957  ret void
958}
959
; Control-flow variant of strict_wwm_no_cfg. The first strict.wwm computation
; runs in the entry block; the second runs only in the %if block, which is
; taken when %arg == 0. The %merge block joins the second result with a
; constant 0 from the fall-through edge, compares it with the first result,
; and stores 2 (equal) or 0 (not equal) at offset 4 of %tmp14.
; In the checked output the %if block opens its own s_or_saveexec_b64 ..., -1
; region nested inside the exec mask narrowed for the branch.
960define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
961; GFX9-O0-LABEL: strict_wwm_cfg:
962; GFX9-O0:       ; %bb.0: ; %entry
963; GFX9-O0-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
964; GFX9-O0-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
965; GFX9-O0-NEXT:    s_mov_b32 s18, -1
966; GFX9-O0-NEXT:    s_mov_b32 s19, 0xe00000
967; GFX9-O0-NEXT:    s_add_u32 s16, s16, s4
968; GFX9-O0-NEXT:    s_addc_u32 s17, s17, 0
969; GFX9-O0-NEXT:    ; implicit-def: $vgpr5 : SGPR spill to VGPR lane
970; GFX9-O0-NEXT:    v_writelane_b32 v5, s3, 0
971; GFX9-O0-NEXT:    s_mov_b32 s4, s1
972; GFX9-O0-NEXT:    v_readlane_b32 s1, v5, 0
973; GFX9-O0-NEXT:    ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
974; GFX9-O0-NEXT:    s_mov_b32 s3, s1
975; GFX9-O0-NEXT:    s_mov_b32 s8, s3
976; GFX9-O0-NEXT:    s_mov_b32 s9, s2
977; GFX9-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
978; GFX9-O0-NEXT:    s_mov_b32 s1, s4
979; GFX9-O0-NEXT:    s_mov_b32 s10, s1
980; GFX9-O0-NEXT:    s_mov_b32 s4, s0
981; GFX9-O0-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
982; GFX9-O0-NEXT:    s_mov_b32 s5, s10
983; GFX9-O0-NEXT:    s_mov_b32 s6, s9
984; GFX9-O0-NEXT:    s_mov_b32 s7, s8
985; GFX9-O0-NEXT:    v_writelane_b32 v5, s2, 1
986; GFX9-O0-NEXT:    v_writelane_b32 v5, s3, 2
987; GFX9-O0-NEXT:    v_writelane_b32 v5, s0, 3
988; GFX9-O0-NEXT:    v_writelane_b32 v5, s1, 4
989; GFX9-O0-NEXT:    s_mov_b32 s0, 0
990; GFX9-O0-NEXT:    s_nop 2
991; GFX9-O0-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], s0
992; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
993; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
994; GFX9-O0-NEXT:    s_nop 0
995; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
996; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
997; GFX9-O0-NEXT:    ; implicit-def: $sgpr2_sgpr3
998; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
999; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s0
1000; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[2:3]
1001; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s0
1002; GFX9-O0-NEXT:    s_nop 1
1003; GFX9-O0-NEXT:    v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
1004; GFX9-O0-NEXT:    v_add_u32_e64 v1, v1, v2
1005; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
1006; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
1007; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
1008; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v0, s0
1009; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s0
1010; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
1011; GFX9-O0-NEXT:    s_mov_b64 s[0:1], exec
1012; GFX9-O0-NEXT:    v_writelane_b32 v5, s0, 5
1013; GFX9-O0-NEXT:    v_writelane_b32 v5, s1, 6
1014; GFX9-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
1015; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[16:19], 0 ; 4-byte Folded Spill
1016; GFX9-O0-NEXT:    s_mov_b64 exec, s[12:13]
1017; GFX9-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
1018; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
1019; GFX9-O0-NEXT:    s_cbranch_execz .LBB8_2
1020; GFX9-O0-NEXT:  ; %bb.1: ; %if
1021; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
1022; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[16:19], 0 offset:16 ; 4-byte Folded Reload
1023; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
1024; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
1025; GFX9-O0-NEXT:    s_or_saveexec_b64 s[0:1], -1
1026; GFX9-O0-NEXT:    v_mov_b32_e32 v1, 0
1027; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
1028; GFX9-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
1029; GFX9-O0-NEXT:    s_or_saveexec_b64 s[0:1], -1
1030; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
1031; GFX9-O0-NEXT:    s_nop 1
1032; GFX9-O0-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1033; GFX9-O0-NEXT:    v_add_u32_e64 v1, v2, v1
1034; GFX9-O0-NEXT:    s_mov_b64 exec, s[0:1]
1035; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
1036; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
1037; GFX9-O0-NEXT:  .LBB8_2: ; %merge
1038; GFX9-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
1039; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[16:19], 0 ; 4-byte Folded Reload
1040; GFX9-O0-NEXT:    s_mov_b64 exec, s[12:13]
1041; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
1042; GFX9-O0-NEXT:    v_readlane_b32 s4, v5, 5
1043; GFX9-O0-NEXT:    v_readlane_b32 s5, v5, 6
1044; GFX9-O0-NEXT:    s_or_b64 exec, exec, s[4:5]
1045; GFX9-O0-NEXT:    v_readlane_b32 s2, v5, 1
1046; GFX9-O0-NEXT:    v_readlane_b32 s3, v5, 2
1047; GFX9-O0-NEXT:    v_readlane_b32 s0, v5, 3
1048; GFX9-O0-NEXT:    v_readlane_b32 s1, v5, 4
1049; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
1050; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
1051; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
1052; GFX9-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v3
1053; GFX9-O0-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
1054; GFX9-O0-NEXT:    s_mov_b32 s4, 1
1055; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s4, v0
1056; GFX9-O0-NEXT:    s_mov_b32 s4, 2
1057; GFX9-O0-NEXT:    v_and_b32_e64 v0, v0, s4
1058; GFX9-O0-NEXT:    s_mov_b32 s6, s1
1059; GFX9-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
1060; GFX9-O0-NEXT:    s_mov_b32 s4, s3
1061; GFX9-O0-NEXT:    s_mov_b32 s5, s2
1062; GFX9-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
1063; GFX9-O0-NEXT:    s_mov_b32 s1, s6
1064; GFX9-O0-NEXT:    s_mov_b32 s2, s5
1065; GFX9-O0-NEXT:    s_mov_b32 s3, s4
1066; GFX9-O0-NEXT:    s_mov_b32 s4, 0
1067; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s4 offset:4
1068; GFX9-O0-NEXT:    s_endpgm
1069;
1070; GFX9-O3-LABEL: strict_wwm_cfg:
1071; GFX9-O3:       ; %bb.0: ; %entry
1072; GFX9-O3-NEXT:    buffer_load_dwordx2 v[3:4], off, s[0:3], 0
1073; GFX9-O3-NEXT:    v_mov_b32_e32 v5, 0
1074; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
1075; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
1076; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
1077; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[4:5]
1078; GFX9-O3-NEXT:    s_nop 1
1079; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1080; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
1081; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
1082; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
1083; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1084; GFX9-O3-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1085; GFX9-O3-NEXT:  ; %bb.1: ; %if
1086; GFX9-O3-NEXT:    s_or_saveexec_b64 s[6:7], -1
1087; GFX9-O3-NEXT:    v_mov_b32_e32 v1, 0
1088; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[6:7]
1089; GFX9-O3-NEXT:    s_nop 1
1090; GFX9-O3-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1091; GFX9-O3-NEXT:    v_add_u32_e32 v1, v2, v1
1092; GFX9-O3-NEXT:    s_mov_b64 exec, s[6:7]
1093; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v1
1094; GFX9-O3-NEXT:  ; %bb.2: ; %merge
1095; GFX9-O3-NEXT:    s_or_b64 exec, exec, s[4:5]
1096; GFX9-O3-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
1097; GFX9-O3-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
1098; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1099; GFX9-O3-NEXT:    v_and_b32_e32 v0, 2, v0
1100; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
1101; GFX9-O3-NEXT:    s_endpgm
1102entry:
; Load both dwords up front; only element 0 is consumed in this block.
1103  %tmp100 = call <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8) %tmp14, i32 0, i32 0, i32 0)
1104  %tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
1105  %tmp102 = extractelement <2 x i32> %tmp101, i32 0
1106  %tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0)
1107
; First strict.wwm value, computed unconditionally.
1108  %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false)
1109  %tmp121 = add i32 %tmp105, %tmp120
1110  %tmp122 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp121)
1111
; Branch on the per-lane argument: %if runs only where %arg == 0.
1112  %cond = icmp eq i32 %arg, 0
1113  br i1 %cond, label %if, label %merge
1114if:
; Second strict.wwm value, computed from element 1 inside the conditional.
1115  %tmp103 = extractelement <2 x i32> %tmp101, i32 1
1116  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0)
1117
1118  %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false)
1119  %tmp136 = add i32 %tmp107, %tmp135
1120  %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136)
1121  br label %merge
1122
1123merge:
; Lanes that skipped %if see 0 here instead of the second strict.wwm value.
1124  %merge_value = phi i32 [ 0, %entry ], [%tmp137, %if ]
; Encode the comparison as 2 (equal) or 0 (not equal) and store it at offset 4.
1125  %tmp138 = icmp eq i32 %tmp122, %merge_value
1126  %tmp139 = sext i1 %tmp138 to i32
1127  %tmp140 = shl nsw i32 %tmp139, 1
1128  %tmp141 = and i32 %tmp140, 2
1129  %tmp145 = bitcast i32 %tmp141 to float
1130  call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %tmp145, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0)
1131  ret void
1132}
1133
; Callee used by strict_wwm_call. Marked noinline so the call remains a real
; call (s_swappc_b64 in the caller's checked output).
; Returns (%a + %a) * %a - (%a + %a).
1134define hidden i32 @strict_wwm_called(i32 %a) noinline {
1135; GFX9-O0-LABEL: strict_wwm_called:
1136; GFX9-O0:       ; %bb.0:
1137; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1138; GFX9-O0-NEXT:    v_add_u32_e64 v1, v0, v0
1139; GFX9-O0-NEXT:    v_mul_lo_u32 v0, v1, v0
1140; GFX9-O0-NEXT:    v_sub_u32_e64 v0, v0, v1
1141; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
1142;
1143; GFX9-O3-LABEL: strict_wwm_called:
1144; GFX9-O3:       ; %bb.0:
1145; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1146; GFX9-O3-NEXT:    v_add_u32_e32 v1, v0, v0
1147; GFX9-O3-NEXT:    v_mul_lo_u32 v0, v1, v0
1148; GFX9-O3-NEXT:    v_sub_u32_e32 v0, v0, v1
1149; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
1150  %add = add i32 %a, %a
1151  %mul = mul i32 %add, %a
1152  %sub = sub i32 %mul, %add
1153  ret i32 %sub
1154}
1155
; Kernel that makes a function call whose argument and result both feed a
; strict.wwm computation: %arg has its inactive lanes set to 0, is passed to
; strict_wwm_called, the call result is added to the same set.inactive value,
; and the sum is wrapped in llvm.amdgcn.strict.wwm before being stored at
; offset 4 of %tmp14.
; In the checked output the s_swappc_b64 call and the following add sit
; between the s_or_saveexec_b64 ..., -1 and the s_mov_b64 that restores exec.
1156define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
1157; GFX9-O0-LABEL: strict_wwm_call:
1158; GFX9-O0:       ; %bb.0:
1159; GFX9-O0-NEXT:    s_mov_b32 s32, 0
1160; GFX9-O0-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
1161; GFX9-O0-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
1162; GFX9-O0-NEXT:    s_mov_b32 s26, -1
1163; GFX9-O0-NEXT:    s_mov_b32 s27, 0xe00000
1164; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
1165; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
1166; GFX9-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
1167; GFX9-O0-NEXT:    ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
1168; GFX9-O0-NEXT:    v_writelane_b32 v3, s12, 0
1169; GFX9-O0-NEXT:    v_writelane_b32 v3, s13, 1
1170; GFX9-O0-NEXT:    s_mov_b32 s14, s10
1171; GFX9-O0-NEXT:    s_mov_b32 s13, s9
1172; GFX9-O0-NEXT:    s_mov_b32 s12, s8
1173; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]
1174; GFX9-O0-NEXT:    v_writelane_b32 v3, s4, 2
1175; GFX9-O0-NEXT:    v_writelane_b32 v3, s5, 3
1176; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
1177; GFX9-O0-NEXT:    v_readlane_b32 s2, v3, 0
1178; GFX9-O0-NEXT:    v_readlane_b32 s3, v3, 1
1179; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
1180; GFX9-O0-NEXT:    v_readlane_b32 s0, v3, 2
1181; GFX9-O0-NEXT:    v_readlane_b32 s1, v3, 3
1182; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
1183; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v1
1184; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v0
1185; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
1186; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
1187; GFX9-O0-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x2c
1188; GFX9-O0-NEXT:    s_load_dword s2, s[0:1], 0x34
1189; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
1190; GFX9-O0-NEXT:    s_mov_b32 s3, s9
1191; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
1192; GFX9-O0-NEXT:    s_mov_b32 s9, s17
1193; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
1194; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
1195; GFX9-O0-NEXT:    s_mov_b32 s17, s9
1196; GFX9-O0-NEXT:    s_mov_b32 s18, s8
1197; GFX9-O0-NEXT:    s_mov_b32 s19, s3
1198; GFX9-O0-NEXT:    v_writelane_b32 v3, s16, 4
1199; GFX9-O0-NEXT:    v_writelane_b32 v3, s17, 5
1200; GFX9-O0-NEXT:    v_writelane_b32 v3, s18, 6
1201; GFX9-O0-NEXT:    v_writelane_b32 v3, s19, 7
1202; GFX9-O0-NEXT:    s_mov_b32 s8, 0
1203; GFX9-O0-NEXT:    v_writelane_b32 v3, s8, 8
1204; GFX9-O0-NEXT:    ; implicit-def: $sgpr16_sgpr17
1205; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s2
1206; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
1207; GFX9-O0-NEXT:    v_writelane_b32 v3, s2, 9
1208; GFX9-O0-NEXT:    v_writelane_b32 v3, s3, 10
1209; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s8
1210; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v0, s[2:3]
1211; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 56
1212; GFX9-O0-NEXT:    s_mov_b32 s2, s0
1213; GFX9-O0-NEXT:    s_mov_b32 s0, s1
1214; GFX9-O0-NEXT:    s_mov_b32 s3, s8
1215; GFX9-O0-NEXT:    s_mov_b32 s1, s9
1216; GFX9-O0-NEXT:    s_add_u32 s8, s2, s3
1217; GFX9-O0-NEXT:    s_addc_u32 s0, s0, s1
1218; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
1219; GFX9-O0-NEXT:    s_mov_b32 s9, s0
1220; GFX9-O0-NEXT:    s_getpc_b64 s[16:17]
1221; GFX9-O0-NEXT:    s_add_u32 s16, s16, strict_wwm_called@rel32@lo+4
1222; GFX9-O0-NEXT:    s_addc_u32 s17, s17, strict_wwm_called@rel32@hi+12
1223; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[24:25]
1224; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[26:27]
1225; GFX9-O0-NEXT:    s_mov_b32 s15, 20
1226; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v4, s15, v4
1227; GFX9-O0-NEXT:    s_mov_b32 s15, 10
1228; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v5, s15, v5
1229; GFX9-O0-NEXT:    v_or3_b32 v4, v6, v5, v4
1230; GFX9-O0-NEXT:    ; implicit-def: $sgpr15
1231; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v4
1232; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
1233; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
1234; GFX9-O0-NEXT:    v_readlane_b32 s0, v3, 4
1235; GFX9-O0-NEXT:    v_readlane_b32 s1, v3, 5
1236; GFX9-O0-NEXT:    v_readlane_b32 s2, v3, 6
1237; GFX9-O0-NEXT:    v_readlane_b32 s3, v3, 7
1238; GFX9-O0-NEXT:    v_readlane_b32 s6, v3, 9
1239; GFX9-O0-NEXT:    v_readlane_b32 s7, v3, 10
1240; GFX9-O0-NEXT:    v_readlane_b32 s4, v3, 8
1241; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
1242; GFX9-O0-NEXT:    v_add_u32_e64 v3, v3, v7
1243; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
1244; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
1245; GFX9-O0-NEXT:    s_nop 0
1246; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[0:3], s4 offset:4
1247; GFX9-O0-NEXT:    s_endpgm
1248;
1249; GFX9-O3-LABEL: strict_wwm_call:
1250; GFX9-O3:       ; %bb.0:
1251; GFX9-O3-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
1252; GFX9-O3-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
1253; GFX9-O3-NEXT:    s_mov_b32 s26, -1
1254; GFX9-O3-NEXT:    s_mov_b32 s27, 0xe00000
1255; GFX9-O3-NEXT:    s_add_u32 s24, s24, s11
1256; GFX9-O3-NEXT:    s_mov_b32 s32, 0
1257; GFX9-O3-NEXT:    s_addc_u32 s25, s25, 0
1258; GFX9-O3-NEXT:    s_or_saveexec_b64 s[16:17], -1
1259; GFX9-O3-NEXT:    s_mov_b32 s14, s10
1260; GFX9-O3-NEXT:    s_mov_b32 s13, s9
1261; GFX9-O3-NEXT:    s_mov_b32 s12, s8
1262; GFX9-O3-NEXT:    s_mov_b64 s[10:11], s[6:7]
1263; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v2
1264; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v1
1265; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v0
1266; GFX9-O3-NEXT:    s_mov_b64 exec, s[16:17]
1267; GFX9-O3-NEXT:    s_load_dword s6, s[4:5], 0x34
1268; GFX9-O3-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x24
1269; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
1270; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s6
1271; GFX9-O3-NEXT:    s_or_saveexec_b64 s[20:21], -1
1272; GFX9-O3-NEXT:    s_add_u32 s8, s4, 56
1273; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v3, 20, v3
1274; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v4, 10, v4
1275; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s[20:21]
1276; GFX9-O3-NEXT:    s_addc_u32 s9, s5, 0
1277; GFX9-O3-NEXT:    v_or3_b32 v3, v5, v4, v3
1278; GFX9-O3-NEXT:    s_mov_b64 s[4:5], s[0:1]
1279; GFX9-O3-NEXT:    s_mov_b64 s[6:7], s[2:3]
1280; GFX9-O3-NEXT:    s_mov_b64 s[0:1], s[24:25]
1281; GFX9-O3-NEXT:    v_mov_b32_e32 v31, v3
1282; GFX9-O3-NEXT:    s_mov_b64 s[2:3], s[26:27]
1283; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
1284; GFX9-O3-NEXT:    s_getpc_b64 s[22:23]
1285; GFX9-O3-NEXT:    s_add_u32 s22, s22, strict_wwm_called@rel32@lo+4
1286; GFX9-O3-NEXT:    s_addc_u32 s23, s23, strict_wwm_called@rel32@hi+12
1287; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[22:23]
1288; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v0
1289; GFX9-O3-NEXT:    v_add_u32_e32 v3, v3, v6
1290; GFX9-O3-NEXT:    s_mov_b64 exec, s[20:21]
1291; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v3
1292; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:4
1293; GFX9-O3-NEXT:    s_endpgm
1294
1295
; Inactive lanes pass 0 to the callee.
1296  %tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
1297  %tmp134 = call i32 @strict_wwm_called(i32 %tmp107)
; The set.inactive value is live across the call and is reused in the add.
1298  %tmp136 = add i32 %tmp134, %tmp107
1299  %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136)
1300  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %tmp137, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0)
1301  ret void
1302}
1303
; Callee for @strict_wwm_call_i64: returns (%a + %a) * %a - (%a + %a) as i64.
; It carries the noinline attribute, so the call in the caller stays a call.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them instead of editing them by hand.
1304define i64 @strict_wwm_called_i64(i64 %a) noinline {
1305; GFX9-O0-LABEL: strict_wwm_called_i64:
1306; GFX9-O0:       ; %bb.0:
1307; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1308; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
1309; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
1310; GFX9-O0-NEXT:    ; implicit-def: $sgpr4
1311; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
1312; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
1313; GFX9-O0-NEXT:    ; implicit-def: $sgpr4_sgpr5
1314; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
1315; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v2
1316; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
1317; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
1318; GFX9-O0-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v5
1319; GFX9-O0-NEXT:    v_addc_co_u32_e64 v0, s[4:5], v0, v1, s[4:5]
1320; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
1321; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
1322; GFX9-O0-NEXT:    s_mov_b32 s4, 32
1323; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
1324; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
1325; GFX9-O0-NEXT:    v_lshrrev_b64 v[0:1], s4, v[0:1]
1326; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
1327; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v4
1328; GFX9-O0-NEXT:    v_mul_lo_u32 v1, v0, v1
1329; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
1330; GFX9-O0-NEXT:    v_lshrrev_b64 v[6:7], s4, v[4:5]
1331; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v6
1332; GFX9-O0-NEXT:    v_mul_lo_u32 v2, v2, v3
1333; GFX9-O0-NEXT:    v_mad_u64_u32 v[6:7], s[6:7], v0, v3, 0
1334; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
1335; GFX9-O0-NEXT:    v_add3_u32 v0, v0, v1, v2
1336; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
1337; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
1338; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
1339; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
1340; GFX9-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
1341; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v2
1342; GFX9-O0-NEXT:    v_lshlrev_b64 v[1:2], s4, v[0:1]
1343; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
1344; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
1345; GFX9-O0-NEXT:    s_mov_b32 s5, 0
1346; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
1347; GFX9-O0-NEXT:    v_mov_b32_e32 v0, 0
1348; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
1349; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v0
1350; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
1351; GFX9-O0-NEXT:    v_or_b32_e64 v0, v0, v3
1352; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v1
1353; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
1354; GFX9-O0-NEXT:    v_or_b32_e64 v6, v1, v2
1355; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
1356; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v0
1357; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
1358; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v4
1359; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
1360; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v5
1361; GFX9-O0-NEXT:    v_sub_co_u32_e64 v1, s[6:7], v1, v3
1362; GFX9-O0-NEXT:    v_subb_co_u32_e64 v0, s[6:7], v0, v2, s[6:7]
1363; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
1364; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
1365; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
1366; GFX9-O0-NEXT:    v_lshrrev_b64 v[1:2], s4, v[1:2]
1367; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec
1368; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
1369;
1370; GFX9-O3-LABEL: strict_wwm_called_i64:
1371; GFX9-O3:       ; %bb.0:
1372; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1373; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v0
1374; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v1, vcc
1375; GFX9-O3-NEXT:    v_mul_lo_u32 v4, v3, v0
1376; GFX9-O3-NEXT:    v_mul_lo_u32 v5, v2, v1
1377; GFX9-O3-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0
1378; GFX9-O3-NEXT:    v_add3_u32 v1, v1, v5, v4
1379; GFX9-O3-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
1380; GFX9-O3-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
1381; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
; IR under test: %add = %a + %a, %mul = %add * %a, %sub = %mul - %add.
1382  %add = add i64 %a, %a
1383  %mul = mul i64 %add, %a
1384  %sub = sub i64 %mul, %add
1385  ret i64 %sub
1386}
1387
; Kernel that calls @strict_wwm_called_i64 on the result of
; llvm.amdgcn.set.inactive.i64(%arg, 0), adds the call result to that same
; value, and passes the sum through llvm.amdgcn.strict.wwm.i64 before storing
; it to the buffer %tmp14.
; In both check sequences the s_swappc_b64 call and the following 64-bit add
; sit between an "s_or_saveexec_b64 ..., -1" and the "s_mov_b64 exec, ..." that
; restores the saved mask.
1388define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %arg) {
1389; GFX9-O0-LABEL: strict_wwm_call_i64:
1390; GFX9-O0:       ; %bb.0:
1391; GFX9-O0-NEXT:    s_mov_b32 s32, 0
1392; GFX9-O0-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
1393; GFX9-O0-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
1394; GFX9-O0-NEXT:    s_mov_b32 s26, -1
1395; GFX9-O0-NEXT:    s_mov_b32 s27, 0xe00000
1396; GFX9-O0-NEXT:    s_add_u32 s24, s24, s11
1397; GFX9-O0-NEXT:    s_addc_u32 s25, s25, 0
1398; GFX9-O0-NEXT:    s_or_saveexec_b64 s[12:13], -1
1399; GFX9-O0-NEXT:    ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
1400; GFX9-O0-NEXT:    v_writelane_b32 v8, s12, 0
1401; GFX9-O0-NEXT:    v_writelane_b32 v8, s13, 1
1402; GFX9-O0-NEXT:    s_mov_b32 s14, s10
1403; GFX9-O0-NEXT:    s_mov_b32 s13, s9
1404; GFX9-O0-NEXT:    s_mov_b32 s12, s8
1405; GFX9-O0-NEXT:    s_mov_b64 s[10:11], s[6:7]
1406; GFX9-O0-NEXT:    v_writelane_b32 v8, s4, 2
1407; GFX9-O0-NEXT:    v_writelane_b32 v8, s5, 3
1408; GFX9-O0-NEXT:    s_mov_b64 s[6:7], s[2:3]
1409; GFX9-O0-NEXT:    v_readlane_b32 s2, v8, 0
1410; GFX9-O0-NEXT:    v_readlane_b32 s3, v8, 1
1411; GFX9-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
1412; GFX9-O0-NEXT:    v_readlane_b32 s0, v8, 2
1413; GFX9-O0-NEXT:    v_readlane_b32 s1, v8, 3
1414; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v2
1415; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
1416; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v0
1417; GFX9-O0-NEXT:    s_mov_b64 exec, s[2:3]
1418; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
1419; GFX9-O0-NEXT:    s_load_dwordx2 s[18:19], s[0:1], 0x2c
1420; GFX9-O0-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1421; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
1422; GFX9-O0-NEXT:    s_mov_b32 s8, s19
1423; GFX9-O0-NEXT:    s_mov_b32 s9, s18
1424; GFX9-O0-NEXT:    s_mov_b32 s15, s17
1425; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
1426; GFX9-O0-NEXT:    ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
1427; GFX9-O0-NEXT:    s_mov_b32 s17, s15
1428; GFX9-O0-NEXT:    s_mov_b32 s18, s9
1429; GFX9-O0-NEXT:    s_mov_b32 s19, s8
1430; GFX9-O0-NEXT:    v_writelane_b32 v8, s16, 4
1431; GFX9-O0-NEXT:    v_writelane_b32 v8, s17, 5
1432; GFX9-O0-NEXT:    v_writelane_b32 v8, s18, 6
1433; GFX9-O0-NEXT:    v_writelane_b32 v8, s19, 7
1434; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 0
1435; GFX9-O0-NEXT:    s_mov_b32 s15, s9
1436; GFX9-O0-NEXT:    s_mov_b32 s16, s3
1437; GFX9-O0-NEXT:    ; implicit-def: $sgpr18_sgpr19
1438; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s16
1439; GFX9-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
1440; GFX9-O0-NEXT:    v_mov_b32_e32 v6, s15
1441; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v0, s[16:17]
1442; GFX9-O0-NEXT:    s_mov_b64 exec, s[16:17]
1443; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
1444; GFX9-O0-NEXT:    ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3
1445; GFX9-O0-NEXT:    ; implicit-def: $sgpr16_sgpr17
1446; GFX9-O0-NEXT:    v_mov_b32_e32 v0, s2
1447; GFX9-O0-NEXT:    s_or_saveexec_b64 s[2:3], -1
1448; GFX9-O0-NEXT:    v_writelane_b32 v8, s2, 8
1449; GFX9-O0-NEXT:    v_writelane_b32 v8, s3, 9
1450; GFX9-O0-NEXT:    v_mov_b32_e32 v7, s8
1451; GFX9-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v0, s[2:3]
1452; GFX9-O0-NEXT:    ; implicit-def: $sgpr2
1453; GFX9-O0-NEXT:    ; implicit-def: $sgpr2
1454; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
1455; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v6
1456; GFX9-O0-NEXT:    s_mov_b32 s2, 32
1457; GFX9-O0-NEXT:    v_lshrrev_b64 v[11:12], s2, v[9:10]
1458; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
1459; GFX9-O0-NEXT:    s_mov_b64 s[8:9], 60
1460; GFX9-O0-NEXT:    s_mov_b32 s2, s0
1461; GFX9-O0-NEXT:    s_mov_b32 s0, s1
1462; GFX9-O0-NEXT:    s_mov_b32 s3, s8
1463; GFX9-O0-NEXT:    s_mov_b32 s1, s9
1464; GFX9-O0-NEXT:    s_add_u32 s8, s2, s3
1465; GFX9-O0-NEXT:    s_addc_u32 s0, s0, s1
1466; GFX9-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
1467; GFX9-O0-NEXT:    s_mov_b32 s9, s0
1468; GFX9-O0-NEXT:    s_getpc_b64 s[0:1]
1469; GFX9-O0-NEXT:    s_add_u32 s0, s0, strict_wwm_called_i64@gotpcrel32@lo+4
1470; GFX9-O0-NEXT:    s_addc_u32 s1, s1, strict_wwm_called_i64@gotpcrel32@hi+12
1471; GFX9-O0-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
1472; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[24:25]
1473; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[26:27]
1474; GFX9-O0-NEXT:    s_mov_b32 s15, 20
1475; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v3, s15, v3
1476; GFX9-O0-NEXT:    s_mov_b32 s15, 10
1477; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v4, s15, v4
1478; GFX9-O0-NEXT:    v_or3_b32 v3, v5, v4, v3
1479; GFX9-O0-NEXT:    ; implicit-def: $sgpr15
1480; GFX9-O0-NEXT:    v_mov_b32_e32 v31, v3
1481; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v7
1482; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v6
1483; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
1484; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
1485; GFX9-O0-NEXT:    v_readlane_b32 s0, v8, 4
1486; GFX9-O0-NEXT:    v_readlane_b32 s1, v8, 5
1487; GFX9-O0-NEXT:    v_readlane_b32 s2, v8, 6
1488; GFX9-O0-NEXT:    v_readlane_b32 s3, v8, 7
1489; GFX9-O0-NEXT:    v_readlane_b32 s4, v8, 8
1490; GFX9-O0-NEXT:    v_readlane_b32 s5, v8, 9
1491; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
1492; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v1
1493; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
1494; GFX9-O0-NEXT:    ; implicit-def: $sgpr6
1495; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
1496; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v10
1497; GFX9-O0-NEXT:    v_add_co_u32_e64 v3, s[6:7], v3, v5
1498; GFX9-O0-NEXT:    v_addc_co_u32_e64 v4, s[6:7], v4, v6, s[6:7]
1499; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
1500; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v3
1501; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v4
1502; GFX9-O0-NEXT:    s_mov_b32 s4, 0
1503; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], s4 offset:4
1504; GFX9-O0-NEXT:    s_endpgm
1505;
1506; GFX9-O3-LABEL: strict_wwm_call_i64:
1507; GFX9-O3:       ; %bb.0:
1508; GFX9-O3-NEXT:    s_mov_b32 s24, SCRATCH_RSRC_DWORD0
1509; GFX9-O3-NEXT:    s_mov_b32 s25, SCRATCH_RSRC_DWORD1
1510; GFX9-O3-NEXT:    s_mov_b32 s26, -1
1511; GFX9-O3-NEXT:    s_mov_b32 s27, 0xe00000
1512; GFX9-O3-NEXT:    s_add_u32 s24, s24, s11
1513; GFX9-O3-NEXT:    s_mov_b32 s32, 0
1514; GFX9-O3-NEXT:    s_addc_u32 s25, s25, 0
1515; GFX9-O3-NEXT:    s_or_saveexec_b64 s[16:17], -1
1516; GFX9-O3-NEXT:    s_mov_b32 s14, s10
1517; GFX9-O3-NEXT:    s_mov_b32 s13, s9
1518; GFX9-O3-NEXT:    s_mov_b32 s12, s8
1519; GFX9-O3-NEXT:    s_mov_b64 s[10:11], s[6:7]
1520; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v2
1521; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v1
1522; GFX9-O3-NEXT:    v_mov_b32_e32 v5, v0
1523; GFX9-O3-NEXT:    s_mov_b64 exec, s[16:17]
1524; GFX9-O3-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1525; GFX9-O3-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x24
1526; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
1527; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s7
1528; GFX9-O3-NEXT:    s_or_saveexec_b64 s[8:9], -1
1529; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, 0, v0, s[8:9]
1530; GFX9-O3-NEXT:    s_mov_b64 exec, s[8:9]
1531; GFX9-O3-NEXT:    v_mov_b32_e32 v0, s6
1532; GFX9-O3-NEXT:    s_or_saveexec_b64 s[20:21], -1
1533; GFX9-O3-NEXT:    s_add_u32 s8, s4, 60
1534; GFX9-O3-NEXT:    s_addc_u32 s9, s5, 0
1535; GFX9-O3-NEXT:    s_getpc_b64 s[4:5]
1536; GFX9-O3-NEXT:    s_add_u32 s4, s4, strict_wwm_called_i64@gotpcrel32@lo+4
1537; GFX9-O3-NEXT:    s_addc_u32 s5, s5, strict_wwm_called_i64@gotpcrel32@hi+12
1538; GFX9-O3-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x0
1539; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v3, 20, v3
1540; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v4, 10, v4
1541; GFX9-O3-NEXT:    v_cndmask_b32_e64 v7, 0, v0, s[20:21]
1542; GFX9-O3-NEXT:    v_or3_b32 v3, v5, v4, v3
1543; GFX9-O3-NEXT:    s_mov_b64 s[4:5], s[0:1]
1544; GFX9-O3-NEXT:    s_mov_b64 s[6:7], s[2:3]
1545; GFX9-O3-NEXT:    s_mov_b64 s[0:1], s[24:25]
1546; GFX9-O3-NEXT:    v_mov_b32_e32 v31, v3
1547; GFX9-O3-NEXT:    s_mov_b64 s[2:3], s[26:27]
1548; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v7
1549; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v6
1550; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
1551; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[22:23]
1552; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v0
1553; GFX9-O3-NEXT:    v_mov_b32_e32 v4, v1
1554; GFX9-O3-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
1555; GFX9-O3-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
1556; GFX9-O3-NEXT:    s_mov_b64 exec, s[20:21]
1557; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v3
1558; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v4
1559; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[16:19], 0 offset:4
1560; GFX9-O3-NEXT:    s_endpgm
1561
1562
1563
; IR under test. set.inactive takes %arg and the constant 0; the checks match
; this as v_cndmask_b32 selecting between 0 and each half of the loaded
; argument. The result is stored as <2 x i32> with offset operand 4, matched
; above as "buffer_store_dwordx2 ... offset:4".
1564  %tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)
1565  %tmp134 = call i64 @strict_wwm_called_i64(i64 %tmp107)
1566  %tmp136 = add i64 %tmp134, %tmp107
1567  %tmp137 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp136)
1568  %tmp138 = bitcast i64 %tmp137 to <2 x i32>
1569  call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %tmp138, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0)
1570  ret void
1571}
1572
; amdgpu_cs entry point with no calls: loads six dwords through %desc (a
; <4 x i32> load at offset %index << 5 and a <2 x i32> load at that offset
; or'ed with 16), treats them as three i64 values, passes each one through
; llvm.amdgcn.set.inactive.i64 with 9223372036854775807 (0x7fffffffffffffff)
; and then llvm.amdgcn.strict.wwm.i64, and stores the results back at the same
; two offsets.
; The checks match the 64-bit inactive constant as two 32-bit halves: -1 for
; the low dword and 0x7fffffff for the high dword (materialised with
; "v_bfrev_b32_e32 v1, -2" in the GFX9-O3 sequence).
1573define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
1574; GFX9-O0-LABEL: strict_wwm_amdgpu_cs_main:
1575; GFX9-O0:       ; %bb.0:
1576; GFX9-O0-NEXT:    s_mov_b32 s4, s3
1577; GFX9-O0-NEXT:    s_mov_b32 s5, s2
1578; GFX9-O0-NEXT:    s_mov_b32 s6, s1
1579; GFX9-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
1580; GFX9-O0-NEXT:    s_mov_b32 s1, s6
1581; GFX9-O0-NEXT:    s_mov_b32 s2, s5
1582; GFX9-O0-NEXT:    s_mov_b32 s3, s4
1583; GFX9-O0-NEXT:    ; kill: def $sgpr4_sgpr5_sgpr6_sgpr7 killed $sgpr0_sgpr1_sgpr2_sgpr3
1584; GFX9-O0-NEXT:    s_mov_b32 s4, 5
1585; GFX9-O0-NEXT:    v_lshlrev_b32_e64 v0, s4, v0
1586; GFX9-O0-NEXT:    s_mov_b32 s4, 0
1587; GFX9-O0-NEXT:    buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen
1588; GFX9-O0-NEXT:    buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
1589; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
1590; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11
1591; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
1592; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
1593; GFX9-O0-NEXT:    s_mov_b32 s5, 0x7fffffff
1594; GFX9-O0-NEXT:    s_mov_b32 s10, -1
1595; GFX9-O0-NEXT:    ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
1596; GFX9-O0-NEXT:    s_mov_b32 s11, s5
1597; GFX9-O0-NEXT:    s_mov_b32 s8, s11
1598; GFX9-O0-NEXT:    ; implicit-def: $sgpr12_sgpr13
1599; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
1600; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
1601; GFX9-O0-NEXT:    s_mov_b32 s5, s10
1602; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
1603; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
1604; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[6:7]
1605; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
1606; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
1607; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
1608; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
1609; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v2
1610; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v3
1611; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v13
1612; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v14
1613; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
1614; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
1615; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
1616; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
1617; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
1618; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
1619; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[6:7]
1620; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
1621; GFX9-O0-NEXT:    ; implicit-def: $sgpr9
1622; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
1623; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
1624; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v2
1625; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v3
1626; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
1627; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
1628; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
1629; GFX9-O0-NEXT:    ; implicit-def: $sgpr10_sgpr11
1630; GFX9-O0-NEXT:    v_mov_b32_e32 v1, s8
1631; GFX9-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
1632; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
1633; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
1634; GFX9-O0-NEXT:    s_or_saveexec_b64 s[6:7], -1
1635; GFX9-O0-NEXT:    ; implicit-def: $sgpr8_sgpr9
1636; GFX9-O0-NEXT:    v_mov_b32_e32 v2, s5
1637; GFX9-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[6:7]
1638; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
1639; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
1640; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
1641; GFX9-O0-NEXT:    s_mov_b64 exec, s[6:7]
1642; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v2
1643; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v3
1644; GFX9-O0-NEXT:    v_mov_b32_e32 v12, v10
1645; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v9
1646; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
1647; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v7
1648; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
1649; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
1650; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
1651; GFX9-O0-NEXT:    ; implicit-def: $sgpr5
1652; GFX9-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
1653; GFX9-O0-NEXT:    v_mov_b32_e32 v7, v12
1654; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v11
1655; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v10
1656; GFX9-O0-NEXT:    buffer_store_dwordx4 v[6:9], v0, s[0:3], s4 offen
1657; GFX9-O0-NEXT:    buffer_store_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
1658; GFX9-O0-NEXT:    s_endpgm
1659;
1660; GFX9-O3-LABEL: strict_wwm_amdgpu_cs_main:
1661; GFX9-O3:       ; %bb.0:
1662; GFX9-O3-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
1663; GFX9-O3-NEXT:    buffer_load_dwordx4 v[8:11], v0, s[0:3], 0 offen
1664; GFX9-O3-NEXT:    buffer_load_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
1665; GFX9-O3-NEXT:    s_or_saveexec_b64 s[4:5], -1
1666; GFX9-O3-NEXT:    v_bfrev_b32_e32 v1, -2
1667; GFX9-O3-NEXT:    s_waitcnt vmcnt(1)
1668; GFX9-O3-NEXT:    v_cndmask_b32_e64 v3, v1, v9, s[4:5]
1669; GFX9-O3-NEXT:    v_cndmask_b32_e64 v2, -1, v8, s[4:5]
1670; GFX9-O3-NEXT:    v_cndmask_b32_e64 v5, v1, v11, s[4:5]
1671; GFX9-O3-NEXT:    v_cndmask_b32_e64 v4, -1, v10, s[4:5]
1672; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
1673; GFX9-O3-NEXT:    v_cndmask_b32_e64 v7, v1, v13, s[4:5]
1674; GFX9-O3-NEXT:    v_cndmask_b32_e64 v6, -1, v12, s[4:5]
1675; GFX9-O3-NEXT:    s_mov_b64 exec, s[4:5]
1676; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v2
1677; GFX9-O3-NEXT:    v_mov_b32_e32 v10, v4
1678; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v3
1679; GFX9-O3-NEXT:    v_mov_b32_e32 v11, v5
1680; GFX9-O3-NEXT:    v_mov_b32_e32 v12, v6
1681; GFX9-O3-NEXT:    v_mov_b32_e32 v13, v7
1682; GFX9-O3-NEXT:    buffer_store_dwordx4 v[8:11], v0, s[0:3], 0 offen
1683; GFX9-O3-NEXT:    buffer_store_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
1684; GFX9-O3-NEXT:    s_endpgm
; IR under test.
; %tmp17 / %tmp19 are the two load/store offsets (%index << 5, and that | 16).
; %tmp97, %tmp174 and %tmp251 are the three strict.wwm results; the first two
; are concatenated by the shufflevector into the <4 x float> store at %tmp17,
; and the third is stored as <2 x float> at %tmp19.
; %desc is reinterpreted as a ptr addrspace(8) via i128 for the stores.
1685  %tmp17 = shl i32 %index, 5
1686  %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0)
1687  %.i0.upto1.bc = bitcast <4 x i32> %tmp18 to <2 x i64>
1688  %tmp19 = or i32 %tmp17, 16
1689  %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %tmp19, i32 0)
1690  %.i0.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 0
1691  %tmp22 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i0.upto1.extract, i64 9223372036854775807)
1692  %tmp97 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp22)
1693  %.i1.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 1
1694  %tmp99 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i1.upto1.extract, i64 9223372036854775807)
1695  %tmp174 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp99)
1696  %.i25 = bitcast <2 x i32> %tmp20 to i64
1697  %tmp176 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i25, i64 9223372036854775807)
1698  %tmp251 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp176)
1699  %.cast = bitcast i64 %tmp97 to <2 x float>
1700  %.cast6 = bitcast i64 %tmp174 to <2 x float>
1701  %.cast7 = bitcast i64 %tmp251 to <2 x float>
1702  %tmp254 = shufflevector <2 x float> %.cast, <2 x float> %.cast6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1703  %desc.int = bitcast <4 x i32> %desc to i128
1704  %desc.ptr = inttoptr i128 %desc.int to ptr addrspace(8)
1705  tail call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %tmp254, ptr addrspace(8) %desc.ptr, i32 %tmp17, i32 0, i32 0)
1706  tail call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %.cast7, ptr addrspace(8)%desc.ptr, i32 %tmp19, i32 0, i32 0)
1707  ret void
1708}
1709
; Declarations of the AMDGPU intrinsics for the test functions in this file.
; The llvm.amdgcn.wwm.* entries are the deprecated spelling of
; llvm.amdgcn.strict.wwm.* (see the NOTE near the top of the file).
1710declare i32 @llvm.amdgcn.strict.wwm.i32(i32)
1711declare i64 @llvm.amdgcn.strict.wwm.i64(i64)
1712declare i32 @llvm.amdgcn.wwm.i32(i32)
1713declare i64 @llvm.amdgcn.wwm.i64(i64)
1714declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
1715declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64)
1716declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1)
1717declare <2 x float> @llvm.amdgcn.raw.ptr.buffer.load.v2f32(ptr addrspace(8), i32, i32, i32)
1718declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32)
1719declare void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32, ptr addrspace(8), i32, i32, i32)
1720declare void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32>, ptr addrspace(8), i32, i32, i32)
1721declare void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32)
1722declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32)
1723declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
1724declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)
1725
; Module flag: sets "amdhsa_code_object_version" to 500. The leading i32 1 is
; the flag's merge behavior (Error, per the LLVM LangRef module-flags table).
1726!llvm.module.flags = !{!0}
1727!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
1728