Lines Matching +full:0 +full:x1c8
9 define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
11 ; CI: ; %bb.0:
12 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
13 ; CI-NEXT: s_mov_b32 s3, 0xf000
14 ; CI-NEXT: s_mov_b32 s2, 0
16 ; CI-NEXT: v_mov_b32_e32 v1, 0
17 ; CI-NEXT: s_waitcnt lgkmcnt(0)
18 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
20 ; CI-NEXT: s_waitcnt vmcnt(0)
25 ; GFX9: ; %bb.0:
26 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
28 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
29 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
30 ; GFX9-NEXT: s_waitcnt vmcnt(0)
36 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
39 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
44 define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
46 ; CI: ; %bb.0:
47 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
48 ; CI-NEXT: s_mov_b32 s3, 0xf000
49 ; CI-NEXT: s_mov_b32 s2, 0
51 ; CI-NEXT: v_mov_b32_e32 v1, 0
52 ; CI-NEXT: s_waitcnt lgkmcnt(0)
53 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
54 ; CI-NEXT: s_waitcnt vmcnt(0)
55 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
56 ; CI-NEXT: s_waitcnt vmcnt(0)
62 ; GFX9: ; %bb.0:
63 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
65 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
66 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
67 ; GFX9-NEXT: s_waitcnt vmcnt(0)
68 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
69 ; GFX9-NEXT: s_waitcnt vmcnt(0)
73 %in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %x.i
74 %in.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1
75 %val0 = load volatile float, ptr addrspace(1) %in.gep.0, align 4
77 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
80 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
85 define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
87 ; CI: ; %bb.0:
88 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
89 ; CI-NEXT: s_mov_b32 s7, 0xf000
90 ; CI-NEXT: s_mov_b32 s6, 0
92 ; CI-NEXT: v_mov_b32_e32 v1, 0
93 ; CI-NEXT: s_waitcnt lgkmcnt(0)
94 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
95 ; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
97 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
98 ; CI-NEXT: s_waitcnt vmcnt(0)
99 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc
100 ; CI-NEXT: s_waitcnt vmcnt(0)
107 ; GFX9: ; %bb.0:
108 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
110 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
111 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
112 ; GFX9-NEXT: s_waitcnt vmcnt(0)
114 ; GFX9-NEXT: s_waitcnt vmcnt(0)
123 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
126 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
131 define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
133 ; CI: ; %bb.0:
134 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
135 ; CI-NEXT: s_mov_b32 s7, 0xf000
136 ; CI-NEXT: s_mov_b32 s6, 0
138 ; CI-NEXT: v_mov_b32_e32 v1, 0
139 ; CI-NEXT: s_waitcnt lgkmcnt(0)
140 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
141 ; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
143 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
144 ; CI-NEXT: s_waitcnt vmcnt(0)
145 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 glc
146 ; CI-NEXT: s_waitcnt vmcnt(0)
153 ; GFX9: ; %bb.0:
154 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
156 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
157 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
158 ; GFX9-NEXT: s_waitcnt vmcnt(0)
160 ; GFX9-NEXT: s_waitcnt vmcnt(0)
169 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
172 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
182 define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
184 ; CI: ; %bb.0:
185 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
186 ; CI-NEXT: s_mov_b32 s3, 0xf000
187 ; CI-NEXT: s_mov_b32 s2, 0
189 ; CI-NEXT: v_mov_b32_e32 v2, 0
190 ; CI-NEXT: s_waitcnt lgkmcnt(0)
191 ; CI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[0:3], 0 addr64 glc
192 ; CI-NEXT: s_waitcnt vmcnt(0)
193 ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64 offset:8 glc
194 ; CI-NEXT: s_waitcnt vmcnt(0)
201 ; GFX9: ; %bb.0:
202 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
207 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
208 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] glc
209 ; GFX9-NEXT: s_waitcnt vmcnt(0)
210 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
211 ; GFX9-NEXT: s_waitcnt vmcnt(0)
215 %in.gep.0 = getelementptr <2 x float>, ptr addrspace(1) %in, i32 %x.i
216 %in.gep.1 = getelementptr <2 x float>, ptr addrspace(1) %in.gep.0, i32 1
217 %val0 = load volatile <2 x float>, ptr addrspace(1) %in.gep.0, align 8
219 %val0.0 = extractelement <2 x float> %val0, i32 0
221 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
222 store float %val0.0, ptr addrspace(3) %arrayidx0, align 4
224 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
229 define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
231 ; CI: ; %bb.0:
232 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
233 ; CI-NEXT: s_mov_b32 s3, 0xf000
234 ; CI-NEXT: s_mov_b32 s2, 0
236 ; CI-NEXT: v_mov_b32_e32 v2, 0
237 ; CI-NEXT: s_waitcnt lgkmcnt(0)
238 ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[1:2], s[0:3], 0 addr64
241 ; CI-NEXT: s_waitcnt vmcnt(0)
246 ; GFX9: ; %bb.0:
247 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
250 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
251 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1]
252 ; GFX9-NEXT: s_waitcnt vmcnt(0)
258 %val0 = extractelement <2 x float> %val, i32 0
260 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
263 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
268 define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
270 ; CI: ; %bb.0:
271 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
272 ; CI-NEXT: s_mov_b32 s3, 0xf000
273 ; CI-NEXT: s_mov_b32 s2, 0
275 ; CI-NEXT: v_mov_b32_e32 v2, 0
276 ; CI-NEXT: s_waitcnt lgkmcnt(0)
277 ; CI-NEXT: buffer_load_dwordx4 v[1:4], v[1:2], s[0:3], 0 addr64
280 ; CI-NEXT: s_waitcnt vmcnt(0)
285 ; GFX9: ; %bb.0:
286 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
289 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
290 ; GFX9-NEXT: global_load_dwordx4 v[1:4], v1, s[0:1]
291 ; GFX9-NEXT: s_waitcnt vmcnt(0)
297 %val0 = extractelement <4 x float> %val, i32 0
299 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
302 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
307 define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
309 ; CI: ; %bb.0:
310 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
311 ; CI-NEXT: s_mov_b32 s3, 0xf000
312 ; CI-NEXT: s_mov_b32 s2, 0
314 ; CI-NEXT: v_mov_b32_e32 v1, 0
315 ; CI-NEXT: s_waitcnt lgkmcnt(0)
316 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
317 ; CI-NEXT: s_waitcnt vmcnt(0)
318 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 offset:4 glc
319 ; CI-NEXT: s_waitcnt vmcnt(0)
325 ; GFX9: ; %bb.0:
326 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
328 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
329 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
330 ; GFX9-NEXT: s_waitcnt vmcnt(0)
331 ; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
332 ; GFX9-NEXT: s_waitcnt vmcnt(0)
336 %in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %x.i
337 %in.gep.1 = getelementptr float, ptr addrspace(1) %in.gep.0, i32 1
338 %val0 = load volatile float, ptr addrspace(1) %in.gep.0, align 4
340 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
343 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
348 define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
350 ; CI: ; %bb.0:
351 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
352 ; CI-NEXT: s_mov_b32 s7, 0xf000
353 ; CI-NEXT: s_mov_b32 s6, 0
355 ; CI-NEXT: v_mov_b32_e32 v1, 0
356 ; CI-NEXT: s_waitcnt lgkmcnt(0)
357 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
358 ; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
360 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
361 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
365 ; CI-NEXT: s_waitcnt vmcnt(0)
370 ; GFX9: ; %bb.0:
371 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
373 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
374 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
378 ; GFX9-NEXT: s_waitcnt vmcnt(0)
386 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
389 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
394 define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
396 ; CI: ; %bb.0:
397 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
398 ; CI-NEXT: s_mov_b32 s7, 0xf000
399 ; CI-NEXT: s_mov_b32 s6, 0
401 ; CI-NEXT: v_mov_b32_e32 v1, 0
402 ; CI-NEXT: s_waitcnt lgkmcnt(0)
403 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
404 ; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
406 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
407 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
409 ; CI-NEXT: s_waitcnt vmcnt(0)
415 ; GFX9: ; %bb.0:
416 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
418 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
419 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
421 ; GFX9-NEXT: s_waitcnt vmcnt(0)
431 %idx.0 = add nsw i32 %tid.x, 0
432 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
436 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
440 %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
444 %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
450 define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
452 ; CI: ; %bb.0:
453 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
454 ; CI-NEXT: s_mov_b32 s7, 0xf000
455 ; CI-NEXT: s_mov_b32 s6, 0
457 ; CI-NEXT: v_mov_b32_e32 v1, 0
458 ; CI-NEXT: s_waitcnt lgkmcnt(0)
459 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
460 ; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
462 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
463 ; CI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64
465 ; CI-NEXT: s_waitcnt vmcnt(0)
471 ; GFX9: ; %bb.0:
472 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
474 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
475 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
477 ; GFX9-NEXT: s_waitcnt vmcnt(0)
487 %idx.0 = add nsw i32 %tid.x, 3
488 %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
492 %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
496 %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
500 %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
506 define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x ptr addrspace(3)> %lds.ptr) #0 {
508 ; CI: ; %bb.0:
509 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
510 ; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x6
511 ; CI-NEXT: s_mov_b32 s7, 0xf000
512 ; CI-NEXT: s_mov_b32 s6, 0
514 ; CI-NEXT: s_waitcnt lgkmcnt(0)
515 ; CI-NEXT: s_mov_b64 s[4:5], s[0:1]
516 ; CI-NEXT: v_mov_b32_e32 v1, 0
517 ; CI-NEXT: s_mov_b64 s[0:1], s[2:3]
519 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
520 ; CI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
526 ; CI-NEXT: s_waitcnt vmcnt(0)
531 ; GFX9: ; %bb.0:
532 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
533 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x18
535 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
536 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
542 ; GFX9-NEXT: s_waitcnt vmcnt(0)
551 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
552 %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
554 %gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
559 store float %val0, ptr addrspace(3) %gep.0, align 4
566 define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
568 ; CI: ; %bb.0:
569 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
570 ; CI-NEXT: s_mov_b32 s3, 0xf000
571 ; CI-NEXT: s_mov_b32 s2, 0
573 ; CI-NEXT: v_mov_b32_e32 v1, 0
574 ; CI-NEXT: s_waitcnt lgkmcnt(0)
575 ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
577 ; CI-NEXT: s_waitcnt vmcnt(0)
582 ; GFX9: ; %bb.0:
583 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
585 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
586 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
587 ; GFX9-NEXT: s_waitcnt vmcnt(0)
588 ; GFX9-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:8
593 %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
596 %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
601 define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 {
603 ; CI: ; %bb.0:
604 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
605 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
606 ; CI-NEXT: s_mov_b32 s3, 0xf000
607 ; CI-NEXT: s_mov_b32 s2, 0
609 ; CI-NEXT: v_mov_b32_e32 v1, 0
610 ; CI-NEXT: s_waitcnt lgkmcnt(0)
611 ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
614 ; CI-NEXT: s_waitcnt vmcnt(0)
620 ; GFX9: ; %bb.0:
621 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
622 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x10
624 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
625 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
627 ; GFX9-NEXT: s_waitcnt vmcnt(0)
642 define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in, ptr addrspace(3) %lds) #0 {
644 ; CI: ; %bb.0:
645 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
646 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4
647 ; CI-NEXT: s_mov_b32 s3, 0xf000
648 ; CI-NEXT: s_mov_b32 s2, 0
650 ; CI-NEXT: v_mov_b32_e32 v1, 0
651 ; CI-NEXT: s_waitcnt lgkmcnt(0)
652 ; CI-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[0:3], 0 addr64
655 ; CI-NEXT: s_waitcnt vmcnt(0)
677 ; GFX9-ALIGNED: ; %bb.0:
678 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
679 ; GFX9-ALIGNED-NEXT: s_load_dword s2, s[4:5], 0x10
681 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
682 ; GFX9-ALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
684 ; GFX9-ALIGNED-NEXT: s_waitcnt vmcnt(0)
704 ; GFX9-UNALIGNED: ; %bb.0:
705 ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
706 ; GFX9-UNALIGNED-NEXT: s_load_dword s2, s[4:5], 0x10
708 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
709 ; GFX9-UNALIGNED-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
711 ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
712 ; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:5
713 ; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:9
726 define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr addrspace(1) %in) #0 {
728 ; CI: ; %bb.0:
729 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
730 ; CI-NEXT: s_mov_b32 s3, 0xf000
731 ; CI-NEXT: s_mov_b32 s2, 0
733 ; CI-NEXT: v_mov_b32_e32 v1, 0
734 ; CI-NEXT: s_waitcnt lgkmcnt(0)
735 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 glc
736 ; CI-NEXT: s_waitcnt vmcnt(0)
737 ; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 glc
738 ; CI-NEXT: s_waitcnt vmcnt(0)
744 ; GFX9: ; %bb.0:
745 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
747 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
748 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] glc
749 ; GFX9-NEXT: s_waitcnt vmcnt(0)
750 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc
751 ; GFX9-NEXT: s_waitcnt vmcnt(0)
752 ; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:8
755 %in.gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %x.i
756 %in.gep.1 = getelementptr double, ptr addrspace(1) %in.gep.0, i32 1
757 %val0 = load volatile double, ptr addrspace(1) %in.gep.0, align 8
759 %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
762 %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
771 ; CI: ; %bb.0:
772 ; CI-NEXT: v_mov_b32_e32 v0, 0x7b
774 ; CI-NEXT: v_mov_b32_e32 v2, 0
776 ; CI-NEXT: ds_write_b64 v2, v[0:1]
780 ; GFX9: ; %bb.0:
781 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
783 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
784 ; GFX9-NEXT: ds_write_b64 v2, v[0:1]
787 store i32 123, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 1), align 4
793 ; CI: ; %bb.0:
794 ; CI-NEXT: v_mov_b32_e32 v0, 0x7b
795 ; CI-NEXT: v_mov_b32_e32 v1, 0
801 ; GFX9: ; %bb.0:
802 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
803 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
807 store i32 123, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 2), align 4
815 ; CI: ; %bb.0:
816 ; CI-NEXT: v_mov_b32_e32 v0, 0x7b
817 ; CI-NEXT: v_mov_b32_e32 v1, 0
821 ; CI-NEXT: ds_write_b128 v1, v[0:3]
825 ; GFX9: ; %bb.0:
826 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
827 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
830 ; GFX9-NEXT: ds_write_b128 v1, v[0:3]
833 store i64 123, ptr addrspace(3) getelementptr inbounds ([4 x i64], ptr addrspace(3) @bar, i32 0, i32 1), align 4
841 ; CI: ; %bb.0:
842 ; CI-NEXT: s_mov_b64 s[0:1], 0x7b
844 ; CI-NEXT: v_mov_b32_e32 v2, 0
847 ; CI-NEXT: ds_write_b64 v2, v[0:1] offset:16384
848 ; CI-NEXT: ds_write_b64 v2, v[0:1] offset:32760
852 ; GFX9: ; %bb.0:
853 ; GFX9-NEXT: s_mov_b64 s[0:1], 0x7b
855 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
857 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:16384
858 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:32760
860 store i64 123, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 2048), align 4
861 store i64 123, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 4095), align 4
868 define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb, ptr addrspace(1) %in) #0 {
870 ; CI: ; %bb.0:
871 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4
873 ; CI-NEXT: s_waitcnt lgkmcnt(0)
874 ; CI-NEXT: s_load_dword s0, s[0:1], 0x0
876 ; CI-NEXT: s_add_i32 s2, s1, 0xc20
877 ; CI-NEXT: s_addk_i32 s1, 0xc60
879 ; CI-NEXT: s_waitcnt lgkmcnt(0)
892 ; GFX9: ; %bb.0:
893 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
895 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
896 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
897 ; GFX9-NEXT: s_add_i32 s1, s2, 0xc20
898 ; GFX9-NEXT: s_addk_i32 s2, 0xc60
901 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
914 %arrayidx44 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %x.i
917 %arrayidx48 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add47
920 %arrayidx52 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add51
923 %arrayidx56 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add55
925 %arrayidx60 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %y.i
928 %arrayidx64 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add63
931 %arrayidx68 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add67
934 %arrayidx72 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add71
937 %arrayidx76 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add75
940 %arrayidx80 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add79
945 define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) %out, ptr addrspace(1) %in) #0 {
947 ; CI: ; %bb.0:
948 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
949 ; CI-NEXT: s_load_dword s4, s[4:5], 0x0
952 ; CI-NEXT: s_waitcnt lgkmcnt(0)
953 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
955 ; CI-NEXT: s_waitcnt lgkmcnt(0)
965 ; GFX9-ALIGNED: ; %bb.0:
966 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
967 ; GFX9-ALIGNED-NEXT: s_load_dword s8, s[4:5], 0x0
968 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
970 ; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
971 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
981 ; GFX9-UNALIGNED: ; %bb.0:
982 ; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
983 ; GFX9-UNALIGNED-NEXT: s_load_dword s8, s[4:5], 0x0
984 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
986 ; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
987 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
1007 ; CI: ; %bb.0: ; %entry
1008 ; CI-NEXT: v_mov_b32_e32 v0, 0x7b
1009 ; CI-NEXT: v_mov_b32_e32 v1, 0
1014 ; CI-NEXT: v_mov_b32_e32 v0, 0xc8
1024 ; GFX9-ALIGNED: ; %bb.0: ; %entry
1025 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
1026 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0
1030 ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0xc8
1040 ; GFX9-UNALIGNED: ; %bb.0: ; %entry
1041 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
1042 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0x1c8
1043 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0
1044 ; GFX9-UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] offset:65
1056 attributes #0 = { nounwind }