Lines Matching +full:0 +full:x307

7 ; GFX10:       ; %bb.0:
8 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10 ; GFX10-NEXT: s_waitcnt vmcnt(0)
11 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x6060706
16 ; GFX9: ; %bb.0:
17 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19 ; GFX9-NEXT: s_mov_b32 s4, 0x6060706
20 ; GFX9-NEXT: s_waitcnt vmcnt(0)
23 ; GFX9-NEXT: s_waitcnt vmcnt(0)
34 ; GFX10: ; %bb.0:
35 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
38 ; GFX10-NEXT: s_waitcnt vmcnt(0)
39 ; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x307
44 ; GFX9: ; %bb.0:
45 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
46 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
48 ; GFX9-NEXT: s_movk_i32 s4, 0x307
49 ; GFX9-NEXT: s_waitcnt vmcnt(0)
52 ; GFX9-NEXT: s_waitcnt vmcnt(0)
63 ; GFX10: ; %bb.0:
64 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
66 ; GFX10-NEXT: s_waitcnt vmcnt(0)
67 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040404
72 ; GFX9: ; %bb.0:
73 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
75 ; GFX9-NEXT: s_mov_b32 s4, 0x5040404
76 ; GFX9-NEXT: s_waitcnt vmcnt(0)
79 ; GFX9-NEXT: s_waitcnt vmcnt(0)
90 ; GFX10: ; %bb.0:
91 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
93 ; GFX10-NEXT: s_waitcnt vmcnt(0)
94 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504
99 ; GFX9: ; %bb.0:
100 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
101 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
102 ; GFX9-NEXT: s_mov_b32 s4, 0x5040504
103 ; GFX9-NEXT: s_waitcnt vmcnt(0)
106 ; GFX9-NEXT: s_waitcnt vmcnt(0)
110 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
117 ; GFX10: ; %bb.0:
118 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
119 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
121 ; GFX10-NEXT: s_waitcnt vmcnt(0)
122 ; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x40405
127 ; GFX9: ; %bb.0:
128 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
129 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
131 ; GFX9-NEXT: s_mov_b32 s4, 0x40405
132 ; GFX9-NEXT: s_waitcnt vmcnt(0)
135 ; GFX9-NEXT: s_waitcnt vmcnt(0)
139 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 0, i32 0, i32 4>
146 define hidden void @shuffle7533(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) {
148 ; GFX10: ; %bb.0:
149 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
150 ; GFX10-NEXT: flat_load_dword v6, v[0:1]
152 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
153 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3030507
155 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
159 ; GFX9: ; %bb.0:
160 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161 ; GFX9-NEXT: flat_load_dword v6, v[0:1]
163 ; GFX9-NEXT: s_mov_b32 s4, 0x3030507
164 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
167 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
169 %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4
170 %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4
172 store <4 x i8> %shuffle0_0, ptr addrspace(0) %out0, align 4
176 define hidden void @shuffle7767(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) {
178 ; GFX10: ; %bb.0:
179 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
181 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
182 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060707
184 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
188 ; GFX9: ; %bb.0:
189 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
191 ; GFX9-NEXT: s_mov_b32 s4, 0x7060707
192 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
195 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
197 %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4
198 %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4
200 store <4 x i8> %shuffle0_0, ptr addrspace(0) %out0, align 4
206 ; GFX10: ; %bb.0:
207 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
211 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x10104
213 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
217 ; GFX9: ; %bb.0:
218 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
221 ; GFX9-NEXT: s_mov_b32 s4, 0x10104
222 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
225 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
229 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 4>
236 ; GFX10: ; %bb.0:
237 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
240 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
241 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3060506
243 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
247 ; GFX9: ; %bb.0:
248 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
251 ; GFX9-NEXT: s_mov_b32 s4, 0x3060506
252 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
255 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
266 ; GFX10: ; %bb.0:
267 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
268 ; GFX10-NEXT: s_clause 0x1
269 ; GFX10-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
270 ; GFX10-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen
271 ; GFX10-NEXT: s_waitcnt vmcnt(0)
272 ; GFX10-NEXT: v_perm_b32 v0, v4, v3, 0x7040005
273 ; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
277 ; GFX9: ; %bb.0:
278 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
279 ; GFX9-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
280 ; GFX9-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen
281 ; GFX9-NEXT: s_mov_b32 s4, 0x7040005
282 ; GFX9-NEXT: s_waitcnt vmcnt(0)
284 ; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
285 ; GFX9-NEXT: s_waitcnt vmcnt(0)
289 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 0, i32 4, i32 7>
296 ; GFX10: ; %bb.0:
297 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
298 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
300 ; GFX10-NEXT: s_waitcnt vmcnt(0)
301 ; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x2000107
306 ; GFX9: ; %bb.0:
307 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
308 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
310 ; GFX9-NEXT: s_mov_b32 s4, 0x2000107
311 ; GFX9-NEXT: s_waitcnt vmcnt(0)
314 ; GFX9-NEXT: s_waitcnt vmcnt(0)
326 ; GFX10: ; %bb.0:
327 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
328 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
329 ; GFX10-NEXT: s_waitcnt vmcnt(0)
330 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x4070706
335 ; GFX9: ; %bb.0:
336 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
337 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
338 ; GFX9-NEXT: s_mov_b32 s4, 0x4070706
339 ; GFX9-NEXT: s_waitcnt vmcnt(0)
342 ; GFX9-NEXT: s_waitcnt vmcnt(0)
345 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 7, i32 3, i32 3, i32 0>
352 ; GFX10: ; %bb.0:
353 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
354 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
355 ; GFX10-NEXT: s_waitcnt vmcnt(0)
361 ; GFX9: ; %bb.0:
362 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
363 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
364 ; GFX9-NEXT: s_waitcnt vmcnt(0)
367 ; GFX9-NEXT: s_waitcnt vmcnt(0)
377 ; GFX10: ; %bb.0:
378 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
379 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
380 ; GFX10-NEXT: s_waitcnt vmcnt(0)
381 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504
386 ; GFX9: ; %bb.0:
387 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
388 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
389 ; GFX9-NEXT: s_mov_b32 s4, 0x5040504
390 ; GFX9-NEXT: s_waitcnt vmcnt(0)
393 ; GFX9-NEXT: s_waitcnt vmcnt(0)
396 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 6, i32 1, i32 0, i32 6>
404 ; GFX10: ; %bb.0:
405 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
406 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
407 ; GFX10-NEXT: s_waitcnt vmcnt(0)
408 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706
413 ; GFX9: ; %bb.0:
414 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
415 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
416 ; GFX9-NEXT: s_mov_b32 s4, 0x7060706
417 ; GFX9-NEXT: s_waitcnt vmcnt(0)
420 ; GFX9-NEXT: s_waitcnt vmcnt(0)
430 ; GFX10: ; %bb.0:
431 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
432 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
433 ; GFX10-NEXT: s_waitcnt vmcnt(0)
434 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060607
439 ; GFX9: ; %bb.0:
440 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
441 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
442 ; GFX9-NEXT: s_mov_b32 s4, 0x7060607
443 ; GFX9-NEXT: s_waitcnt vmcnt(0)
446 ; GFX9-NEXT: s_waitcnt vmcnt(0)
456 ; GFX10: ; %bb.0:
457 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
458 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
459 ; GFX10-NEXT: s_waitcnt vmcnt(0)
460 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706
465 ; GFX9: ; %bb.0:
466 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
467 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
468 ; GFX9-NEXT: s_mov_b32 s4, 0x7060706
469 ; GFX9-NEXT: s_waitcnt vmcnt(0)
472 ; GFX9-NEXT: s_waitcnt vmcnt(0)
482 ; GFX10: ; %bb.0:
483 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
484 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
485 ; GFX10-NEXT: s_waitcnt vmcnt(0)
486 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060705
491 ; GFX9: ; %bb.0:
492 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
493 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
494 ; GFX9-NEXT: s_mov_b32 s4, 0x7060705
495 ; GFX9-NEXT: s_waitcnt vmcnt(0)
498 ; GFX9-NEXT: s_waitcnt vmcnt(0)
508 ; GFX10: ; %bb.0:
509 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
510 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
511 ; GFX10-NEXT: s_waitcnt vmcnt(0)
512 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504
517 ; GFX9: ; %bb.0:
518 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
519 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
520 ; GFX9-NEXT: s_mov_b32 s4, 0x5040504
521 ; GFX9-NEXT: s_waitcnt vmcnt(0)
524 ; GFX9-NEXT: s_waitcnt vmcnt(0)
527 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 0, i32 6, i32 0, i32 5>
534 ; GFX10: ; %bb.0:
535 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
536 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
538 ; GFX10-NEXT: s_waitcnt vmcnt(0)
540 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
545 ; GFX9: ; %bb.0:
546 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
547 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
549 ; GFX9-NEXT: s_waitcnt vmcnt(0)
551 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
553 ; GFX9-NEXT: s_waitcnt vmcnt(0)
565 ; GFX10: ; %bb.0:
566 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
567 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
571 ; GFX10-NEXT: s_waitcnt vmcnt(0)
587 ; GFX9: ; %bb.0:
588 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
589 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
591 ; GFX9-NEXT: s_waitcnt vmcnt(0)
598 ; GFX9-NEXT: s_waitcnt vmcnt(0)
602 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 7, i32 0, i32 6, i32 3>
609 define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out1) #0 {
611 ; GFX10: ; %bb.0: ; %bb
612 ; GFX10-NEXT: s_clause 0x1
613 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
614 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
615 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
616 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
617 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
618 ; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
619 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
620 ; GFX10-NEXT: s_bfe_u32 s2, s5, 0x80008
622 ; GFX10-NEXT: s_bfe_u32 s9, s9, 0x100010
623 ; GFX10-NEXT: s_bfe_u32 s0, s4, 0x80008
625 ; GFX10-NEXT: s_and_b32 s5, s8, 0xff00
626 ; GFX10-NEXT: s_bfe_u32 s8, s4, 0x80010
627 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff
633 ; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
635 ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
641 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
645 ; GFX9: ; %bb.0: ; %bb
646 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
647 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
648 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
649 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
650 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
651 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
652 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
653 ; GFX9-NEXT: s_bfe_u32 s0, s4, 0x80008
655 ; GFX9-NEXT: s_bfe_u32 s2, s5, 0x80008
659 ; GFX9-NEXT: s_bfe_u32 s2, s4, 0x80010
660 ; GFX9-NEXT: s_and_b32 s3, s4, 0xff
661 ; GFX9-NEXT: s_bfe_u32 s4, s9, 0x100010
662 ; GFX9-NEXT: s_and_b32 s5, s8, 0xff00
666 ; GFX9-NEXT: s_and_b32 s3, s3, 0xffff
668 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
674 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
679 %shuffle0 = shufflevector <8 x i8> %vec0, <8 x i8> %vec1, <8 x i32> <i32 1, i32 8, i32 5, i32 12, i32 0, i32 14, i32 2, i32 9>
690 ; GFX10: ; %bb.0:
691 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
692 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
696 ; GFX10-NEXT: s_waitcnt vmcnt(0)
714 ; GFX9: ; %bb.0:
715 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
716 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
718 ; GFX9-NEXT: s_waitcnt vmcnt(0)
727 ; GFX9-NEXT: s_waitcnt vmcnt(0)
740 ; GFX10: ; %bb.0:
741 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
742 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
745 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
747 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
748 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
752 ; GFX10-NEXT: s_waitcnt vmcnt(0)
759 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
764 ; GFX9: ; %bb.0:
765 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
766 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
769 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
771 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
772 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
774 ; GFX9-NEXT: s_waitcnt vmcnt(0)
778 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
780 ; GFX9-NEXT: s_waitcnt vmcnt(0)
796 ; GFX10: ; %bb.0:
797 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
798 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
802 ; GFX10-NEXT: s_waitcnt vmcnt(0)
806 ; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffff00
812 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
813 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
819 ; GFX9: ; %bb.0:
820 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
821 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
823 ; GFX9-NEXT: s_movk_i32 s4, 0xff00
826 ; GFX9-NEXT: s_waitcnt vmcnt(0)
831 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
832 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
835 ; GFX9-NEXT: s_waitcnt vmcnt(0)
849 ; GFX10: ; %bb.0:
850 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
851 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
854 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
856 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
857 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
861 ; GFX10-NEXT: s_waitcnt vmcnt(0)
865 ; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffff00
871 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
872 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
878 ; GFX9: ; %bb.0:
879 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
880 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
883 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
885 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
886 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
888 ; GFX9-NEXT: s_movk_i32 s4, 0xff00
891 ; GFX9-NEXT: s_waitcnt vmcnt(0)
896 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
897 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
900 ; GFX9-NEXT: s_waitcnt vmcnt(0)
917 ; GFX10: ; %bb.0:
918 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
919 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
922 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
924 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
925 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
929 ; GFX10-NEXT: s_waitcnt vmcnt(0)
943 ; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x10705
949 ; GFX9: ; %bb.0:
950 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
951 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
954 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
956 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
957 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
959 ; GFX9-NEXT: s_mov_b32 s4, 0x10705
960 ; GFX9-NEXT: s_waitcnt vmcnt(0)
971 ; GFX9-NEXT: s_waitcnt vmcnt(0)
987 ; GFX10: ; %bb.0:
988 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
989 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
992 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
994 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
996 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1001 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1003 ; GFX10-NEXT: v_and_b32_e32 v3, 0x100, v9
1008 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5070006
1014 ; GFX9: ; %bb.0:
1015 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1016 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1019 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1021 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1022 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1024 ; GFX9-NEXT: s_mov_b32 s4, 0x5070006
1029 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1032 ; GFX9-NEXT: v_and_b32_e32 v9, 0x100, v4
1039 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1055 ; GFX10: ; %bb.0:
1056 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1057 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1060 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1061 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1063 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
1065 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
1067 ; GFX10-NEXT: v_bfe_i32 v1, v9, 0, 8
1072 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1074 ; GFX10-NEXT: v_perm_b32 v0, v9, v0, 0x4010707
1075 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff00, v1
1083 ; GFX9: ; %bb.0:
1084 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1085 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1088 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1090 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1091 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1094 ; GFX9-NEXT: s_mov_b32 s4, 0x4010707
1098 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1103 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff00, v1
1109 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1116 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 3, i32 5, i32 0>
1125 ; GFX10: ; %bb.0:
1126 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1127 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1130 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1132 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1133 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1135 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1136 ; GFX10-NEXT: v_perm_b32 v0, v9, v4, 0x7060104
1142 ; GFX9: ; %bb.0:
1143 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1144 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1147 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1149 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1150 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1152 ; GFX9-NEXT: s_mov_b32 s4, 0x7060104
1153 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1157 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1174 ; GFX10: ; %bb.0:
1175 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1176 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1179 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1181 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1182 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1186 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1187 ; GFX10-NEXT: v_perm_b32 v1, v5, v4, 0x1020305
1193 ; GFX9: ; %bb.0:
1194 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1195 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1198 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1200 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1201 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1203 ; GFX9-NEXT: s_mov_b32 s4, 0x1020305
1206 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1210 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1227 ; GFX10: ; %bb.0:
1228 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1229 ; GFX10-NEXT: v_and_b32_e32 v9, 0x3ff, v31
1232 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1234 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1235 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1238 ; GFX10-NEXT: v_mov_b32_e32 v1, 0xff
1242 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1247 ; GFX10-NEXT: v_perm_b32 v1, v10, v9, 0x2000706
1253 ; GFX9: ; %bb.0:
1254 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1255 ; GFX9-NEXT: v_and_b32_e32 v9, 0x3ff, v31
1258 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1260 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1261 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
1263 ; GFX9-NEXT: s_movk_i32 s4, 0xff
1265 ; GFX9-NEXT: s_mov_b32 s5, 0x2000706
1268 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1277 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1284 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 7, i32 0, i32 2>
1294 ; GFX10: ; %bb.0:
1295 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1296 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1299 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1301 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1302 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1307 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1311 ; GFX10-NEXT: v_and_b32_e32 v1, 0x7f00, v1
1315 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x1030707
1321 ; GFX9: ; %bb.0:
1322 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1323 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1326 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1328 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1329 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1332 ; GFX9-NEXT: s_mov_b32 s4, 0x1030707
1335 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1341 ; GFX9-NEXT: v_and_b32_e32 v2, 0x7f00, v3
1346 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1363 ; GFX10: ; %bb.0:
1364 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1365 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1368 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1370 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1371 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1375 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1388 ; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x2000504
1394 ; GFX9: ; %bb.0:
1395 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1396 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1399 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1401 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1402 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1404 ; GFX9-NEXT: s_mov_b32 s4, 0x2000504
1405 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1416 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1423 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 4, i32 6>
1433 ; GFX10: ; %bb.0:
1434 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1435 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1438 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1440 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1442 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1448 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1451 ; GFX10-NEXT: v_or_b32_e32 v1, 0x201, v1
1454 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x2010005
1460 ; GFX9: ; %bb.0:
1461 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1462 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1465 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1468 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1469 ; GFX9-NEXT: global_load_dword v0, v[0:1], off
1470 ; GFX9-NEXT: s_mov_b32 s4, 0x2010005
1471 ; GFX9-NEXT: s_movk_i32 s5, 0x102
1478 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1481 ; GFX9-NEXT: v_or_b32_e32 v0, 0x201, v0
1485 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1501 ; GFX10: ; %bb.0:
1502 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1503 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1506 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1508 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1510 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1514 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1548 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
1554 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo
1558 ; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo
1562 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v13, vcc_lo
1566 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x60706
1572 ; GFX9: ; %bb.0:
1573 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1574 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1577 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1579 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1581 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
1582 ; GFX9-NEXT: s_mov_b32 s4, 0x60706
1586 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1622 ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
1626 ; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
1629 ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
1631 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v14, vcc
1641 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1658 ; GFX10: ; %bb.0:
1659 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1660 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1663 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1665 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1667 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1670 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1675 ; GFX10-NEXT: v_perm_b32 v1, v0, v2, 0x5040100
1676 ; GFX10-NEXT: v_perm_b32 v0, v3, v3, 0x5040100
1677 ; GFX10-NEXT: v_perm_b32 v2, v9, v4, 0x3010707
1678 ; GFX10-NEXT: global_store_dwordx2 v[7:8], v[0:1], off
1683 ; GFX9: ; %bb.0:
1684 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1685 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1688 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1690 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1691 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1694 ; GFX9-NEXT: s_mov_b32 s5, 0x5040100
1695 ; GFX9-NEXT: s_mov_b32 s4, 0x3010707
1696 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1703 ; GFX9-NEXT: global_store_dwordx2 v[7:8], v[0:1], off
1705 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1722 ; GFX10: ; %bb.0:
1723 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1724 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1727 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1729 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1730 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1734 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1736 ; GFX10-NEXT: v_and_b32_e32 v2, 0xfffffc00, v0
1737 ; GFX10-NEXT: v_and_b32_e32 v3, 0xfe, v1
1738 ; GFX10-NEXT: v_and_b32_e32 v1, 0xfffffe00, v1
1739 ; GFX10-NEXT: v_and_b32_e32 v0, 0xfc, v0
1742 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5000104
1749 ; GFX9: ; %bb.0:
1750 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1751 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1754 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1756 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1757 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1759 ; GFX9-NEXT: s_mov_b32 s4, 0x5000104
1762 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1765 ; GFX9-NEXT: v_and_b32_e32 v3, 0xfffffc00, v1
1766 ; GFX9-NEXT: v_and_b32_e32 v4, 0xfe, v2
1767 ; GFX9-NEXT: v_and_b32_e32 v2, 0xfffffe00, v2
1768 ; GFX9-NEXT: v_and_b32_e32 v1, 0xfc, v1
1774 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1781 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 0, i32 5>
1791 ; GFX10: ; %bb.0:
1792 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1793 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1796 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1798 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1800 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1803 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1807 ; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x6010205
1808 ; GFX10-NEXT: v_bfe_i32 v10, v0, 0, 8
1809 ; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
1814 ; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
1819 ; GFX9: ; %bb.0:
1820 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1821 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1824 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1825 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
1827 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
1828 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1829 ; GFX9-NEXT: s_mov_b32 s4, 0x6010205
1833 ; GFX9-NEXT: v_bfe_i32 v10, v0, 0, 8
1834 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1837 ; GFX9-NEXT: v_bfe_i32 v11, v2, 0, 8
1843 ; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
1845 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1862 ; GFX10: ; %bb.0:
1863 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1864 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1867 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
1869 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1871 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
1876 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1909 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo
1915 ; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo
1921 ; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v14, vcc_lo
1926 ; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v16, vcc_lo
1938 ; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x2070306
1944 ; GFX9: ; %bb.0:
1945 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1946 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
1949 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1951 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1953 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
1954 ; GFX9-NEXT: s_mov_b32 s4, 0x2070306
1957 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1999 ; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
2001 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
2002 ; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v15, s[4:5]
2003 ; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
2024 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2041 ; GFX10: ; %bb.0:
2042 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2043 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2046 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2048 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2050 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
2055 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2060 ; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x6070007
2071 ; GFX9: ; %bb.0:
2072 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2073 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2076 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2078 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2079 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2081 ; GFX9-NEXT: s_mov_b32 s4, 0x6070007
2082 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2093 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2100 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 0, i32 7, i32 6>
2110 ; GFX10: ; %bb.0:
2111 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2112 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2115 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2117 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2118 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
2120 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2121 ; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x50705
2126 ; GFX9: ; %bb.0:
2127 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2128 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2131 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2133 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2134 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2136 ; GFX9-NEXT: s_mov_b32 s4, 0x50705
2137 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2140 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2148 %insvec = shufflevector <4 x i8> %shuffle0_0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 3, i32 7, i32 0>
2157 ; GFX10: ; %bb.0:
2158 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2159 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2162 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2164 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2165 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
2170 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2178 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x50205
2185 ; GFX9: ; %bb.0:
2186 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2187 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2190 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2192 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2193 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2196 ; GFX9-NEXT: s_mov_b32 s4, 0x50205
2199 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2211 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2218 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 2, i32 5, i32 0>
2227 ; GFX10: ; %bb.0:
2228 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2229 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2232 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2234 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2236 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
2241 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2249 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x40207
2267 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo
2269 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
2272 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo
2276 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo
2284 ; GFX9: ; %bb.0:
2285 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2286 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2289 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2291 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2293 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
2294 ; GFX9-NEXT: s_mov_b32 s4, 0x40207
2300 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2322 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v11, vcc
2327 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v12, vcc
2329 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v13, vcc
2331 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v14, vcc
2339 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2346 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 6, i32 0, i32 4>
2356 ; GFX10: ; %bb.0:
2357 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2358 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2361 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2363 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2365 ; GFX10-NEXT: global_load_dword v9, v[0:1], off
2368 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2372 ; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5020104
2373 ; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
2378 ; GFX9: ; %bb.0:
2379 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2380 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2383 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2385 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2386 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2388 ; GFX9-NEXT: s_mov_b32 s4, 0x5020104
2391 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2396 ; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
2398 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2415 ; GFX10: ; %bb.0:
2416 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2417 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2420 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2422 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2424 ; GFX10-NEXT: global_load_dword v0, v[0:1], off
2430 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2456 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo
2459 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
2463 ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo
2467 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo
2472 ; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x2050505
2480 ; GFX9: ; %bb.0:
2481 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2482 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2485 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2487 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2489 ; GFX9-NEXT: global_load_dword v9, v[0:1], off
2490 ; GFX9-NEXT: s_mov_b32 s4, 0x2050505
2510 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2516 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
2521 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v16, vcc
2523 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v17, vcc
2528 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v18, vcc
2543 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2560 ; GFX10: ; %bb.0:
2561 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2562 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2565 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2567 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2568 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
2570 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffff00
2575 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2576 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v9
2579 ; GFX10-NEXT: v_xor_b32_e32 v0, 0x200, v0
2580 ; GFX10-NEXT: v_xor_b32_e32 v3, 0x100, v3
2584 ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5060307
2590 ; GFX9: ; %bb.0:
2591 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2592 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2595 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2597 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2598 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2600 ; GFX9-NEXT: s_movk_i32 s4, 0xff00
2603 ; GFX9-NEXT: s_mov_b32 s5, 0x5060307
2606 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2607 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff00, v9
2610 ; GFX9-NEXT: v_xor_b32_e32 v2, 0x200, v2
2611 ; GFX9-NEXT: v_xor_b32_e32 v3, 0x100, v3
2618 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2635 ; GFX10: ; %bb.0:
2636 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2637 ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2640 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
2642 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
2643 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
2645 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xff
2648 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v4
2649 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2650 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v9
2652 ; GFX10-NEXT: v_perm_b32 v0, v1, v2, 0x5040100
2653 ; GFX10-NEXT: v_perm_b32 v2, v4, v9, 0x60504
2654 ; GFX10-NEXT: v_perm_b32 v1, v3, v10, 0x5040100
2655 ; GFX10-NEXT: global_store_dwordx2 v[7:8], v[0:1], off
2660 ; GFX9: ; %bb.0:
2661 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2662 ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31
2665 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2667 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2668 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
2670 ; GFX9-NEXT: s_mov_b32 s4, 0x60504
2671 ; GFX9-NEXT: s_movk_i32 s5, 0xff
2672 ; GFX9-NEXT: s_mov_b32 s6, 0x5040100
2675 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2677 ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v4
2678 ; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v9
2682 ; GFX9-NEXT: global_store_dwordx2 v[7:8], v[0:1], off
2684 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2691 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
2700 ; GFX10: ; %bb.0: ; %entry
2701 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2702 ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3050204
2703 ; GFX10-NEXT: global_store_dword v[0:1], v0, off
2707 ; GFX9: ; %bb.0: ; %entry
2708 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2709 ; GFX9-NEXT: s_mov_b32 s4, 0x3050204
2711 ; GFX9-NEXT: global_store_dword v[0:1], v0, off
2712 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2734 ; GFX10: ; %bb.0:
2735 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2736 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2738 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2739 ; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3070404
2744 ; GFX9: ; %bb.0:
2745 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2746 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2748 ; GFX9-NEXT: s_mov_b32 s4, 0x3070404
2749 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2752 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2756 %v1e0 = extractelement <4 x i8> %vec1, i64 0
2778 ; GFX10: ; %bb.0:
2779 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2780 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2782 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2783 ; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3070404
2788 ; GFX9: ; %bb.0:
2789 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2790 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2792 ; GFX9-NEXT: s_mov_b32 s4, 0x3070404
2793 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2796 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2811 ; GFX10: ; %bb.0:
2812 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2813 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2815 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2816 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1030407
2821 ; GFX9: ; %bb.0:
2822 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2823 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2825 ; GFX9-NEXT: s_mov_b32 s4, 0x1030407
2826 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2829 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2833 %v1e0 = extractelement <2 x i16> %vec1, i64 0
2835 %v2e0 = extractelement <2 x i16> %vec2, i64 0
2868 ; GFX10: ; %bb.0:
2869 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2870 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2872 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2873 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
2878 ; GFX9: ; %bb.0:
2879 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2880 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2882 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
2883 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2886 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2890 %v1e0 = extractelement <2 x i16> %vec1, i64 0
2892 %v2e0 = extractelement <2 x i16> %vec2, i64 0
2895 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 8)
2896 %byte01 = zext i16 %tmp01.0 to i32
2898 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 8)
2899 %tmp23.1 = zext i16 %tmp23.0 to i32
2908 ; GFX10: ; %bb.0:
2909 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2910 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2912 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2913 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3020706
2918 ; GFX9: ; %bb.0:
2919 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2920 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2922 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706
2923 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2926 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2930 %v1e0 = extractelement <2 x i16> %vec1, i64 0
2932 %v2e0 = extractelement <2 x i16> %vec2, i64 0
2935 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 16)
2936 %byte01 = zext i16 %tmp01.0 to i32
2938 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 16)
2939 %tmp23.1 = zext i16 %tmp23.0 to i32
2948 ; GFX10: ; %bb.0:
2949 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2950 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2952 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2953 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
2958 ; GFX9: ; %bb.0:
2959 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2960 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
2962 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
2963 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2966 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2970 %v1e0 = extractelement <2 x i16> %vec1, i64 0
2972 %v2e0 = extractelement <2 x i16> %vec2, i64 0
2975 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 24)
2976 %byte01 = zext i16 %tmp01.0 to i32
2978 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 24)
2979 %tmp23.1 = zext i16 %tmp23.0 to i32
2988 ; GFX10: ; %bb.0:
2989 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2990 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
2992 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2993 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3020706
2998 ; GFX9: ; %bb.0:
2999 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3000 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3002 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706
3003 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3006 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3010 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3012 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3015 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 32)
3016 %byte01 = zext i16 %tmp01.0 to i32
3018 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 32)
3019 %tmp23.1 = zext i16 %tmp23.0 to i32
3028 ; GFX10: ; %bb.0:
3029 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3030 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3032 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3033 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
3038 ; GFX9: ; %bb.0:
3039 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3040 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3042 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
3043 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3046 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3050 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3052 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3055 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 88)
3056 %byte01 = zext i16 %tmp01.0 to i32
3058 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 88)
3059 %tmp23.1 = zext i16 %tmp23.0 to i32
3070 ; GFX10: ; %bb.0:
3071 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3072 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3074 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3075 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
3080 ; GFX9: ; %bb.0:
3081 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3082 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3084 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
3085 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3088 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3092 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3094 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3097 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 8)
3098 %byte01 = zext i16 %tmp01.0 to i32
3100 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 8)
3101 %tmp23.1 = zext i16 %tmp23.0 to i32
3110 ; GFX10: ; %bb.0:
3111 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3112 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3114 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3115 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1000504
3120 ; GFX9: ; %bb.0:
3121 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3122 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3124 ; GFX9-NEXT: s_mov_b32 s4, 0x1000504
3125 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3128 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3132 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3134 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3137 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 16)
3138 %byte01 = zext i16 %tmp01.0 to i32
3140 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 16)
3141 %tmp23.1 = zext i16 %tmp23.0 to i32
3150 ; GFX10: ; %bb.0:
3151 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3152 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3154 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3155 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
3160 ; GFX9: ; %bb.0:
3161 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3162 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3164 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
3165 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3168 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3172 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3174 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3177 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 24)
3178 %byte01 = zext i16 %tmp01.0 to i32
3180 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 24)
3181 %tmp23.1 = zext i16 %tmp23.0 to i32
3190 ; GFX10: ; %bb.0:
3191 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3192 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3194 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3195 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1000504
3200 ; GFX9: ; %bb.0:
3201 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3202 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3204 ; GFX9-NEXT: s_mov_b32 s4, 0x1000504
3205 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3208 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3212 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3214 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3217 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 32)
3218 %byte01 = zext i16 %tmp01.0 to i32
3220 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 32)
3221 %tmp23.1 = zext i16 %tmp23.0 to i32
3230 ; GFX10: ; %bb.0:
3231 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3232 ; GFX10-NEXT: global_load_dword v6, v[0:1], off
3234 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3235 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407
3240 ; GFX9: ; %bb.0:
3241 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3242 ; GFX9-NEXT: global_load_dword v6, v[0:1], off
3244 ; GFX9-NEXT: s_mov_b32 s4, 0x30407
3245 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3248 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3252 %v1e0 = extractelement <2 x i16> %vec1, i64 0
3254 %v2e0 = extractelement <2 x i16> %vec2, i64 0
3257 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 88)
3258 %byte01 = zext i16 %tmp01.0 to i32
3260 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 88)
3261 %tmp23.1 = zext i16 %tmp23.0 to i32
3270 ; GFX10: ; %bb.0:
3271 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3272 ; GFX10-NEXT: global_load_dword v7, v[0:1], off
3278 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v7
3280 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3288 ; GFX9: ; %bb.0:
3289 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3290 ; GFX9-NEXT: global_load_dword v7, v[0:1], off
3296 ; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v7
3298 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3303 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3307 %v1e0 = extractelement <4 x i8> %vec1, i64 0
3331 ; GFX10: ; %bb.0:
3332 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3333 ; GFX10-NEXT: global_load_dword v7, v[0:1], off
3340 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3349 ; GFX9: ; %bb.0:
3350 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3351 ; GFX9-NEXT: global_load_dword v7, v[0:1], off
3358 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3364 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3389 ; GFX10: ; %bb.0:
3390 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3392 ; GFX10-NEXT: global_load_dword v7, v[0:1], off offset:4
3393 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3394 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3060505
3399 ; GFX9: ; %bb.0:
3400 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3402 ; GFX9-NEXT: global_load_dword v7, v[0:1], off offset:4
3403 ; GFX9-NEXT: s_mov_b32 s4, 0x3060505
3404 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3407 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3431 ; GFX10: ; %bb.0:
3432 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3434 ; GFX10-NEXT: global_load_dword v7, v[0:1], off
3435 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3436 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x70404
3441 ; GFX9: ; %bb.0:
3442 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3444 ; GFX9-NEXT: global_load_dword v7, v[0:1], off
3445 ; GFX9-NEXT: s_mov_b32 s4, 0x70404
3446 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3449 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3453 %v1e0 = extractelement <8 x i8> %vec1, i64 0
3473 ; GFX10: ; %bb.0:
3474 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3476 ; GFX10-NEXT: global_load_dword v7, v[0:1], off offset:4
3477 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3478 ; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x2070505
3483 ; GFX9: ; %bb.0:
3484 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3486 ; GFX9-NEXT: global_load_dword v7, v[0:1], off offset:4
3487 ; GFX9-NEXT: s_mov_b32 s4, 0x2070505
3488 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3491 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3515 ; GFX10: ; %bb.0:
3516 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3517 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
3518 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3519 ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x1070404
3524 ; GFX9: ; %bb.0:
3525 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3526 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
3527 ; GFX9-NEXT: s_mov_b32 s4, 0x1070404
3528 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3531 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3554 ; GFX10: ; %bb.0:
3555 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3556 ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:252
3557 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3558 ; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x6050707
3563 ; GFX9: ; %bb.0:
3564 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3565 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:252
3566 ; GFX9-NEXT: s_mov_b32 s4, 0x6050707
3567 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3570 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3594 ; GFX10: ; %bb.0:
3595 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3596 ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
3600 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3602 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v6
3603 ; GFX10-NEXT: v_and_b32_e32 v0, 0xff0000, v0
3604 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff000000, v1
3611 ; GFX9: ; %bb.0:
3612 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3613 ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
3616 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v6
3618 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3620 ; GFX9-NEXT: v_and_b32_e32 v1, 0xff0000, v1
3621 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff000000, v2
3625 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3629 %v1e0 = extractelement <8 x i8> %vec1, i64 0
3650 ; GFX10: ; %bb.0:
3651 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3652 ; GFX10-NEXT: s_clause 0x3
3653 ; GFX10-NEXT: global_load_ushort v2, v[0:1], off offset:6
3654 ; GFX10-NEXT: global_load_ushort v3, v[0:1], off
3655 ; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:2
3656 ; GFX10-NEXT: global_load_ushort v9, v[0:1], off offset:4
3659 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3666 ; GFX9: ; %bb.0:
3667 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3668 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off offset:6
3669 ; GFX9-NEXT: global_load_ushort v3, v[0:1], off
3670 ; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:4
3671 ; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:2
3674 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3678 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3681 %el0 = extractelement <6 x i16> %vec, i32 0
3703 ; GFX10: ; %bb.0:
3704 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3705 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
3706 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3712 ; GFX9: ; %bb.0:
3713 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3714 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
3715 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3718 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3721 %el0 = extractelement <7 x i16> %vec, i32 0
3742 ; GFX10: ; %bb.0:
3743 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3744 ; GFX10-NEXT: s_clause 0x1
3745 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
3746 ; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:8
3749 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3750 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v8
3751 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040c00
3752 ; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x5040c03
3758 ; GFX9: ; %bb.0:
3759 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3760 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
3761 ; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8
3762 ; GFX9-NEXT: s_mov_b32 s4, 0x5040c00
3763 ; GFX9-NEXT: s_mov_b32 s5, 0x5040c03
3766 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3767 ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v8
3772 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3775 %el0 = extractelement <13 x i8> %vec, i32 0
3796 ; GFX10: ; %bb.0:
3797 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3798 ; GFX10-NEXT: s_clause 0x2
3799 ; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:48
3800 ; GFX10-NEXT: global_load_dwordx4 v[11:14], v[0:1], off
3801 ; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:64
3804 ; GFX10-NEXT: v_perm_b32 v0, v12, v13, 0x1000504
3805 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3806 ; GFX10-NEXT: v_perm_b32 v1, v10, v14, 0x1000504
3812 ; GFX9: ; %bb.0:
3813 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3814 ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:48
3815 ; GFX9-NEXT: global_load_dwordx4 v[11:14], v[0:1], off
3816 ; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:64
3817 ; GFX9-NEXT: s_mov_b32 s4, 0x1000504
3821 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3825 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3828 %el0 = extractelement <13 x i64> %vec, i32 0
3853 ; GFX10: ; %bb.0:
3854 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3855 ; GFX10-NEXT: s_clause 0x1
3856 ; GFX10-NEXT: global_load_ushort v2, v[0:1], off
3857 ; GFX10-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:4
3858 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3863 ; GFX9: ; %bb.0:
3864 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3865 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off
3866 ; GFX9-NEXT: global_load_ushort v3, v[0:1], off offset:4
3867 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
3868 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3871 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3875 %el0 = extractelement <2 x i16> %tvec, i32 0