1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s 3 4define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec, i32 %sel) { 5; GCN-LABEL: float4_inselt: 6; GCN: ; %bb.0: ; %entry 7; GCN-NEXT: s_load_dword s6, s[4:5], 0x44 8; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 9; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 10; GCN-NEXT: s_waitcnt lgkmcnt(0) 11; GCN-NEXT: s_cmp_lg_u32 s6, 3 12; GCN-NEXT: v_mov_b32_e32 v0, s3 13; GCN-NEXT: s_cselect_b64 vcc, -1, 0 14; GCN-NEXT: s_cmp_lg_u32 s6, 2 15; GCN-NEXT: v_cndmask_b32_e32 v3, 1.0, v0, vcc 16; GCN-NEXT: v_mov_b32_e32 v0, s2 17; GCN-NEXT: s_cselect_b64 vcc, -1, 0 18; GCN-NEXT: s_cmp_lg_u32 s6, 1 19; GCN-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc 20; GCN-NEXT: v_mov_b32_e32 v0, s1 21; GCN-NEXT: s_cselect_b64 vcc, -1, 0 22; GCN-NEXT: s_cmp_lg_u32 s6, 0 23; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc 24; GCN-NEXT: v_mov_b32_e32 v0, s0 25; GCN-NEXT: s_cselect_b64 vcc, -1, 0 26; GCN-NEXT: v_mov_b32_e32 v4, s4 27; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 28; GCN-NEXT: v_mov_b32_e32 v5, s5 29; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 30; GCN-NEXT: s_endpgm 31entry: 32 %v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel 33 store <4 x float> %v, ptr addrspace(1) %out 34 ret void 35} 36 37define amdgpu_kernel void @float4_inselt_undef(ptr addrspace(1) %out, i32 %sel) { 38; GCN-LABEL: float4_inselt_undef: 39; GCN: ; %bb.0: ; %entry 40; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 41; GCN-NEXT: v_mov_b32_e32 v0, 1.0 42; GCN-NEXT: v_mov_b32_e32 v1, v0 43; GCN-NEXT: v_mov_b32_e32 v2, v0 44; GCN-NEXT: v_mov_b32_e32 v3, v0 45; GCN-NEXT: s_waitcnt lgkmcnt(0) 46; GCN-NEXT: v_mov_b32_e32 v5, s1 47; GCN-NEXT: v_mov_b32_e32 v4, s0 48; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 49; GCN-NEXT: s_endpgm 50entry: 51 %v = insertelement <4 x float> undef, float 1.000000e+00, 
i32 %sel 52 store <4 x float> %v, ptr addrspace(1) %out 53 ret void 54} 55 56define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i32 %sel) { 57; GCN-LABEL: int4_inselt: 58; GCN: ; %bb.0: ; %entry 59; GCN-NEXT: s_load_dword s6, s[4:5], 0x44 60; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 61; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 62; GCN-NEXT: s_waitcnt lgkmcnt(0) 63; GCN-NEXT: s_cmp_lg_u32 s6, 3 64; GCN-NEXT: s_cselect_b32 s3, s3, 1 65; GCN-NEXT: s_cmp_lg_u32 s6, 2 66; GCN-NEXT: s_cselect_b32 s2, s2, 1 67; GCN-NEXT: s_cmp_lg_u32 s6, 1 68; GCN-NEXT: s_cselect_b32 s1, s1, 1 69; GCN-NEXT: s_cmp_lg_u32 s6, 0 70; GCN-NEXT: s_cselect_b32 s0, s0, 1 71; GCN-NEXT: v_mov_b32_e32 v4, s4 72; GCN-NEXT: v_mov_b32_e32 v0, s0 73; GCN-NEXT: v_mov_b32_e32 v1, s1 74; GCN-NEXT: v_mov_b32_e32 v2, s2 75; GCN-NEXT: v_mov_b32_e32 v3, s3 76; GCN-NEXT: v_mov_b32_e32 v5, s5 77; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 78; GCN-NEXT: s_endpgm 79entry: 80 %v = insertelement <4 x i32> %vec, i32 1, i32 %sel 81 store <4 x i32> %v, ptr addrspace(1) %out 82 ret void 83} 84 85define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec, i32 %sel) { 86; GCN-LABEL: float2_inselt: 87; GCN: ; %bb.0: ; %entry 88; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 89; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 90; GCN-NEXT: s_waitcnt lgkmcnt(0) 91; GCN-NEXT: s_cmp_lg_u32 s2, 1 92; GCN-NEXT: v_mov_b32_e32 v0, s1 93; GCN-NEXT: s_cselect_b64 vcc, -1, 0 94; GCN-NEXT: s_cmp_lg_u32 s2, 0 95; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc 96; GCN-NEXT: v_mov_b32_e32 v0, s0 97; GCN-NEXT: s_cselect_b64 vcc, -1, 0 98; GCN-NEXT: v_mov_b32_e32 v2, s4 99; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 100; GCN-NEXT: v_mov_b32_e32 v3, s5 101; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 102; GCN-NEXT: s_endpgm 103entry: 104 %v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel 105 store <2 x float> %v, ptr addrspace(1) %out 106 ret void 107} 108 109define 
amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec, i32 %sel) { 110; GCN-LABEL: float8_inselt: 111; GCN: ; %bb.0: ; %entry 112; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 113; GCN-NEXT: s_load_dword s2, s[4:5], 0x64 114; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 115; GCN-NEXT: s_waitcnt lgkmcnt(0) 116; GCN-NEXT: v_mov_b32_e32 v0, s8 117; GCN-NEXT: s_mov_b32 m0, s2 118; GCN-NEXT: s_add_u32 s2, s0, 16 119; GCN-NEXT: s_addc_u32 s3, s1, 0 120; GCN-NEXT: v_mov_b32_e32 v1, s9 121; GCN-NEXT: v_mov_b32_e32 v2, s10 122; GCN-NEXT: v_mov_b32_e32 v3, s11 123; GCN-NEXT: v_mov_b32_e32 v4, s12 124; GCN-NEXT: v_mov_b32_e32 v5, s13 125; GCN-NEXT: v_mov_b32_e32 v6, s14 126; GCN-NEXT: v_mov_b32_e32 v7, s15 127; GCN-NEXT: v_mov_b32_e32 v9, s3 128; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 129; GCN-NEXT: v_mov_b32_e32 v8, s2 130; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 131; GCN-NEXT: s_nop 0 132; GCN-NEXT: v_mov_b32_e32 v5, s1 133; GCN-NEXT: v_mov_b32_e32 v4, s0 134; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 135; GCN-NEXT: s_endpgm 136entry: 137 %v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel 138 store <8 x float> %v, ptr addrspace(1) %out 139 ret void 140} 141 142define amdgpu_kernel void @float16_inselt(ptr addrspace(1) %out, <16 x float> %vec, i32 %sel) { 143; GCN-LABEL: float16_inselt: 144; GCN: ; %bb.0: ; %entry 145; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 146; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 147; GCN-NEXT: s_load_dword s4, s[4:5], 0xa4 148; GCN-NEXT: s_waitcnt lgkmcnt(0) 149; GCN-NEXT: v_mov_b32_e32 v0, s8 150; GCN-NEXT: s_add_u32 s2, s0, 48 151; GCN-NEXT: s_addc_u32 s3, s1, 0 152; GCN-NEXT: v_mov_b32_e32 v17, s3 153; GCN-NEXT: v_mov_b32_e32 v1, s9 154; GCN-NEXT: v_mov_b32_e32 v2, s10 155; GCN-NEXT: v_mov_b32_e32 v3, s11 156; GCN-NEXT: v_mov_b32_e32 v4, s12 157; GCN-NEXT: v_mov_b32_e32 v5, s13 158; GCN-NEXT: v_mov_b32_e32 v6, s14 159; GCN-NEXT: v_mov_b32_e32 v7, s15 160; GCN-NEXT: v_mov_b32_e32 v8, s16 161; 
GCN-NEXT: v_mov_b32_e32 v9, s17 162; GCN-NEXT: v_mov_b32_e32 v10, s18 163; GCN-NEXT: v_mov_b32_e32 v11, s19 164; GCN-NEXT: v_mov_b32_e32 v12, s20 165; GCN-NEXT: v_mov_b32_e32 v13, s21 166; GCN-NEXT: v_mov_b32_e32 v14, s22 167; GCN-NEXT: v_mov_b32_e32 v15, s23 168; GCN-NEXT: s_mov_b32 m0, s4 169; GCN-NEXT: v_mov_b32_e32 v16, s2 170; GCN-NEXT: s_add_u32 s2, s0, 32 171; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 172; GCN-NEXT: s_addc_u32 s3, s1, 0 173; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] 174; GCN-NEXT: s_nop 0 175; GCN-NEXT: v_mov_b32_e32 v13, s3 176; GCN-NEXT: v_mov_b32_e32 v12, s2 177; GCN-NEXT: s_add_u32 s2, s0, 16 178; GCN-NEXT: s_addc_u32 s3, s1, 0 179; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] 180; GCN-NEXT: s_nop 0 181; GCN-NEXT: v_mov_b32_e32 v9, s3 182; GCN-NEXT: v_mov_b32_e32 v8, s2 183; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 184; GCN-NEXT: s_nop 0 185; GCN-NEXT: v_mov_b32_e32 v5, s1 186; GCN-NEXT: v_mov_b32_e32 v4, s0 187; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 188; GCN-NEXT: s_endpgm 189entry: 190 %v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel 191 store <16 x float> %v, ptr addrspace(1) %out 192 ret void 193} 194 195define amdgpu_kernel void @float32_inselt(ptr addrspace(1) %out, <32 x float> %vec, i32 %sel) { 196; GCN-LABEL: float32_inselt: 197; GCN: ; %bb.0: ; %entry 198; GCN-NEXT: s_load_dword s2, s[4:5], 0x124 199; GCN-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 200; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 201; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 202; GCN-NEXT: s_waitcnt lgkmcnt(0) 203; GCN-NEXT: v_mov_b32_e32 v0, s36 204; GCN-NEXT: s_mov_b32 m0, s2 205; GCN-NEXT: s_add_u32 s2, s0, 0x70 206; GCN-NEXT: s_addc_u32 s3, s1, 0 207; GCN-NEXT: v_mov_b32_e32 v33, s3 208; GCN-NEXT: v_mov_b32_e32 v1, s37 209; GCN-NEXT: v_mov_b32_e32 v2, s38 210; GCN-NEXT: v_mov_b32_e32 v3, s39 211; GCN-NEXT: v_mov_b32_e32 v4, s40 212; GCN-NEXT: v_mov_b32_e32 v5, s41 213; GCN-NEXT: v_mov_b32_e32 v6, s42 214; GCN-NEXT: 
v_mov_b32_e32 v7, s43 215; GCN-NEXT: v_mov_b32_e32 v8, s44 216; GCN-NEXT: v_mov_b32_e32 v9, s45 217; GCN-NEXT: v_mov_b32_e32 v10, s46 218; GCN-NEXT: v_mov_b32_e32 v11, s47 219; GCN-NEXT: v_mov_b32_e32 v12, s48 220; GCN-NEXT: v_mov_b32_e32 v13, s49 221; GCN-NEXT: v_mov_b32_e32 v14, s50 222; GCN-NEXT: v_mov_b32_e32 v15, s51 223; GCN-NEXT: v_mov_b32_e32 v16, s8 224; GCN-NEXT: v_mov_b32_e32 v17, s9 225; GCN-NEXT: v_mov_b32_e32 v18, s10 226; GCN-NEXT: v_mov_b32_e32 v19, s11 227; GCN-NEXT: v_mov_b32_e32 v20, s12 228; GCN-NEXT: v_mov_b32_e32 v21, s13 229; GCN-NEXT: v_mov_b32_e32 v22, s14 230; GCN-NEXT: v_mov_b32_e32 v23, s15 231; GCN-NEXT: v_mov_b32_e32 v24, s16 232; GCN-NEXT: v_mov_b32_e32 v25, s17 233; GCN-NEXT: v_mov_b32_e32 v26, s18 234; GCN-NEXT: v_mov_b32_e32 v27, s19 235; GCN-NEXT: v_mov_b32_e32 v28, s20 236; GCN-NEXT: v_mov_b32_e32 v29, s21 237; GCN-NEXT: v_mov_b32_e32 v30, s22 238; GCN-NEXT: v_mov_b32_e32 v31, s23 239; GCN-NEXT: v_mov_b32_e32 v32, s2 240; GCN-NEXT: s_add_u32 s2, s0, 0x60 241; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 242; GCN-NEXT: s_addc_u32 s3, s1, 0 243; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31] 244; GCN-NEXT: s_nop 0 245; GCN-NEXT: v_mov_b32_e32 v29, s3 246; GCN-NEXT: v_mov_b32_e32 v28, s2 247; GCN-NEXT: s_add_u32 s2, s0, 0x50 248; GCN-NEXT: s_addc_u32 s3, s1, 0 249; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27] 250; GCN-NEXT: s_nop 0 251; GCN-NEXT: v_mov_b32_e32 v25, s3 252; GCN-NEXT: v_mov_b32_e32 v24, s2 253; GCN-NEXT: s_add_u32 s2, s0, 64 254; GCN-NEXT: s_addc_u32 s3, s1, 0 255; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23] 256; GCN-NEXT: s_nop 0 257; GCN-NEXT: v_mov_b32_e32 v21, s3 258; GCN-NEXT: v_mov_b32_e32 v20, s2 259; GCN-NEXT: s_add_u32 s2, s0, 48 260; GCN-NEXT: s_addc_u32 s3, s1, 0 261; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] 262; GCN-NEXT: s_nop 0 263; GCN-NEXT: v_mov_b32_e32 v17, s3 264; GCN-NEXT: v_mov_b32_e32 v16, s2 265; GCN-NEXT: s_add_u32 s2, s0, 32 266; GCN-NEXT: s_addc_u32 s3, s1, 0 267; GCN-NEXT: 
flat_store_dwordx4 v[16:17], v[12:15] 268; GCN-NEXT: s_nop 0 269; GCN-NEXT: v_mov_b32_e32 v13, s3 270; GCN-NEXT: v_mov_b32_e32 v12, s2 271; GCN-NEXT: s_add_u32 s2, s0, 16 272; GCN-NEXT: s_addc_u32 s3, s1, 0 273; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] 274; GCN-NEXT: s_nop 0 275; GCN-NEXT: v_mov_b32_e32 v9, s3 276; GCN-NEXT: v_mov_b32_e32 v8, s2 277; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 278; GCN-NEXT: s_nop 0 279; GCN-NEXT: v_mov_b32_e32 v5, s1 280; GCN-NEXT: v_mov_b32_e32 v4, s0 281; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 282; GCN-NEXT: s_endpgm 283entry: 284 %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel 285 store <32 x float> %v, ptr addrspace(1) %out 286 ret void 287} 288 289define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) { 290; GCN-LABEL: half4_inselt: 291; GCN: ; %bb.0: ; %entry 292; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 293; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 294; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 295; GCN-NEXT: s_mov_b32 s5, s4 296; GCN-NEXT: s_waitcnt lgkmcnt(0) 297; GCN-NEXT: s_lshl_b32 s6, s6, 4 298; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6 299; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 300; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 301; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] 302; GCN-NEXT: v_mov_b32_e32 v0, s0 303; GCN-NEXT: v_mov_b32_e32 v2, s2 304; GCN-NEXT: v_mov_b32_e32 v1, s1 305; GCN-NEXT: v_mov_b32_e32 v3, s3 306; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 307; GCN-NEXT: s_endpgm 308entry: 309 %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel 310 store <4 x half> %v, ptr addrspace(1) %out 311 ret void 312} 313 314define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, i32 %sel) { 315; GCN-LABEL: half2_inselt: 316; GCN: ; %bb.0: ; %entry 317; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 318; GCN-NEXT: s_waitcnt lgkmcnt(0) 319; GCN-NEXT: s_lshl_b32 s3, s3, 4 320; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 321; 
GCN-NEXT: s_andn2_b32 s2, s2, s3 322; GCN-NEXT: s_and_b32 s3, s3, 0x3c003c00 323; GCN-NEXT: s_or_b32 s2, s3, s2 324; GCN-NEXT: v_mov_b32_e32 v0, s0 325; GCN-NEXT: v_mov_b32_e32 v1, s1 326; GCN-NEXT: v_mov_b32_e32 v2, s2 327; GCN-NEXT: flat_store_dword v[0:1], v2 328; GCN-NEXT: s_endpgm 329entry: 330 %v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel 331 store <2 x half> %v, ptr addrspace(1) %out 332 ret void 333} 334 335define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec, i32 %sel) { 336; GCN-LABEL: half8_inselt: 337; GCN: ; %bb.0: ; %entry 338; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 339; GCN-NEXT: s_load_dword s6, s[4:5], 0x44 340; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 341; GCN-NEXT: v_mov_b32_e32 v0, 0x3c00 342; GCN-NEXT: s_waitcnt lgkmcnt(0) 343; GCN-NEXT: s_lshr_b32 s7, s3, 16 344; GCN-NEXT: s_cmp_lg_u32 s6, 7 345; GCN-NEXT: v_mov_b32_e32 v1, s7 346; GCN-NEXT: s_cselect_b64 vcc, -1, 0 347; GCN-NEXT: s_cmp_lg_u32 s6, 6 348; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 349; GCN-NEXT: v_mov_b32_e32 v2, s3 350; GCN-NEXT: s_cselect_b64 vcc, -1, 0 351; GCN-NEXT: s_lshr_b32 s3, s2, 16 352; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 353; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc 354; GCN-NEXT: s_cmp_lg_u32 s6, 5 355; GCN-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 356; GCN-NEXT: v_mov_b32_e32 v1, s3 357; GCN-NEXT: s_cselect_b64 vcc, -1, 0 358; GCN-NEXT: s_cmp_lg_u32 s6, 4 359; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 360; GCN-NEXT: v_mov_b32_e32 v2, s2 361; GCN-NEXT: s_cselect_b64 vcc, -1, 0 362; GCN-NEXT: s_lshr_b32 s2, s1, 16 363; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 364; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc 365; GCN-NEXT: s_cmp_lg_u32 s6, 3 366; GCN-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 367; GCN-NEXT: v_mov_b32_e32 v1, s2 368; GCN-NEXT: s_cselect_b64 vcc, -1, 0 369; GCN-NEXT: s_cmp_lg_u32 
s6, 2 370; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 371; GCN-NEXT: v_mov_b32_e32 v4, s1 372; GCN-NEXT: s_cselect_b64 vcc, -1, 0 373; GCN-NEXT: s_lshr_b32 s1, s0, 16 374; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 375; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc 376; GCN-NEXT: s_cmp_lg_u32 s6, 1 377; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 378; GCN-NEXT: v_mov_b32_e32 v4, s1 379; GCN-NEXT: s_cselect_b64 vcc, -1, 0 380; GCN-NEXT: s_cmp_lg_u32 s6, 0 381; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc 382; GCN-NEXT: v_mov_b32_e32 v5, s0 383; GCN-NEXT: s_cselect_b64 vcc, -1, 0 384; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 385; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 386; GCN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 387; GCN-NEXT: v_mov_b32_e32 v4, s4 388; GCN-NEXT: v_mov_b32_e32 v5, s5 389; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 390; GCN-NEXT: s_endpgm 391entry: 392 %v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel 393 store <8 x half> %v, ptr addrspace(1) %out 394 ret void 395} 396 397define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, i32 %sel) { 398; GCN-LABEL: short2_inselt: 399; GCN: ; %bb.0: ; %entry 400; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 401; GCN-NEXT: s_waitcnt lgkmcnt(0) 402; GCN-NEXT: s_lshl_b32 s3, s3, 4 403; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 404; GCN-NEXT: s_andn2_b32 s2, s2, s3 405; GCN-NEXT: s_and_b32 s3, s3, 0x10001 406; GCN-NEXT: s_or_b32 s2, s3, s2 407; GCN-NEXT: v_mov_b32_e32 v0, s0 408; GCN-NEXT: v_mov_b32_e32 v1, s1 409; GCN-NEXT: v_mov_b32_e32 v2, s2 410; GCN-NEXT: flat_store_dword v[0:1], v2 411; GCN-NEXT: s_endpgm 412entry: 413 %v = insertelement <2 x i16> %vec, i16 1, i32 %sel 414 store <2 x i16> %v, ptr addrspace(1) %out 415 ret void 416} 417 418define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) { 419; GCN-LABEL: short4_inselt: 420; 
GCN: ; %bb.0: ; %entry 421; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 422; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 423; GCN-NEXT: s_mov_b32 s4, 0x10001 424; GCN-NEXT: s_mov_b32 s5, s4 425; GCN-NEXT: s_waitcnt lgkmcnt(0) 426; GCN-NEXT: s_lshl_b32 s6, s6, 4 427; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6 428; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] 429; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 430; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] 431; GCN-NEXT: v_mov_b32_e32 v0, s0 432; GCN-NEXT: v_mov_b32_e32 v2, s2 433; GCN-NEXT: v_mov_b32_e32 v1, s1 434; GCN-NEXT: v_mov_b32_e32 v3, s3 435; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 436; GCN-NEXT: s_endpgm 437entry: 438 %v = insertelement <4 x i16> %vec, i16 1, i32 %sel 439 store <4 x i16> %v, ptr addrspace(1) %out 440 ret void 441} 442 443define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) { 444; GCN-LABEL: byte8_inselt: 445; GCN: ; %bb.0: ; %entry 446; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 447; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 448; GCN-NEXT: s_waitcnt lgkmcnt(0) 449; GCN-NEXT: s_lshl_b32 s4, s6, 3 450; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4 451; GCN-NEXT: s_and_b32 s7, s5, 0x1010101 452; GCN-NEXT: s_and_b32 s6, s4, 0x1010101 453; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] 454; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] 455; GCN-NEXT: v_mov_b32_e32 v0, s0 456; GCN-NEXT: v_mov_b32_e32 v2, s2 457; GCN-NEXT: v_mov_b32_e32 v1, s1 458; GCN-NEXT: v_mov_b32_e32 v3, s3 459; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 460; GCN-NEXT: s_endpgm 461entry: 462 %v = insertelement <8 x i8> %vec, i8 1, i32 %sel 463 store <8 x i8> %v, ptr addrspace(1) %out 464 ret void 465} 466 467define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec, i32 %sel) { 468; GCN-LABEL: byte16_inselt: 469; GCN: ; %bb.0: ; %entry 470; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 471; GCN-NEXT: s_load_dword s6, s[4:5], 0x44 472; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 473; 
GCN-NEXT: s_waitcnt lgkmcnt(0) 474; GCN-NEXT: s_lshr_b32 s7, s3, 24 475; GCN-NEXT: s_cmp_lg_u32 s6, 15 476; GCN-NEXT: s_cselect_b32 s7, s7, 1 477; GCN-NEXT: s_lshr_b32 s8, s3, 16 478; GCN-NEXT: s_lshl_b32 s7, s7, 8 479; GCN-NEXT: s_cmp_lg_u32 s6, 14 480; GCN-NEXT: s_cselect_b32 s8, s8, 1 481; GCN-NEXT: s_and_b32 s8, s8, 0xff 482; GCN-NEXT: s_or_b32 s7, s8, s7 483; GCN-NEXT: s_lshr_b32 s9, s3, 8 484; GCN-NEXT: s_lshl_b32 s7, s7, 16 485; GCN-NEXT: s_cmp_lg_u32 s6, 13 486; GCN-NEXT: s_cselect_b32 s8, s9, 1 487; GCN-NEXT: s_lshl_b32 s8, s8, 8 488; GCN-NEXT: s_cmp_lg_u32 s6, 12 489; GCN-NEXT: s_cselect_b32 s3, s3, 1 490; GCN-NEXT: s_and_b32 s3, s3, 0xff 491; GCN-NEXT: s_or_b32 s3, s3, s8 492; GCN-NEXT: s_and_b32 s3, s3, 0xffff 493; GCN-NEXT: s_or_b32 s3, s3, s7 494; GCN-NEXT: s_lshr_b32 s7, s2, 24 495; GCN-NEXT: s_cmp_lg_u32 s6, 11 496; GCN-NEXT: s_cselect_b32 s7, s7, 1 497; GCN-NEXT: s_lshl_b32 s7, s7, 8 498; GCN-NEXT: s_lshr_b32 s8, s2, 16 499; GCN-NEXT: s_cmp_lg_u32 s6, 10 500; GCN-NEXT: s_cselect_b32 s8, s8, 1 501; GCN-NEXT: s_and_b32 s8, s8, 0xff 502; GCN-NEXT: s_or_b32 s7, s8, s7 503; GCN-NEXT: s_lshl_b32 s7, s7, 16 504; GCN-NEXT: s_lshr_b32 s8, s2, 8 505; GCN-NEXT: s_cmp_lg_u32 s6, 9 506; GCN-NEXT: s_cselect_b32 s8, s8, 1 507; GCN-NEXT: s_lshl_b32 s8, s8, 8 508; GCN-NEXT: s_cmp_lg_u32 s6, 8 509; GCN-NEXT: s_cselect_b32 s2, s2, 1 510; GCN-NEXT: s_and_b32 s2, s2, 0xff 511; GCN-NEXT: s_or_b32 s2, s2, s8 512; GCN-NEXT: s_and_b32 s2, s2, 0xffff 513; GCN-NEXT: s_or_b32 s2, s2, s7 514; GCN-NEXT: s_lshr_b32 s7, s1, 24 515; GCN-NEXT: s_cmp_lg_u32 s6, 7 516; GCN-NEXT: s_cselect_b32 s7, s7, 1 517; GCN-NEXT: s_lshl_b32 s7, s7, 8 518; GCN-NEXT: s_lshr_b32 s8, s1, 16 519; GCN-NEXT: s_cmp_lg_u32 s6, 6 520; GCN-NEXT: s_cselect_b32 s8, s8, 1 521; GCN-NEXT: s_and_b32 s8, s8, 0xff 522; GCN-NEXT: s_or_b32 s7, s8, s7 523; GCN-NEXT: s_lshl_b32 s7, s7, 16 524; GCN-NEXT: s_lshr_b32 s8, s1, 8 525; GCN-NEXT: s_cmp_lg_u32 s6, 5 526; GCN-NEXT: s_cselect_b32 s8, s8, 1 527; GCN-NEXT: 
s_lshl_b32 s8, s8, 8 528; GCN-NEXT: s_cmp_lg_u32 s6, 4 529; GCN-NEXT: s_cselect_b32 s1, s1, 1 530; GCN-NEXT: s_and_b32 s1, s1, 0xff 531; GCN-NEXT: s_or_b32 s1, s1, s8 532; GCN-NEXT: s_and_b32 s1, s1, 0xffff 533; GCN-NEXT: s_or_b32 s1, s1, s7 534; GCN-NEXT: s_lshr_b32 s7, s0, 24 535; GCN-NEXT: s_cmp_lg_u32 s6, 3 536; GCN-NEXT: s_cselect_b32 s7, s7, 1 537; GCN-NEXT: s_lshl_b32 s7, s7, 8 538; GCN-NEXT: s_lshr_b32 s8, s0, 16 539; GCN-NEXT: s_cmp_lg_u32 s6, 2 540; GCN-NEXT: s_cselect_b32 s8, s8, 1 541; GCN-NEXT: s_and_b32 s8, s8, 0xff 542; GCN-NEXT: s_or_b32 s7, s8, s7 543; GCN-NEXT: s_lshl_b32 s7, s7, 16 544; GCN-NEXT: s_lshr_b32 s8, s0, 8 545; GCN-NEXT: s_cmp_lg_u32 s6, 1 546; GCN-NEXT: s_cselect_b32 s8, s8, 1 547; GCN-NEXT: s_lshl_b32 s8, s8, 8 548; GCN-NEXT: s_cmp_lg_u32 s6, 0 549; GCN-NEXT: s_cselect_b32 s0, s0, 1 550; GCN-NEXT: s_and_b32 s0, s0, 0xff 551; GCN-NEXT: s_or_b32 s0, s0, s8 552; GCN-NEXT: s_and_b32 s0, s0, 0xffff 553; GCN-NEXT: s_or_b32 s0, s0, s7 554; GCN-NEXT: v_mov_b32_e32 v4, s4 555; GCN-NEXT: v_mov_b32_e32 v0, s0 556; GCN-NEXT: v_mov_b32_e32 v1, s1 557; GCN-NEXT: v_mov_b32_e32 v2, s2 558; GCN-NEXT: v_mov_b32_e32 v3, s3 559; GCN-NEXT: v_mov_b32_e32 v5, s5 560; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 561; GCN-NEXT: s_endpgm 562entry: 563 %v = insertelement <16 x i8> %vec, i8 1, i32 %sel 564 store <16 x i8> %v, ptr addrspace(1) %out 565 ret void 566} 567 568define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %vec, i32 %sel) { 569; GCN-LABEL: double2_inselt: 570; GCN: ; %bb.0: ; %entry 571; GCN-NEXT: s_load_dword s6, s[4:5], 0x44 572; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 573; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 574; GCN-NEXT: s_waitcnt lgkmcnt(0) 575; GCN-NEXT: s_cmp_eq_u32 s6, 1 576; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s3 577; GCN-NEXT: s_cselect_b32 s2, 0, s2 578; GCN-NEXT: s_cmp_eq_u32 s6, 0 579; GCN-NEXT: s_cselect_b32 s1, 0x3ff00000, s1 580; GCN-NEXT: s_cselect_b32 s0, 0, s0 581; GCN-NEXT: 
v_mov_b32_e32 v4, s4 582; GCN-NEXT: v_mov_b32_e32 v0, s0 583; GCN-NEXT: v_mov_b32_e32 v1, s1 584; GCN-NEXT: v_mov_b32_e32 v2, s2 585; GCN-NEXT: v_mov_b32_e32 v3, s3 586; GCN-NEXT: v_mov_b32_e32 v5, s5 587; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 588; GCN-NEXT: s_endpgm 589entry: 590 %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel 591 store <2 x double> %v, ptr addrspace(1) %out 592 ret void 593} 594 595define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %vec, i32 %sel) { 596; GCN-LABEL: double5_inselt: 597; GCN: ; %bb.0: ; %entry 598; GCN-NEXT: s_load_dword s12, s[4:5], 0xa4 599; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x84 600; GCN-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x24 601; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64 602; GCN-NEXT: s_waitcnt lgkmcnt(0) 603; GCN-NEXT: s_cmp_eq_u32 s12, 4 604; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9 605; GCN-NEXT: s_cselect_b32 s8, 0, s8 606; GCN-NEXT: s_cmp_eq_u32 s12, 1 607; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s3 608; GCN-NEXT: s_cselect_b32 s2, 0, s2 609; GCN-NEXT: s_cmp_eq_u32 s12, 0 610; GCN-NEXT: s_cselect_b32 s13, 0x3ff00000, s1 611; GCN-NEXT: s_cselect_b32 s14, 0, s0 612; GCN-NEXT: s_cmp_eq_u32 s12, 3 613; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, s7 614; GCN-NEXT: s_cselect_b32 s1, 0, s6 615; GCN-NEXT: s_cmp_eq_u32 s12, 2 616; GCN-NEXT: s_cselect_b32 s5, 0x3ff00000, s5 617; GCN-NEXT: s_cselect_b32 s4, 0, s4 618; GCN-NEXT: v_mov_b32_e32 v3, s0 619; GCN-NEXT: s_add_u32 s0, s10, 16 620; GCN-NEXT: v_mov_b32_e32 v2, s1 621; GCN-NEXT: s_addc_u32 s1, s11, 0 622; GCN-NEXT: v_mov_b32_e32 v5, s1 623; GCN-NEXT: v_mov_b32_e32 v0, s4 624; GCN-NEXT: v_mov_b32_e32 v1, s5 625; GCN-NEXT: v_mov_b32_e32 v4, s0 626; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 627; GCN-NEXT: v_mov_b32_e32 v4, s10 628; GCN-NEXT: s_add_u32 s0, s10, 32 629; GCN-NEXT: v_mov_b32_e32 v0, s14 630; GCN-NEXT: v_mov_b32_e32 v1, s13 631; GCN-NEXT: v_mov_b32_e32 v2, s2 632; GCN-NEXT: v_mov_b32_e32 v3, s3 633; 
GCN-NEXT: v_mov_b32_e32 v5, s11 634; GCN-NEXT: s_addc_u32 s1, s11, 0 635; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 636; GCN-NEXT: s_nop 0 637; GCN-NEXT: v_mov_b32_e32 v3, s1 638; GCN-NEXT: v_mov_b32_e32 v0, s8 639; GCN-NEXT: v_mov_b32_e32 v1, s9 640; GCN-NEXT: v_mov_b32_e32 v2, s0 641; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 642; GCN-NEXT: s_endpgm 643entry: 644 %v = insertelement <5 x double> %vec, double 1.000000e+00, i32 %sel 645 store <5 x double> %v, ptr addrspace(1) %out 646 ret void 647} 648 649define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %vec, i32 %sel) { 650; GCN-LABEL: double8_inselt: 651; GCN: ; %bb.0: ; %entry 652; GCN-NEXT: s_load_dword s2, s[4:5], 0xa4 653; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 654; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 655; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 656; GCN-NEXT: s_waitcnt lgkmcnt(0) 657; GCN-NEXT: s_lshl_b32 s2, s2, 1 658; GCN-NEXT: v_mov_b32_e32 v0, s8 659; GCN-NEXT: v_mov_b32_e32 v1, s9 660; GCN-NEXT: v_mov_b32_e32 v2, s10 661; GCN-NEXT: v_mov_b32_e32 v3, s11 662; GCN-NEXT: v_mov_b32_e32 v4, s12 663; GCN-NEXT: v_mov_b32_e32 v5, s13 664; GCN-NEXT: v_mov_b32_e32 v6, s14 665; GCN-NEXT: v_mov_b32_e32 v7, s15 666; GCN-NEXT: v_mov_b32_e32 v8, s16 667; GCN-NEXT: v_mov_b32_e32 v9, s17 668; GCN-NEXT: v_mov_b32_e32 v10, s18 669; GCN-NEXT: v_mov_b32_e32 v11, s19 670; GCN-NEXT: v_mov_b32_e32 v12, s20 671; GCN-NEXT: v_mov_b32_e32 v13, s21 672; GCN-NEXT: v_mov_b32_e32 v14, s22 673; GCN-NEXT: v_mov_b32_e32 v15, s23 674; GCN-NEXT: s_mov_b32 m0, s2 675; GCN-NEXT: s_add_u32 s2, s0, 48 676; GCN-NEXT: v_movreld_b32_e32 v0, 0 677; GCN-NEXT: s_addc_u32 s3, s1, 0 678; GCN-NEXT: v_movreld_b32_e32 v1, v16 679; GCN-NEXT: v_mov_b32_e32 v17, s3 680; GCN-NEXT: v_mov_b32_e32 v16, s2 681; GCN-NEXT: s_add_u32 s2, s0, 32 682; GCN-NEXT: s_addc_u32 s3, s1, 0 683; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] 684; GCN-NEXT: s_nop 0 685; GCN-NEXT: v_mov_b32_e32 v13, s3 686; GCN-NEXT: 
v_mov_b32_e32 v12, s2 687; GCN-NEXT: s_add_u32 s2, s0, 16 688; GCN-NEXT: s_addc_u32 s3, s1, 0 689; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] 690; GCN-NEXT: s_nop 0 691; GCN-NEXT: v_mov_b32_e32 v9, s3 692; GCN-NEXT: v_mov_b32_e32 v8, s2 693; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 694; GCN-NEXT: s_nop 0 695; GCN-NEXT: v_mov_b32_e32 v5, s1 696; GCN-NEXT: v_mov_b32_e32 v4, s0 697; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 698; GCN-NEXT: s_endpgm 699entry: 700 %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel 701 store <8 x double> %v, ptr addrspace(1) %out 702 ret void 703} 704 705define amdgpu_kernel void @double7_inselt(ptr addrspace(1) %out, <7 x double> %vec, i32 %sel) { 706; GCN-LABEL: double7_inselt: 707; GCN: ; %bb.0: ; %entry 708; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x64 709; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 710; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x94 711; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x84 712; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000 713; GCN-NEXT: s_waitcnt lgkmcnt(0) 714; GCN-NEXT: v_mov_b32_e32 v0, s8 715; GCN-NEXT: v_mov_b32_e32 v1, s9 716; GCN-NEXT: v_mov_b32_e32 v2, s10 717; GCN-NEXT: v_mov_b32_e32 v8, s0 718; GCN-NEXT: s_load_dword s0, s[4:5], 0xa4 719; GCN-NEXT: v_mov_b32_e32 v3, s11 720; GCN-NEXT: v_mov_b32_e32 v4, s12 721; GCN-NEXT: v_mov_b32_e32 v5, s13 722; GCN-NEXT: v_mov_b32_e32 v6, s14 723; GCN-NEXT: s_waitcnt lgkmcnt(0) 724; GCN-NEXT: s_lshl_b32 s0, s0, 1 725; GCN-NEXT: v_mov_b32_e32 v7, s15 726; GCN-NEXT: v_mov_b32_e32 v9, s1 727; GCN-NEXT: v_mov_b32_e32 v10, s2 728; GCN-NEXT: v_mov_b32_e32 v11, s3 729; GCN-NEXT: v_mov_b32_e32 v12, s16 730; GCN-NEXT: v_mov_b32_e32 v13, s17 731; GCN-NEXT: s_mov_b32 m0, s0 732; GCN-NEXT: v_movreld_b32_e32 v0, 0 733; GCN-NEXT: s_add_u32 s0, s6, 16 734; GCN-NEXT: v_movreld_b32_e32 v1, v16 735; GCN-NEXT: s_addc_u32 s1, s7, 0 736; GCN-NEXT: v_mov_b32_e32 v15, s1 737; GCN-NEXT: v_mov_b32_e32 v14, s0 738; GCN-NEXT: flat_store_dwordx4 v[14:15], v[4:7] 
739; GCN-NEXT: s_add_u32 s0, s6, 48 740; GCN-NEXT: v_mov_b32_e32 v4, s6 741; GCN-NEXT: v_mov_b32_e32 v5, s7 742; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 743; GCN-NEXT: s_addc_u32 s1, s7, 0 744; GCN-NEXT: v_mov_b32_e32 v0, s0 745; GCN-NEXT: v_mov_b32_e32 v1, s1 746; GCN-NEXT: s_add_u32 s0, s6, 32 747; GCN-NEXT: flat_store_dwordx2 v[0:1], v[12:13] 748; GCN-NEXT: s_addc_u32 s1, s7, 0 749; GCN-NEXT: v_mov_b32_e32 v0, s0 750; GCN-NEXT: v_mov_b32_e32 v1, s1 751; GCN-NEXT: flat_store_dwordx4 v[0:1], v[8:11] 752; GCN-NEXT: s_endpgm 753entry: 754 %v = insertelement <7 x double> %vec, double 1.000000e+00, i32 %sel 755 store <7 x double> %v, ptr addrspace(1) %out 756 ret void 757} 758 759define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double> %vec, i32 %sel) { 760; GCN-LABEL: double16_inselt: 761; GCN: ; %bb.0: ; %entry 762; GCN-NEXT: s_load_dword s0, s[4:5], 0x124 763; GCN-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0xa4 764; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xe4 765; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 766; GCN-NEXT: s_waitcnt lgkmcnt(0) 767; GCN-NEXT: v_mov_b32_e32 v0, s36 768; GCN-NEXT: s_lshl_b32 s0, s0, 1 769; GCN-NEXT: s_mov_b32 m0, s0 770; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 771; GCN-NEXT: v_mov_b32_e32 v1, s37 772; GCN-NEXT: v_mov_b32_e32 v2, s38 773; GCN-NEXT: v_mov_b32_e32 v3, s39 774; GCN-NEXT: v_mov_b32_e32 v4, s40 775; GCN-NEXT: v_mov_b32_e32 v5, s41 776; GCN-NEXT: v_mov_b32_e32 v6, s42 777; GCN-NEXT: v_mov_b32_e32 v7, s43 778; GCN-NEXT: v_mov_b32_e32 v8, s44 779; GCN-NEXT: v_mov_b32_e32 v9, s45 780; GCN-NEXT: v_mov_b32_e32 v10, s46 781; GCN-NEXT: v_mov_b32_e32 v11, s47 782; GCN-NEXT: v_mov_b32_e32 v12, s48 783; GCN-NEXT: v_mov_b32_e32 v13, s49 784; GCN-NEXT: v_mov_b32_e32 v14, s50 785; GCN-NEXT: v_mov_b32_e32 v15, s51 786; GCN-NEXT: v_mov_b32_e32 v16, s8 787; GCN-NEXT: v_mov_b32_e32 v17, s9 788; GCN-NEXT: v_mov_b32_e32 v18, s10 789; GCN-NEXT: v_mov_b32_e32 v19, s11 790; GCN-NEXT: v_mov_b32_e32 v20, s12 791; 
GCN-NEXT: v_mov_b32_e32 v21, s13 792; GCN-NEXT: v_mov_b32_e32 v22, s14 793; GCN-NEXT: v_mov_b32_e32 v23, s15 794; GCN-NEXT: v_mov_b32_e32 v24, s16 795; GCN-NEXT: v_mov_b32_e32 v25, s17 796; GCN-NEXT: v_mov_b32_e32 v26, s18 797; GCN-NEXT: v_mov_b32_e32 v27, s19 798; GCN-NEXT: v_mov_b32_e32 v28, s20 799; GCN-NEXT: v_mov_b32_e32 v29, s21 800; GCN-NEXT: v_mov_b32_e32 v30, s22 801; GCN-NEXT: v_mov_b32_e32 v31, s23 802; GCN-NEXT: s_waitcnt lgkmcnt(0) 803; GCN-NEXT: s_add_u32 s2, s0, 0x70 804; GCN-NEXT: v_movreld_b32_e32 v0, 0 805; GCN-NEXT: s_addc_u32 s3, s1, 0 806; GCN-NEXT: v_movreld_b32_e32 v1, v32 807; GCN-NEXT: v_mov_b32_e32 v33, s3 808; GCN-NEXT: v_mov_b32_e32 v32, s2 809; GCN-NEXT: s_add_u32 s2, s0, 0x60 810; GCN-NEXT: s_addc_u32 s3, s1, 0 811; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31] 812; GCN-NEXT: s_nop 0 813; GCN-NEXT: v_mov_b32_e32 v29, s3 814; GCN-NEXT: v_mov_b32_e32 v28, s2 815; GCN-NEXT: s_add_u32 s2, s0, 0x50 816; GCN-NEXT: s_addc_u32 s3, s1, 0 817; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27] 818; GCN-NEXT: s_nop 0 819; GCN-NEXT: v_mov_b32_e32 v25, s3 820; GCN-NEXT: v_mov_b32_e32 v24, s2 821; GCN-NEXT: s_add_u32 s2, s0, 64 822; GCN-NEXT: s_addc_u32 s3, s1, 0 823; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23] 824; GCN-NEXT: s_nop 0 825; GCN-NEXT: v_mov_b32_e32 v21, s3 826; GCN-NEXT: v_mov_b32_e32 v20, s2 827; GCN-NEXT: s_add_u32 s2, s0, 48 828; GCN-NEXT: s_addc_u32 s3, s1, 0 829; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] 830; GCN-NEXT: s_nop 0 831; GCN-NEXT: v_mov_b32_e32 v17, s3 832; GCN-NEXT: v_mov_b32_e32 v16, s2 833; GCN-NEXT: s_add_u32 s2, s0, 32 834; GCN-NEXT: s_addc_u32 s3, s1, 0 835; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] 836; GCN-NEXT: s_nop 0 837; GCN-NEXT: v_mov_b32_e32 v13, s3 838; GCN-NEXT: v_mov_b32_e32 v12, s2 839; GCN-NEXT: s_add_u32 s2, s0, 16 840; GCN-NEXT: s_addc_u32 s3, s1, 0 841; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] 842; GCN-NEXT: s_nop 0 843; GCN-NEXT: v_mov_b32_e32 v9, s3 844; GCN-NEXT: 
v_mov_b32_e32 v8, s2 845; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 846; GCN-NEXT: s_nop 0 847; GCN-NEXT: v_mov_b32_e32 v5, s1 848; GCN-NEXT: v_mov_b32_e32 v4, s0 849; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 850; GCN-NEXT: s_endpgm 851entry: 852 %v = insertelement <16 x double> %vec, double 1.000000e+00, i32 %sel 853 store <16 x double> %v, ptr addrspace(1) %out 854 ret void 855} 856 857define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double> %vec, i32 %sel) { 858; GCN-LABEL: double15_inselt: 859; GCN: ; %bb.0: ; %entry 860; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0xa4 861; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x114 862; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x104 863; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0xe4 864; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000 865; GCN-NEXT: s_waitcnt lgkmcnt(0) 866; GCN-NEXT: v_mov_b32_e32 v0, s8 867; GCN-NEXT: v_mov_b32_e32 v1, s9 868; GCN-NEXT: v_mov_b32_e32 v24, s0 869; GCN-NEXT: s_load_dword s0, s[4:5], 0x124 870; GCN-NEXT: v_mov_b32_e32 v25, s1 871; GCN-NEXT: v_mov_b32_e32 v2, s10 872; GCN-NEXT: v_mov_b32_e32 v3, s11 873; GCN-NEXT: v_mov_b32_e32 v4, s12 874; GCN-NEXT: s_waitcnt lgkmcnt(0) 875; GCN-NEXT: s_lshl_b32 s0, s0, 1 876; GCN-NEXT: s_mov_b32 m0, s0 877; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 878; GCN-NEXT: v_mov_b32_e32 v5, s13 879; GCN-NEXT: v_mov_b32_e32 v6, s14 880; GCN-NEXT: v_mov_b32_e32 v7, s15 881; GCN-NEXT: v_mov_b32_e32 v8, s16 882; GCN-NEXT: v_mov_b32_e32 v9, s17 883; GCN-NEXT: v_mov_b32_e32 v10, s18 884; GCN-NEXT: v_mov_b32_e32 v11, s19 885; GCN-NEXT: v_mov_b32_e32 v12, s20 886; GCN-NEXT: v_mov_b32_e32 v13, s21 887; GCN-NEXT: v_mov_b32_e32 v14, s22 888; GCN-NEXT: v_mov_b32_e32 v15, s23 889; GCN-NEXT: v_mov_b32_e32 v16, s24 890; GCN-NEXT: v_mov_b32_e32 v17, s25 891; GCN-NEXT: v_mov_b32_e32 v18, s26 892; GCN-NEXT: v_mov_b32_e32 v19, s27 893; GCN-NEXT: v_mov_b32_e32 v20, s28 894; GCN-NEXT: v_mov_b32_e32 v21, s29 895; GCN-NEXT: v_mov_b32_e32 v22, s30 896; GCN-NEXT: 
v_mov_b32_e32 v23, s31 897; GCN-NEXT: v_mov_b32_e32 v26, s2 898; GCN-NEXT: v_mov_b32_e32 v27, s3 899; GCN-NEXT: v_mov_b32_e32 v28, s6 900; GCN-NEXT: v_mov_b32_e32 v29, s7 901; GCN-NEXT: v_movreld_b32_e32 v0, 0 902; GCN-NEXT: s_waitcnt lgkmcnt(0) 903; GCN-NEXT: s_add_u32 s2, s0, 0x50 904; GCN-NEXT: v_movreld_b32_e32 v1, v32 905; GCN-NEXT: s_addc_u32 s3, s1, 0 906; GCN-NEXT: v_mov_b32_e32 v31, s3 907; GCN-NEXT: v_mov_b32_e32 v30, s2 908; GCN-NEXT: s_add_u32 s2, s0, 64 909; GCN-NEXT: s_addc_u32 s3, s1, 0 910; GCN-NEXT: flat_store_dwordx4 v[30:31], v[20:23] 911; GCN-NEXT: s_nop 0 912; GCN-NEXT: v_mov_b32_e32 v21, s3 913; GCN-NEXT: v_mov_b32_e32 v20, s2 914; GCN-NEXT: s_add_u32 s2, s0, 48 915; GCN-NEXT: s_addc_u32 s3, s1, 0 916; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19] 917; GCN-NEXT: s_nop 0 918; GCN-NEXT: v_mov_b32_e32 v17, s3 919; GCN-NEXT: v_mov_b32_e32 v16, s2 920; GCN-NEXT: s_add_u32 s2, s0, 32 921; GCN-NEXT: s_addc_u32 s3, s1, 0 922; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15] 923; GCN-NEXT: s_nop 0 924; GCN-NEXT: v_mov_b32_e32 v13, s3 925; GCN-NEXT: v_mov_b32_e32 v12, s2 926; GCN-NEXT: s_add_u32 s2, s0, 16 927; GCN-NEXT: s_addc_u32 s3, s1, 0 928; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11] 929; GCN-NEXT: s_nop 0 930; GCN-NEXT: v_mov_b32_e32 v9, s3 931; GCN-NEXT: v_mov_b32_e32 v8, s2 932; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 933; GCN-NEXT: s_add_u32 s2, s0, 0x70 934; GCN-NEXT: v_mov_b32_e32 v5, s1 935; GCN-NEXT: v_mov_b32_e32 v4, s0 936; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 937; GCN-NEXT: s_addc_u32 s3, s1, 0 938; GCN-NEXT: v_mov_b32_e32 v0, s2 939; GCN-NEXT: v_mov_b32_e32 v1, s3 940; GCN-NEXT: s_add_u32 s0, s0, 0x60 941; GCN-NEXT: flat_store_dwordx2 v[0:1], v[28:29] 942; GCN-NEXT: s_addc_u32 s1, s1, 0 943; GCN-NEXT: v_mov_b32_e32 v0, s0 944; GCN-NEXT: v_mov_b32_e32 v1, s1 945; GCN-NEXT: flat_store_dwordx4 v[0:1], v[24:27] 946; GCN-NEXT: s_endpgm 947entry: 948 %v = insertelement <15 x double> %vec, double 1.000000e+00, i32 %sel 949 
store <15 x double> %v, ptr addrspace(1) %out 950 ret void 951} 952 953; FIXME: Fold out the redundant OR with zero (currently emitted as v_or_b32_e64 v1, s3, 0) 954define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 %sel) { 955; GCN-LABEL: bit4_inselt: 956; GCN: ; %bb.0: ; %entry 957; GCN-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 958; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 959; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 960; GCN-NEXT: s_mov_b32 s14, -1 961; GCN-NEXT: s_mov_b32 s15, 0xe80000 962; GCN-NEXT: s_add_u32 s12, s12, s11 963; GCN-NEXT: s_addc_u32 s13, s13, 0 964; GCN-NEXT: s_waitcnt lgkmcnt(0) 965; GCN-NEXT: s_bfe_u32 s6, s2, 0x10003 966; GCN-NEXT: v_mov_b32_e32 v0, s2 967; GCN-NEXT: s_bfe_u32 s5, s2, 0x20002 968; GCN-NEXT: buffer_store_byte v0, off, s[12:15], 0 969; GCN-NEXT: v_mov_b32_e32 v0, s6 970; GCN-NEXT: s_bfe_u32 s4, s2, 0x10001 971; GCN-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:3 972; GCN-NEXT: v_mov_b32_e32 v0, s5 973; GCN-NEXT: s_and_b32 s3, s3, 3 974; GCN-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2 975; GCN-NEXT: v_mov_b32_e32 v0, s4 976; GCN-NEXT: v_or_b32_e64 v1, s3, 0 977; GCN-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:1 978; GCN-NEXT: v_mov_b32_e32 v0, 1 979; GCN-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen 980; GCN-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 981; GCN-NEXT: buffer_load_ubyte v1, off, s[12:15], 0 offset:1 982; GCN-NEXT: buffer_load_ubyte v2, off, s[12:15], 0 offset:2 983; GCN-NEXT: buffer_load_ubyte v3, off, s[12:15], 0 offset:3 984; GCN-NEXT: s_waitcnt vmcnt(3) 985; GCN-NEXT: v_and_b32_e32 v0, 1, v0 986; GCN-NEXT: s_waitcnt vmcnt(2) 987; GCN-NEXT: v_and_b32_e32 v1, 1, v1 988; GCN-NEXT: s_waitcnt vmcnt(1) 989; GCN-NEXT: v_and_b32_e32 v2, 1, v2 990; GCN-NEXT: v_lshlrev_b16_e32 v1, 1, v1 991; GCN-NEXT: v_lshlrev_b16_e32 v2, 2, v2 992; GCN-NEXT: v_or_b32_e32 v0, v0, v1 993; GCN-NEXT: s_waitcnt vmcnt(0) 994; GCN-NEXT: v_lshlrev_b16_e32 v3, 3, v3 995; GCN-NEXT: v_or_b32_e32 v0, v0, v2 996; GCN-NEXT: 
v_or_b32_e32 v0, v0, v3 997; GCN-NEXT: v_and_b32_e32 v2, 15, v0 998; GCN-NEXT: v_mov_b32_e32 v0, s0 999; GCN-NEXT: v_mov_b32_e32 v1, s1 1000; GCN-NEXT: flat_store_byte v[0:1], v2 1001; GCN-NEXT: s_endpgm 1002entry: 1003 %v = insertelement <4 x i1> %vec, i1 1, i32 %sel 1004 store <4 x i1> %v, ptr addrspace(1) %out 1005 ret void 1006} 1007 1008define amdgpu_kernel void @bit128_inselt(ptr addrspace(1) %out, <128 x i1> %vec, i32 %sel) { 1009; GCN-LABEL: bit128_inselt: 1010; GCN: ; %bb.0: ; %entry 1011; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 1012; GCN-NEXT: s_load_dword s6, s[4:5], 0x44 1013; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 1014; GCN-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane 1015; GCN-NEXT: s_waitcnt lgkmcnt(0) 1016; GCN-NEXT: s_bfe_u32 s9, s0, 0xf0001 1017; GCN-NEXT: s_lshr_b32 s42, s1, 16 1018; GCN-NEXT: v_writelane_b32 v6, s4, 0 1019; GCN-NEXT: v_writelane_b32 v6, s5, 1 1020; GCN-NEXT: s_lshr_b32 s4, s0, 16 1021; GCN-NEXT: v_writelane_b32 v6, s4, 2 1022; GCN-NEXT: s_lshr_b32 s4, s0, 17 1023; GCN-NEXT: v_writelane_b32 v6, s4, 3 1024; GCN-NEXT: s_lshr_b32 s4, s0, 18 1025; GCN-NEXT: v_writelane_b32 v6, s4, 4 1026; GCN-NEXT: s_lshr_b32 s4, s0, 19 1027; GCN-NEXT: v_writelane_b32 v6, s4, 5 1028; GCN-NEXT: s_lshr_b32 s4, s0, 20 1029; GCN-NEXT: v_writelane_b32 v6, s4, 6 1030; GCN-NEXT: s_lshr_b32 s4, s0, 21 1031; GCN-NEXT: v_writelane_b32 v6, s4, 7 1032; GCN-NEXT: s_lshr_b32 s4, s0, 22 1033; GCN-NEXT: v_writelane_b32 v6, s4, 8 1034; GCN-NEXT: s_lshr_b32 s4, s0, 23 1035; GCN-NEXT: v_writelane_b32 v6, s4, 9 1036; GCN-NEXT: s_lshr_b32 s4, s0, 24 1037; GCN-NEXT: v_writelane_b32 v6, s4, 10 1038; GCN-NEXT: s_lshr_b32 s4, s0, 25 1039; GCN-NEXT: v_writelane_b32 v6, s4, 11 1040; GCN-NEXT: s_lshr_b32 s4, s0, 26 1041; GCN-NEXT: v_writelane_b32 v6, s4, 12 1042; GCN-NEXT: s_lshr_b32 s4, s0, 27 1043; GCN-NEXT: v_writelane_b32 v6, s4, 13 1044; GCN-NEXT: s_lshr_b32 s4, s0, 28 1045; GCN-NEXT: v_writelane_b32 v6, s4, 14 1046; GCN-NEXT: s_lshr_b32 s4, s0, 29 
1047; GCN-NEXT: v_writelane_b32 v6, s4, 15 1048; GCN-NEXT: s_lshr_b32 s4, s0, 30 1049; GCN-NEXT: v_writelane_b32 v6, s4, 16 1050; GCN-NEXT: s_lshr_b32 s4, s0, 31 1051; GCN-NEXT: v_writelane_b32 v6, s4, 17 1052; GCN-NEXT: v_writelane_b32 v6, s9, 18 1053; GCN-NEXT: s_bfe_u32 s9, s0, 0xe0002 1054; GCN-NEXT: v_writelane_b32 v6, s9, 19 1055; GCN-NEXT: s_bfe_u32 s9, s0, 0xd0003 1056; GCN-NEXT: v_writelane_b32 v6, s9, 20 1057; GCN-NEXT: s_bfe_u32 s9, s0, 0xc0004 1058; GCN-NEXT: v_writelane_b32 v6, s9, 21 1059; GCN-NEXT: s_bfe_u32 s9, s0, 0xb0005 1060; GCN-NEXT: v_writelane_b32 v6, s9, 22 1061; GCN-NEXT: s_bfe_u32 s9, s0, 0xa0006 1062; GCN-NEXT: v_writelane_b32 v6, s9, 23 1063; GCN-NEXT: s_bfe_u32 s9, s0, 0x90007 1064; GCN-NEXT: v_writelane_b32 v6, s9, 24 1065; GCN-NEXT: s_bfe_u32 s9, s0, 0x80008 1066; GCN-NEXT: v_writelane_b32 v6, s9, 25 1067; GCN-NEXT: s_bfe_u32 s9, s0, 0x70009 1068; GCN-NEXT: v_writelane_b32 v6, s9, 26 1069; GCN-NEXT: s_bfe_u32 s9, s0, 0x6000a 1070; GCN-NEXT: v_writelane_b32 v6, s9, 27 1071; GCN-NEXT: s_bfe_u32 s9, s0, 0x5000b 1072; GCN-NEXT: v_writelane_b32 v6, s9, 28 1073; GCN-NEXT: s_bfe_u32 s9, s0, 0x4000c 1074; GCN-NEXT: v_writelane_b32 v6, s9, 29 1075; GCN-NEXT: s_bfe_u32 s9, s0, 0x3000d 1076; GCN-NEXT: v_writelane_b32 v6, s9, 30 1077; GCN-NEXT: s_bfe_u32 s9, s0, 0x2000e 1078; GCN-NEXT: v_writelane_b32 v6, s9, 31 1079; GCN-NEXT: s_bfe_u32 s9, s0, 0x1000f 1080; GCN-NEXT: v_writelane_b32 v6, s9, 32 1081; GCN-NEXT: s_bfe_u32 s9, s1, 0xf0001 1082; GCN-NEXT: s_lshr_b32 s43, s1, 17 1083; GCN-NEXT: s_lshr_b32 s45, s1, 18 1084; GCN-NEXT: s_lshr_b32 s47, s1, 19 1085; GCN-NEXT: s_lshr_b32 s50, s1, 20 1086; GCN-NEXT: s_lshr_b32 s51, s1, 21 1087; GCN-NEXT: s_lshr_b32 s53, s1, 22 1088; GCN-NEXT: s_lshr_b32 s55, s1, 23 1089; GCN-NEXT: s_lshr_b32 s58, s1, 24 1090; GCN-NEXT: s_lshr_b32 s59, s1, 25 1091; GCN-NEXT: s_lshr_b32 s61, s1, 26 1092; GCN-NEXT: s_lshr_b32 s63, s1, 27 1093; GCN-NEXT: s_lshr_b32 s66, s1, 28 1094; GCN-NEXT: s_lshr_b32 s67, s1, 29 1095; 
GCN-NEXT: s_lshr_b32 s68, s1, 30 1096; GCN-NEXT: s_lshr_b32 s69, s1, 31 1097; GCN-NEXT: s_lshr_b32 s73, s2, 16 1098; GCN-NEXT: s_lshr_b32 s74, s2, 17 1099; GCN-NEXT: s_lshr_b32 s77, s2, 18 1100; GCN-NEXT: s_lshr_b32 s78, s2, 19 1101; GCN-NEXT: s_lshr_b32 s81, s2, 20 1102; GCN-NEXT: s_lshr_b32 s82, s2, 21 1103; GCN-NEXT: s_lshr_b32 s84, s2, 22 1104; GCN-NEXT: s_lshr_b32 s86, s2, 23 1105; GCN-NEXT: s_lshr_b32 s89, s2, 24 1106; GCN-NEXT: s_lshr_b32 s90, s2, 25 1107; GCN-NEXT: s_lshr_b32 s93, s2, 26 1108; GCN-NEXT: s_lshr_b32 s94, s2, 27 1109; GCN-NEXT: s_lshr_b32 vcc_hi, s2, 28 1110; GCN-NEXT: s_lshr_b32 s39, s2, 29 1111; GCN-NEXT: s_lshr_b32 s38, s2, 30 1112; GCN-NEXT: s_lshr_b32 s37, s2, 31 1113; GCN-NEXT: s_lshr_b32 s33, s3, 16 1114; GCN-NEXT: s_lshr_b32 s31, s3, 17 1115; GCN-NEXT: s_lshr_b32 s28, s3, 18 1116; GCN-NEXT: s_lshr_b32 s27, s3, 19 1117; GCN-NEXT: s_lshr_b32 s24, s3, 20 1118; GCN-NEXT: s_lshr_b32 s23, s3, 21 1119; GCN-NEXT: s_lshr_b32 s20, s3, 22 1120; GCN-NEXT: s_lshr_b32 s19, s3, 23 1121; GCN-NEXT: s_lshr_b32 s16, s3, 24 1122; GCN-NEXT: s_lshr_b32 s15, s3, 25 1123; GCN-NEXT: s_lshr_b32 s12, s3, 26 1124; GCN-NEXT: s_lshr_b32 s11, s3, 27 1125; GCN-NEXT: s_lshr_b32 s8, s3, 28 1126; GCN-NEXT: s_lshr_b32 s7, s3, 29 1127; GCN-NEXT: s_lshr_b32 s5, s3, 30 1128; GCN-NEXT: s_lshr_b32 s4, s3, 31 1129; GCN-NEXT: v_writelane_b32 v6, s9, 33 1130; GCN-NEXT: s_bfe_u32 s40, s1, 0xe0002 1131; GCN-NEXT: s_bfe_u32 s41, s1, 0xd0003 1132; GCN-NEXT: s_bfe_u32 s44, s1, 0xc0004 1133; GCN-NEXT: s_bfe_u32 s46, s1, 0xb0005 1134; GCN-NEXT: s_bfe_u32 s48, s1, 0xa0006 1135; GCN-NEXT: s_bfe_u32 s49, s1, 0x90007 1136; GCN-NEXT: s_bfe_u32 s52, s1, 0x80008 1137; GCN-NEXT: s_bfe_u32 s54, s1, 0x70009 1138; GCN-NEXT: s_bfe_u32 s56, s1, 0x6000a 1139; GCN-NEXT: s_bfe_u32 s57, s1, 0x5000b 1140; GCN-NEXT: s_bfe_u32 s60, s1, 0x4000c 1141; GCN-NEXT: s_bfe_u32 s62, s1, 0x3000d 1142; GCN-NEXT: s_bfe_u32 s64, s1, 0x2000e 1143; GCN-NEXT: s_bfe_u32 s65, s1, 0x1000f 1144; GCN-NEXT: s_bfe_u32 s70, s2, 
0xf0001 1145; GCN-NEXT: s_bfe_u32 s71, s2, 0xe0002 1146; GCN-NEXT: s_bfe_u32 s72, s2, 0xd0003 1147; GCN-NEXT: s_bfe_u32 s75, s2, 0xc0004 1148; GCN-NEXT: s_bfe_u32 s76, s2, 0xb0005 1149; GCN-NEXT: s_bfe_u32 s79, s2, 0xa0006 1150; GCN-NEXT: s_bfe_u32 s80, s2, 0x90007 1151; GCN-NEXT: s_bfe_u32 s83, s2, 0x80008 1152; GCN-NEXT: s_bfe_u32 s85, s2, 0x70009 1153; GCN-NEXT: s_bfe_u32 s87, s2, 0x6000a 1154; GCN-NEXT: s_bfe_u32 s88, s2, 0x5000b 1155; GCN-NEXT: s_bfe_u32 s91, s2, 0x4000c 1156; GCN-NEXT: s_bfe_u32 s92, s2, 0x3000d 1157; GCN-NEXT: s_bfe_u32 s95, s2, 0x2000e 1158; GCN-NEXT: s_bfe_u32 vcc_lo, s2, 0x1000f 1159; GCN-NEXT: s_bfe_u32 s36, s3, 0xf0001 1160; GCN-NEXT: s_bfe_u32 s35, s3, 0xe0002 1161; GCN-NEXT: s_bfe_u32 s34, s3, 0xd0003 1162; GCN-NEXT: s_bfe_u32 s30, s3, 0xc0004 1163; GCN-NEXT: s_bfe_u32 s29, s3, 0xb0005 1164; GCN-NEXT: s_bfe_u32 s26, s3, 0xa0006 1165; GCN-NEXT: s_bfe_u32 s25, s3, 0x90007 1166; GCN-NEXT: s_bfe_u32 s22, s3, 0x80008 1167; GCN-NEXT: s_bfe_u32 s21, s3, 0x70009 1168; GCN-NEXT: s_bfe_u32 s18, s3, 0x6000a 1169; GCN-NEXT: s_bfe_u32 s17, s3, 0x5000b 1170; GCN-NEXT: s_bfe_u32 s14, s3, 0x4000c 1171; GCN-NEXT: s_bfe_u32 s13, s3, 0x3000d 1172; GCN-NEXT: s_bfe_u32 s10, s3, 0x2000e 1173; GCN-NEXT: s_bfe_u32 s9, s3, 0x1000f 1174; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7f 1175; GCN-NEXT: s_cselect_b32 s4, s4, 1 1176; GCN-NEXT: s_lshl_b32 s4, s4, 3 1177; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7e 1178; GCN-NEXT: s_cselect_b32 s5, s5, 1 1179; GCN-NEXT: s_and_b32 s5, s5, 1 1180; GCN-NEXT: s_lshl_b32 s5, s5, 2 1181; GCN-NEXT: s_or_b32 s4, s4, s5 1182; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7d 1183; GCN-NEXT: s_cselect_b32 s5, s7, 1 1184; GCN-NEXT: s_lshl_b32 s5, s5, 1 1185; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7c 1186; GCN-NEXT: s_cselect_b32 s7, s8, 1 1187; GCN-NEXT: s_and_b32 s7, s7, 1 1188; GCN-NEXT: s_or_b32 s5, s7, s5 1189; GCN-NEXT: s_and_b32 s5, s5, 3 1190; GCN-NEXT: s_or_b32 s4, s5, s4 1191; GCN-NEXT: s_lshl_b32 s4, s4, 12 1192; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7b 1193; GCN-NEXT: 
s_cselect_b32 s5, s11, 1 1194; GCN-NEXT: s_lshl_b32 s5, s5, 3 1195; GCN-NEXT: s_cmpk_lg_i32 s6, 0x7a 1196; GCN-NEXT: s_cselect_b32 s7, s12, 1 1197; GCN-NEXT: s_and_b32 s7, s7, 1 1198; GCN-NEXT: s_lshl_b32 s7, s7, 2 1199; GCN-NEXT: s_or_b32 s5, s5, s7 1200; GCN-NEXT: s_cmpk_lg_i32 s6, 0x79 1201; GCN-NEXT: s_cselect_b32 s7, s15, 1 1202; GCN-NEXT: s_lshl_b32 s7, s7, 1 1203; GCN-NEXT: s_cmpk_lg_i32 s6, 0x78 1204; GCN-NEXT: s_cselect_b32 s8, s16, 1 1205; GCN-NEXT: s_and_b32 s8, s8, 1 1206; GCN-NEXT: s_or_b32 s7, s8, s7 1207; GCN-NEXT: s_and_b32 s7, s7, 3 1208; GCN-NEXT: s_or_b32 s5, s7, s5 1209; GCN-NEXT: s_and_b32 s5, s5, 15 1210; GCN-NEXT: s_lshl_b32 s5, s5, 8 1211; GCN-NEXT: s_or_b32 s4, s4, s5 1212; GCN-NEXT: s_cmpk_lg_i32 s6, 0x77 1213; GCN-NEXT: s_cselect_b32 s5, s19, 1 1214; GCN-NEXT: s_lshl_b32 s5, s5, 3 1215; GCN-NEXT: s_cmpk_lg_i32 s6, 0x76 1216; GCN-NEXT: s_cselect_b32 s7, s20, 1 1217; GCN-NEXT: s_and_b32 s7, s7, 1 1218; GCN-NEXT: s_lshl_b32 s7, s7, 2 1219; GCN-NEXT: s_or_b32 s5, s5, s7 1220; GCN-NEXT: s_cmpk_lg_i32 s6, 0x75 1221; GCN-NEXT: s_cselect_b32 s7, s23, 1 1222; GCN-NEXT: s_lshl_b32 s7, s7, 1 1223; GCN-NEXT: s_cmpk_lg_i32 s6, 0x74 1224; GCN-NEXT: s_cselect_b32 s8, s24, 1 1225; GCN-NEXT: s_and_b32 s8, s8, 1 1226; GCN-NEXT: s_or_b32 s7, s8, s7 1227; GCN-NEXT: s_and_b32 s7, s7, 3 1228; GCN-NEXT: s_or_b32 s5, s7, s5 1229; GCN-NEXT: s_lshl_b32 s5, s5, 4 1230; GCN-NEXT: s_cmpk_lg_i32 s6, 0x73 1231; GCN-NEXT: s_cselect_b32 s7, s27, 1 1232; GCN-NEXT: s_lshl_b32 s7, s7, 3 1233; GCN-NEXT: s_cmpk_lg_i32 s6, 0x72 1234; GCN-NEXT: s_cselect_b32 s8, s28, 1 1235; GCN-NEXT: s_and_b32 s8, s8, 1 1236; GCN-NEXT: s_lshl_b32 s8, s8, 2 1237; GCN-NEXT: s_or_b32 s7, s7, s8 1238; GCN-NEXT: s_cmpk_lg_i32 s6, 0x71 1239; GCN-NEXT: s_cselect_b32 s8, s31, 1 1240; GCN-NEXT: s_lshl_b32 s8, s8, 1 1241; GCN-NEXT: s_cmpk_lg_i32 s6, 0x70 1242; GCN-NEXT: s_cselect_b32 s11, s33, 1 1243; GCN-NEXT: s_and_b32 s11, s11, 1 1244; GCN-NEXT: s_or_b32 s8, s11, s8 1245; GCN-NEXT: s_and_b32 s8, s8, 
3 1246; GCN-NEXT: s_or_b32 s7, s8, s7 1247; GCN-NEXT: s_and_b32 s7, s7, 15 1248; GCN-NEXT: s_or_b32 s5, s7, s5 1249; GCN-NEXT: s_and_b32 s5, s5, 0xff 1250; GCN-NEXT: s_or_b32 s4, s5, s4 1251; GCN-NEXT: s_lshl_b32 s4, s4, 16 1252; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6f 1253; GCN-NEXT: s_cselect_b32 s5, s9, 1 1254; GCN-NEXT: s_lshl_b32 s5, s5, 3 1255; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6e 1256; GCN-NEXT: s_cselect_b32 s7, s10, 1 1257; GCN-NEXT: s_and_b32 s7, s7, 1 1258; GCN-NEXT: s_lshl_b32 s7, s7, 2 1259; GCN-NEXT: s_or_b32 s5, s5, s7 1260; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6d 1261; GCN-NEXT: s_cselect_b32 s7, s13, 1 1262; GCN-NEXT: s_lshl_b32 s7, s7, 1 1263; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6c 1264; GCN-NEXT: s_cselect_b32 s8, s14, 1 1265; GCN-NEXT: s_and_b32 s8, s8, 1 1266; GCN-NEXT: s_or_b32 s7, s8, s7 1267; GCN-NEXT: s_and_b32 s7, s7, 3 1268; GCN-NEXT: s_or_b32 s5, s7, s5 1269; GCN-NEXT: s_lshl_b32 s5, s5, 12 1270; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6b 1271; GCN-NEXT: s_cselect_b32 s7, s17, 1 1272; GCN-NEXT: s_lshl_b32 s7, s7, 3 1273; GCN-NEXT: s_cmpk_lg_i32 s6, 0x6a 1274; GCN-NEXT: s_cselect_b32 s8, s18, 1 1275; GCN-NEXT: s_and_b32 s8, s8, 1 1276; GCN-NEXT: s_lshl_b32 s8, s8, 2 1277; GCN-NEXT: s_or_b32 s7, s7, s8 1278; GCN-NEXT: s_cmpk_lg_i32 s6, 0x69 1279; GCN-NEXT: s_cselect_b32 s8, s21, 1 1280; GCN-NEXT: s_lshl_b32 s8, s8, 1 1281; GCN-NEXT: s_cmpk_lg_i32 s6, 0x68 1282; GCN-NEXT: s_cselect_b32 s9, s22, 1 1283; GCN-NEXT: s_and_b32 s9, s9, 1 1284; GCN-NEXT: s_or_b32 s8, s9, s8 1285; GCN-NEXT: s_and_b32 s8, s8, 3 1286; GCN-NEXT: s_or_b32 s7, s8, s7 1287; GCN-NEXT: s_and_b32 s7, s7, 15 1288; GCN-NEXT: s_lshl_b32 s7, s7, 8 1289; GCN-NEXT: s_or_b32 s5, s5, s7 1290; GCN-NEXT: s_cmpk_lg_i32 s6, 0x67 1291; GCN-NEXT: s_cselect_b32 s7, s25, 1 1292; GCN-NEXT: s_lshl_b32 s7, s7, 3 1293; GCN-NEXT: s_cmpk_lg_i32 s6, 0x66 1294; GCN-NEXT: s_cselect_b32 s8, s26, 1 1295; GCN-NEXT: s_and_b32 s8, s8, 1 1296; GCN-NEXT: s_lshl_b32 s8, s8, 2 1297; GCN-NEXT: s_or_b32 s7, s7, s8 1298; GCN-NEXT: 
s_cmpk_lg_i32 s6, 0x65 1299; GCN-NEXT: s_cselect_b32 s8, s29, 1 1300; GCN-NEXT: s_lshl_b32 s8, s8, 1 1301; GCN-NEXT: s_cmpk_lg_i32 s6, 0x64 1302; GCN-NEXT: s_cselect_b32 s9, s30, 1 1303; GCN-NEXT: s_and_b32 s9, s9, 1 1304; GCN-NEXT: s_or_b32 s8, s9, s8 1305; GCN-NEXT: s_and_b32 s8, s8, 3 1306; GCN-NEXT: s_or_b32 s7, s8, s7 1307; GCN-NEXT: s_lshl_b32 s7, s7, 4 1308; GCN-NEXT: s_cmpk_lg_i32 s6, 0x63 1309; GCN-NEXT: s_cselect_b32 s8, s34, 1 1310; GCN-NEXT: s_lshl_b32 s8, s8, 3 1311; GCN-NEXT: s_cmpk_lg_i32 s6, 0x62 1312; GCN-NEXT: s_cselect_b32 s9, s35, 1 1313; GCN-NEXT: s_and_b32 s9, s9, 1 1314; GCN-NEXT: s_lshl_b32 s9, s9, 2 1315; GCN-NEXT: s_or_b32 s8, s8, s9 1316; GCN-NEXT: s_cmpk_lg_i32 s6, 0x60 1317; GCN-NEXT: s_cselect_b32 s3, s3, 1 1318; GCN-NEXT: s_and_b32 s3, s3, 1 1319; GCN-NEXT: s_cmpk_lg_i32 s6, 0x61 1320; GCN-NEXT: s_cselect_b32 s9, s36, 1 1321; GCN-NEXT: s_lshl_b32 s9, s9, 1 1322; GCN-NEXT: s_or_b32 s3, s3, s9 1323; GCN-NEXT: s_and_b32 s3, s3, 3 1324; GCN-NEXT: s_or_b32 s3, s3, s8 1325; GCN-NEXT: s_and_b32 s3, s3, 15 1326; GCN-NEXT: s_or_b32 s3, s3, s7 1327; GCN-NEXT: s_and_b32 s3, s3, 0xff 1328; GCN-NEXT: s_or_b32 s3, s3, s5 1329; GCN-NEXT: s_and_b32 s3, s3, 0xffff 1330; GCN-NEXT: s_or_b32 s3, s3, s4 1331; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5f 1332; GCN-NEXT: s_cselect_b32 s4, s37, 1 1333; GCN-NEXT: s_lshl_b32 s4, s4, 3 1334; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5e 1335; GCN-NEXT: s_cselect_b32 s5, s38, 1 1336; GCN-NEXT: s_and_b32 s5, s5, 1 1337; GCN-NEXT: s_lshl_b32 s5, s5, 2 1338; GCN-NEXT: s_or_b32 s4, s4, s5 1339; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5d 1340; GCN-NEXT: s_cselect_b32 s5, s39, 1 1341; GCN-NEXT: s_lshl_b32 s5, s5, 1 1342; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5c 1343; GCN-NEXT: s_cselect_b32 s7, vcc_hi, 1 1344; GCN-NEXT: s_and_b32 s7, s7, 1 1345; GCN-NEXT: s_or_b32 s5, s7, s5 1346; GCN-NEXT: s_and_b32 s5, s5, 3 1347; GCN-NEXT: s_or_b32 s4, s5, s4 1348; GCN-NEXT: s_lshl_b32 s4, s4, 12 1349; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5b 1350; GCN-NEXT: s_cselect_b32 s5, s94, 
1 1351; GCN-NEXT: s_lshl_b32 s5, s5, 3 1352; GCN-NEXT: s_cmpk_lg_i32 s6, 0x5a 1353; GCN-NEXT: s_cselect_b32 s7, s93, 1 1354; GCN-NEXT: s_and_b32 s7, s7, 1 1355; GCN-NEXT: s_lshl_b32 s7, s7, 2 1356; GCN-NEXT: s_or_b32 s5, s5, s7 1357; GCN-NEXT: s_cmpk_lg_i32 s6, 0x59 1358; GCN-NEXT: s_cselect_b32 s7, s90, 1 1359; GCN-NEXT: s_lshl_b32 s7, s7, 1 1360; GCN-NEXT: s_cmpk_lg_i32 s6, 0x58 1361; GCN-NEXT: s_cselect_b32 s8, s89, 1 1362; GCN-NEXT: s_and_b32 s8, s8, 1 1363; GCN-NEXT: s_or_b32 s7, s8, s7 1364; GCN-NEXT: s_and_b32 s7, s7, 3 1365; GCN-NEXT: s_or_b32 s5, s7, s5 1366; GCN-NEXT: s_and_b32 s5, s5, 15 1367; GCN-NEXT: s_lshl_b32 s5, s5, 8 1368; GCN-NEXT: s_or_b32 s4, s4, s5 1369; GCN-NEXT: s_cmpk_lg_i32 s6, 0x57 1370; GCN-NEXT: s_cselect_b32 s5, s86, 1 1371; GCN-NEXT: s_lshl_b32 s5, s5, 3 1372; GCN-NEXT: s_cmpk_lg_i32 s6, 0x56 1373; GCN-NEXT: s_cselect_b32 s7, s84, 1 1374; GCN-NEXT: s_and_b32 s7, s7, 1 1375; GCN-NEXT: s_lshl_b32 s7, s7, 2 1376; GCN-NEXT: s_or_b32 s5, s5, s7 1377; GCN-NEXT: s_cmpk_lg_i32 s6, 0x55 1378; GCN-NEXT: s_cselect_b32 s7, s82, 1 1379; GCN-NEXT: s_lshl_b32 s7, s7, 1 1380; GCN-NEXT: s_cmpk_lg_i32 s6, 0x54 1381; GCN-NEXT: s_cselect_b32 s8, s81, 1 1382; GCN-NEXT: s_and_b32 s8, s8, 1 1383; GCN-NEXT: s_or_b32 s7, s8, s7 1384; GCN-NEXT: s_and_b32 s7, s7, 3 1385; GCN-NEXT: s_or_b32 s5, s7, s5 1386; GCN-NEXT: s_lshl_b32 s5, s5, 4 1387; GCN-NEXT: s_cmpk_lg_i32 s6, 0x53 1388; GCN-NEXT: s_cselect_b32 s7, s78, 1 1389; GCN-NEXT: s_lshl_b32 s7, s7, 3 1390; GCN-NEXT: s_cmpk_lg_i32 s6, 0x52 1391; GCN-NEXT: s_cselect_b32 s8, s77, 1 1392; GCN-NEXT: s_and_b32 s8, s8, 1 1393; GCN-NEXT: s_lshl_b32 s8, s8, 2 1394; GCN-NEXT: s_or_b32 s7, s7, s8 1395; GCN-NEXT: s_cmpk_lg_i32 s6, 0x51 1396; GCN-NEXT: s_cselect_b32 s8, s74, 1 1397; GCN-NEXT: s_lshl_b32 s8, s8, 1 1398; GCN-NEXT: s_cmpk_lg_i32 s6, 0x50 1399; GCN-NEXT: s_cselect_b32 s9, s73, 1 1400; GCN-NEXT: s_and_b32 s9, s9, 1 1401; GCN-NEXT: s_or_b32 s8, s9, s8 1402; GCN-NEXT: s_and_b32 s8, s8, 3 1403; GCN-NEXT: s_or_b32 
s7, s8, s7 1404; GCN-NEXT: s_and_b32 s7, s7, 15 1405; GCN-NEXT: s_or_b32 s5, s7, s5 1406; GCN-NEXT: s_and_b32 s5, s5, 0xff 1407; GCN-NEXT: s_or_b32 s4, s5, s4 1408; GCN-NEXT: s_lshl_b32 s4, s4, 16 1409; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4f 1410; GCN-NEXT: s_cselect_b32 s5, vcc_lo, 1 1411; GCN-NEXT: s_lshl_b32 s5, s5, 3 1412; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4e 1413; GCN-NEXT: s_cselect_b32 s7, s95, 1 1414; GCN-NEXT: s_and_b32 s7, s7, 1 1415; GCN-NEXT: s_lshl_b32 s7, s7, 2 1416; GCN-NEXT: s_or_b32 s5, s5, s7 1417; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4d 1418; GCN-NEXT: s_cselect_b32 s7, s92, 1 1419; GCN-NEXT: s_lshl_b32 s7, s7, 1 1420; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4c 1421; GCN-NEXT: s_cselect_b32 s8, s91, 1 1422; GCN-NEXT: s_and_b32 s8, s8, 1 1423; GCN-NEXT: s_or_b32 s7, s8, s7 1424; GCN-NEXT: s_and_b32 s7, s7, 3 1425; GCN-NEXT: s_or_b32 s5, s7, s5 1426; GCN-NEXT: s_lshl_b32 s5, s5, 12 1427; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4b 1428; GCN-NEXT: s_cselect_b32 s7, s88, 1 1429; GCN-NEXT: s_lshl_b32 s7, s7, 3 1430; GCN-NEXT: s_cmpk_lg_i32 s6, 0x4a 1431; GCN-NEXT: s_cselect_b32 s8, s87, 1 1432; GCN-NEXT: s_and_b32 s8, s8, 1 1433; GCN-NEXT: s_lshl_b32 s8, s8, 2 1434; GCN-NEXT: s_or_b32 s7, s7, s8 1435; GCN-NEXT: s_cmpk_lg_i32 s6, 0x49 1436; GCN-NEXT: s_cselect_b32 s8, s85, 1 1437; GCN-NEXT: s_lshl_b32 s8, s8, 1 1438; GCN-NEXT: s_cmpk_lg_i32 s6, 0x48 1439; GCN-NEXT: s_cselect_b32 s9, s83, 1 1440; GCN-NEXT: s_and_b32 s9, s9, 1 1441; GCN-NEXT: s_or_b32 s8, s9, s8 1442; GCN-NEXT: s_and_b32 s8, s8, 3 1443; GCN-NEXT: s_or_b32 s7, s8, s7 1444; GCN-NEXT: s_and_b32 s7, s7, 15 1445; GCN-NEXT: s_lshl_b32 s7, s7, 8 1446; GCN-NEXT: s_or_b32 s5, s5, s7 1447; GCN-NEXT: s_cmpk_lg_i32 s6, 0x47 1448; GCN-NEXT: s_cselect_b32 s7, s80, 1 1449; GCN-NEXT: s_lshl_b32 s7, s7, 3 1450; GCN-NEXT: s_cmpk_lg_i32 s6, 0x46 1451; GCN-NEXT: s_cselect_b32 s8, s79, 1 1452; GCN-NEXT: s_and_b32 s8, s8, 1 1453; GCN-NEXT: s_lshl_b32 s8, s8, 2 1454; GCN-NEXT: s_or_b32 s7, s7, s8 1455; GCN-NEXT: s_cmpk_lg_i32 s6, 0x45 1456; 
GCN-NEXT: s_cselect_b32 s8, s76, 1 1457; GCN-NEXT: s_lshl_b32 s8, s8, 1 1458; GCN-NEXT: s_cmpk_lg_i32 s6, 0x44 1459; GCN-NEXT: s_cselect_b32 s9, s75, 1 1460; GCN-NEXT: s_and_b32 s9, s9, 1 1461; GCN-NEXT: s_or_b32 s8, s9, s8 1462; GCN-NEXT: s_and_b32 s8, s8, 3 1463; GCN-NEXT: s_or_b32 s7, s8, s7 1464; GCN-NEXT: s_lshl_b32 s7, s7, 4 1465; GCN-NEXT: s_cmpk_lg_i32 s6, 0x43 1466; GCN-NEXT: s_cselect_b32 s8, s72, 1 1467; GCN-NEXT: s_lshl_b32 s8, s8, 3 1468; GCN-NEXT: s_cmpk_lg_i32 s6, 0x42 1469; GCN-NEXT: s_cselect_b32 s9, s71, 1 1470; GCN-NEXT: s_and_b32 s9, s9, 1 1471; GCN-NEXT: s_lshl_b32 s9, s9, 2 1472; GCN-NEXT: s_or_b32 s8, s8, s9 1473; GCN-NEXT: s_cmp_lg_u32 s6, 64 1474; GCN-NEXT: s_cselect_b32 s2, s2, 1 1475; GCN-NEXT: s_and_b32 s2, s2, 1 1476; GCN-NEXT: s_cmpk_lg_i32 s6, 0x41 1477; GCN-NEXT: s_cselect_b32 s9, s70, 1 1478; GCN-NEXT: s_lshl_b32 s9, s9, 1 1479; GCN-NEXT: s_or_b32 s2, s2, s9 1480; GCN-NEXT: s_and_b32 s2, s2, 3 1481; GCN-NEXT: s_or_b32 s2, s2, s8 1482; GCN-NEXT: s_and_b32 s2, s2, 15 1483; GCN-NEXT: s_or_b32 s2, s2, s7 1484; GCN-NEXT: s_and_b32 s2, s2, 0xff 1485; GCN-NEXT: s_or_b32 s2, s2, s5 1486; GCN-NEXT: s_and_b32 s2, s2, 0xffff 1487; GCN-NEXT: s_or_b32 s2, s2, s4 1488; GCN-NEXT: s_cmp_lg_u32 s6, 63 1489; GCN-NEXT: s_cselect_b32 s4, s69, 1 1490; GCN-NEXT: s_lshl_b32 s4, s4, 3 1491; GCN-NEXT: s_cmp_lg_u32 s6, 62 1492; GCN-NEXT: s_cselect_b32 s5, s68, 1 1493; GCN-NEXT: s_and_b32 s5, s5, 1 1494; GCN-NEXT: s_lshl_b32 s5, s5, 2 1495; GCN-NEXT: s_or_b32 s4, s4, s5 1496; GCN-NEXT: s_cmp_lg_u32 s6, 61 1497; GCN-NEXT: s_cselect_b32 s5, s67, 1 1498; GCN-NEXT: s_lshl_b32 s5, s5, 1 1499; GCN-NEXT: s_cmp_lg_u32 s6, 60 1500; GCN-NEXT: s_cselect_b32 s7, s66, 1 1501; GCN-NEXT: s_and_b32 s7, s7, 1 1502; GCN-NEXT: s_or_b32 s5, s7, s5 1503; GCN-NEXT: s_and_b32 s5, s5, 3 1504; GCN-NEXT: s_or_b32 s4, s5, s4 1505; GCN-NEXT: s_lshl_b32 s4, s4, 12 1506; GCN-NEXT: s_cmp_lg_u32 s6, 59 1507; GCN-NEXT: s_cselect_b32 s5, s63, 1 1508; GCN-NEXT: s_lshl_b32 s5, s5, 3 1509; 
GCN-NEXT: s_cmp_lg_u32 s6, 58 1510; GCN-NEXT: s_cselect_b32 s7, s61, 1 1511; GCN-NEXT: s_and_b32 s7, s7, 1 1512; GCN-NEXT: s_lshl_b32 s7, s7, 2 1513; GCN-NEXT: s_or_b32 s5, s5, s7 1514; GCN-NEXT: s_cmp_lg_u32 s6, 57 1515; GCN-NEXT: s_cselect_b32 s7, s59, 1 1516; GCN-NEXT: s_lshl_b32 s7, s7, 1 1517; GCN-NEXT: s_cmp_lg_u32 s6, 56 1518; GCN-NEXT: s_cselect_b32 s8, s58, 1 1519; GCN-NEXT: s_and_b32 s8, s8, 1 1520; GCN-NEXT: s_or_b32 s7, s8, s7 1521; GCN-NEXT: s_and_b32 s7, s7, 3 1522; GCN-NEXT: s_or_b32 s5, s7, s5 1523; GCN-NEXT: s_and_b32 s5, s5, 15 1524; GCN-NEXT: s_lshl_b32 s5, s5, 8 1525; GCN-NEXT: s_or_b32 s4, s4, s5 1526; GCN-NEXT: s_cmp_lg_u32 s6, 55 1527; GCN-NEXT: s_cselect_b32 s5, s55, 1 1528; GCN-NEXT: s_lshl_b32 s5, s5, 3 1529; GCN-NEXT: s_cmp_lg_u32 s6, 54 1530; GCN-NEXT: s_cselect_b32 s7, s53, 1 1531; GCN-NEXT: s_and_b32 s7, s7, 1 1532; GCN-NEXT: s_lshl_b32 s7, s7, 2 1533; GCN-NEXT: s_or_b32 s5, s5, s7 1534; GCN-NEXT: s_cmp_lg_u32 s6, 53 1535; GCN-NEXT: s_cselect_b32 s7, s51, 1 1536; GCN-NEXT: s_lshl_b32 s7, s7, 1 1537; GCN-NEXT: s_cmp_lg_u32 s6, 52 1538; GCN-NEXT: s_cselect_b32 s8, s50, 1 1539; GCN-NEXT: s_and_b32 s8, s8, 1 1540; GCN-NEXT: s_or_b32 s7, s8, s7 1541; GCN-NEXT: s_and_b32 s7, s7, 3 1542; GCN-NEXT: s_or_b32 s5, s7, s5 1543; GCN-NEXT: s_lshl_b32 s5, s5, 4 1544; GCN-NEXT: s_cmp_lg_u32 s6, 51 1545; GCN-NEXT: s_cselect_b32 s7, s47, 1 1546; GCN-NEXT: s_lshl_b32 s7, s7, 3 1547; GCN-NEXT: s_cmp_lg_u32 s6, 50 1548; GCN-NEXT: s_cselect_b32 s8, s45, 1 1549; GCN-NEXT: s_and_b32 s8, s8, 1 1550; GCN-NEXT: s_lshl_b32 s8, s8, 2 1551; GCN-NEXT: s_or_b32 s7, s7, s8 1552; GCN-NEXT: s_cmp_lg_u32 s6, 49 1553; GCN-NEXT: s_cselect_b32 s8, s43, 1 1554; GCN-NEXT: s_lshl_b32 s8, s8, 1 1555; GCN-NEXT: s_cmp_lg_u32 s6, 48 1556; GCN-NEXT: s_cselect_b32 s9, s42, 1 1557; GCN-NEXT: s_and_b32 s9, s9, 1 1558; GCN-NEXT: s_or_b32 s8, s9, s8 1559; GCN-NEXT: s_and_b32 s8, s8, 3 1560; GCN-NEXT: s_or_b32 s7, s8, s7 1561; GCN-NEXT: s_and_b32 s7, s7, 15 1562; GCN-NEXT: s_or_b32 s5, 
s7, s5 1563; GCN-NEXT: s_and_b32 s5, s5, 0xff 1564; GCN-NEXT: s_or_b32 s4, s5, s4 1565; GCN-NEXT: s_lshl_b32 s4, s4, 16 1566; GCN-NEXT: s_cmp_lg_u32 s6, 47 1567; GCN-NEXT: s_cselect_b32 s5, s65, 1 1568; GCN-NEXT: s_lshl_b32 s5, s5, 3 1569; GCN-NEXT: s_cmp_lg_u32 s6, 46 1570; GCN-NEXT: s_cselect_b32 s7, s64, 1 1571; GCN-NEXT: s_and_b32 s7, s7, 1 1572; GCN-NEXT: s_lshl_b32 s7, s7, 2 1573; GCN-NEXT: s_or_b32 s5, s5, s7 1574; GCN-NEXT: s_cmp_lg_u32 s6, 45 1575; GCN-NEXT: s_cselect_b32 s7, s62, 1 1576; GCN-NEXT: s_lshl_b32 s7, s7, 1 1577; GCN-NEXT: s_cmp_lg_u32 s6, 44 1578; GCN-NEXT: s_cselect_b32 s8, s60, 1 1579; GCN-NEXT: s_and_b32 s8, s8, 1 1580; GCN-NEXT: s_or_b32 s7, s8, s7 1581; GCN-NEXT: s_and_b32 s7, s7, 3 1582; GCN-NEXT: s_or_b32 s5, s7, s5 1583; GCN-NEXT: s_lshl_b32 s5, s5, 12 1584; GCN-NEXT: s_cmp_lg_u32 s6, 43 1585; GCN-NEXT: s_cselect_b32 s7, s57, 1 1586; GCN-NEXT: s_lshl_b32 s7, s7, 3 1587; GCN-NEXT: s_cmp_lg_u32 s6, 42 1588; GCN-NEXT: s_cselect_b32 s8, s56, 1 1589; GCN-NEXT: s_and_b32 s8, s8, 1 1590; GCN-NEXT: s_lshl_b32 s8, s8, 2 1591; GCN-NEXT: s_or_b32 s7, s7, s8 1592; GCN-NEXT: s_cmp_lg_u32 s6, 41 1593; GCN-NEXT: s_cselect_b32 s8, s54, 1 1594; GCN-NEXT: s_lshl_b32 s8, s8, 1 1595; GCN-NEXT: s_cmp_lg_u32 s6, 40 1596; GCN-NEXT: s_cselect_b32 s9, s52, 1 1597; GCN-NEXT: s_and_b32 s9, s9, 1 1598; GCN-NEXT: s_or_b32 s8, s9, s8 1599; GCN-NEXT: s_and_b32 s8, s8, 3 1600; GCN-NEXT: s_or_b32 s7, s8, s7 1601; GCN-NEXT: s_and_b32 s7, s7, 15 1602; GCN-NEXT: s_lshl_b32 s7, s7, 8 1603; GCN-NEXT: s_or_b32 s5, s5, s7 1604; GCN-NEXT: s_cmp_lg_u32 s6, 39 1605; GCN-NEXT: s_cselect_b32 s7, s49, 1 1606; GCN-NEXT: s_lshl_b32 s7, s7, 3 1607; GCN-NEXT: s_cmp_lg_u32 s6, 38 1608; GCN-NEXT: s_cselect_b32 s8, s48, 1 1609; GCN-NEXT: s_and_b32 s8, s8, 1 1610; GCN-NEXT: s_lshl_b32 s8, s8, 2 1611; GCN-NEXT: s_or_b32 s7, s7, s8 1612; GCN-NEXT: s_cmp_lg_u32 s6, 37 1613; GCN-NEXT: s_cselect_b32 s8, s46, 1 1614; GCN-NEXT: s_lshl_b32 s8, s8, 1 1615; GCN-NEXT: s_cmp_lg_u32 s6, 36 1616; 
GCN-NEXT: s_cselect_b32 s9, s44, 1 1617; GCN-NEXT: s_and_b32 s9, s9, 1 1618; GCN-NEXT: s_or_b32 s8, s9, s8 1619; GCN-NEXT: s_and_b32 s8, s8, 3 1620; GCN-NEXT: s_or_b32 s7, s8, s7 1621; GCN-NEXT: s_lshl_b32 s7, s7, 4 1622; GCN-NEXT: s_cmp_lg_u32 s6, 35 1623; GCN-NEXT: s_cselect_b32 s8, s41, 1 1624; GCN-NEXT: s_lshl_b32 s8, s8, 3 1625; GCN-NEXT: s_cmp_lg_u32 s6, 34 1626; GCN-NEXT: s_cselect_b32 s9, s40, 1 1627; GCN-NEXT: s_and_b32 s9, s9, 1 1628; GCN-NEXT: s_lshl_b32 s9, s9, 2 1629; GCN-NEXT: s_or_b32 s8, s8, s9 1630; GCN-NEXT: s_cmp_lg_u32 s6, 32 1631; GCN-NEXT: s_cselect_b32 s1, s1, 1 1632; GCN-NEXT: s_and_b32 s1, s1, 1 1633; GCN-NEXT: s_cmp_lg_u32 s6, 33 1634; GCN-NEXT: v_readlane_b32 s9, v6, 33 1635; GCN-NEXT: s_cselect_b32 s9, s9, 1 1636; GCN-NEXT: s_lshl_b32 s9, s9, 1 1637; GCN-NEXT: s_or_b32 s1, s1, s9 1638; GCN-NEXT: s_and_b32 s1, s1, 3 1639; GCN-NEXT: s_or_b32 s1, s1, s8 1640; GCN-NEXT: s_and_b32 s1, s1, 15 1641; GCN-NEXT: s_or_b32 s1, s1, s7 1642; GCN-NEXT: s_and_b32 s1, s1, 0xff 1643; GCN-NEXT: s_or_b32 s1, s1, s5 1644; GCN-NEXT: s_and_b32 s1, s1, 0xffff 1645; GCN-NEXT: s_or_b32 s1, s1, s4 1646; GCN-NEXT: s_cmp_lg_u32 s6, 31 1647; GCN-NEXT: v_readlane_b32 s4, v6, 17 1648; GCN-NEXT: s_cselect_b32 s4, s4, 1 1649; GCN-NEXT: s_lshl_b32 s4, s4, 3 1650; GCN-NEXT: s_cmp_lg_u32 s6, 30 1651; GCN-NEXT: v_readlane_b32 s5, v6, 16 1652; GCN-NEXT: s_cselect_b32 s5, s5, 1 1653; GCN-NEXT: s_and_b32 s5, s5, 1 1654; GCN-NEXT: s_lshl_b32 s5, s5, 2 1655; GCN-NEXT: s_or_b32 s4, s4, s5 1656; GCN-NEXT: s_cmp_lg_u32 s6, 29 1657; GCN-NEXT: v_readlane_b32 s5, v6, 15 1658; GCN-NEXT: s_cselect_b32 s5, s5, 1 1659; GCN-NEXT: s_lshl_b32 s5, s5, 1 1660; GCN-NEXT: s_cmp_lg_u32 s6, 28 1661; GCN-NEXT: v_readlane_b32 s7, v6, 14 1662; GCN-NEXT: s_cselect_b32 s7, s7, 1 1663; GCN-NEXT: s_and_b32 s7, s7, 1 1664; GCN-NEXT: s_or_b32 s5, s7, s5 1665; GCN-NEXT: s_and_b32 s5, s5, 3 1666; GCN-NEXT: s_or_b32 s4, s5, s4 1667; GCN-NEXT: s_lshl_b32 s4, s4, 12 1668; GCN-NEXT: s_cmp_lg_u32 s6, 27 1669; 
GCN-NEXT: v_readlane_b32 s5, v6, 13 1670; GCN-NEXT: s_cselect_b32 s5, s5, 1 1671; GCN-NEXT: s_lshl_b32 s5, s5, 3 1672; GCN-NEXT: s_cmp_lg_u32 s6, 26 1673; GCN-NEXT: v_readlane_b32 s7, v6, 12 1674; GCN-NEXT: s_cselect_b32 s7, s7, 1 1675; GCN-NEXT: s_and_b32 s7, s7, 1 1676; GCN-NEXT: s_lshl_b32 s7, s7, 2 1677; GCN-NEXT: s_or_b32 s5, s5, s7 1678; GCN-NEXT: s_cmp_lg_u32 s6, 25 1679; GCN-NEXT: v_readlane_b32 s7, v6, 11 1680; GCN-NEXT: s_cselect_b32 s7, s7, 1 1681; GCN-NEXT: s_lshl_b32 s7, s7, 1 1682; GCN-NEXT: s_cmp_lg_u32 s6, 24 1683; GCN-NEXT: v_readlane_b32 s8, v6, 10 1684; GCN-NEXT: s_cselect_b32 s8, s8, 1 1685; GCN-NEXT: s_and_b32 s8, s8, 1 1686; GCN-NEXT: s_or_b32 s7, s8, s7 1687; GCN-NEXT: s_and_b32 s7, s7, 3 1688; GCN-NEXT: s_or_b32 s5, s7, s5 1689; GCN-NEXT: s_and_b32 s5, s5, 15 1690; GCN-NEXT: s_lshl_b32 s5, s5, 8 1691; GCN-NEXT: s_or_b32 s4, s4, s5 1692; GCN-NEXT: s_cmp_lg_u32 s6, 23 1693; GCN-NEXT: v_readlane_b32 s5, v6, 9 1694; GCN-NEXT: s_cselect_b32 s5, s5, 1 1695; GCN-NEXT: s_lshl_b32 s5, s5, 3 1696; GCN-NEXT: s_cmp_lg_u32 s6, 22 1697; GCN-NEXT: v_readlane_b32 s7, v6, 8 1698; GCN-NEXT: s_cselect_b32 s7, s7, 1 1699; GCN-NEXT: s_and_b32 s7, s7, 1 1700; GCN-NEXT: s_lshl_b32 s7, s7, 2 1701; GCN-NEXT: s_or_b32 s5, s5, s7 1702; GCN-NEXT: s_cmp_lg_u32 s6, 21 1703; GCN-NEXT: v_readlane_b32 s7, v6, 7 1704; GCN-NEXT: s_cselect_b32 s7, s7, 1 1705; GCN-NEXT: s_lshl_b32 s7, s7, 1 1706; GCN-NEXT: s_cmp_lg_u32 s6, 20 1707; GCN-NEXT: v_readlane_b32 s8, v6, 6 1708; GCN-NEXT: s_cselect_b32 s8, s8, 1 1709; GCN-NEXT: s_and_b32 s8, s8, 1 1710; GCN-NEXT: s_or_b32 s7, s8, s7 1711; GCN-NEXT: s_and_b32 s7, s7, 3 1712; GCN-NEXT: s_or_b32 s5, s7, s5 1713; GCN-NEXT: s_lshl_b32 s5, s5, 4 1714; GCN-NEXT: s_cmp_lg_u32 s6, 19 1715; GCN-NEXT: v_readlane_b32 s7, v6, 5 1716; GCN-NEXT: s_cselect_b32 s7, s7, 1 1717; GCN-NEXT: s_lshl_b32 s7, s7, 3 1718; GCN-NEXT: s_cmp_lg_u32 s6, 18 1719; GCN-NEXT: v_readlane_b32 s8, v6, 4 1720; GCN-NEXT: s_cselect_b32 s8, s8, 1 1721; GCN-NEXT: s_and_b32 s8, 
s8, 1 1722; GCN-NEXT: s_lshl_b32 s8, s8, 2 1723; GCN-NEXT: s_or_b32 s7, s7, s8 1724; GCN-NEXT: s_cmp_lg_u32 s6, 17 1725; GCN-NEXT: v_readlane_b32 s8, v6, 3 1726; GCN-NEXT: s_cselect_b32 s8, s8, 1 1727; GCN-NEXT: s_lshl_b32 s8, s8, 1 1728; GCN-NEXT: s_cmp_lg_u32 s6, 16 1729; GCN-NEXT: v_readlane_b32 s9, v6, 2 1730; GCN-NEXT: s_cselect_b32 s9, s9, 1 1731; GCN-NEXT: s_and_b32 s9, s9, 1 1732; GCN-NEXT: s_or_b32 s8, s9, s8 1733; GCN-NEXT: s_and_b32 s8, s8, 3 1734; GCN-NEXT: s_or_b32 s7, s8, s7 1735; GCN-NEXT: s_and_b32 s7, s7, 15 1736; GCN-NEXT: s_or_b32 s5, s7, s5 1737; GCN-NEXT: s_and_b32 s5, s5, 0xff 1738; GCN-NEXT: s_or_b32 s4, s5, s4 1739; GCN-NEXT: s_lshl_b32 s4, s4, 16 1740; GCN-NEXT: s_cmp_lg_u32 s6, 15 1741; GCN-NEXT: v_readlane_b32 s5, v6, 32 1742; GCN-NEXT: s_cselect_b32 s5, s5, 1 1743; GCN-NEXT: s_lshl_b32 s5, s5, 3 1744; GCN-NEXT: s_cmp_lg_u32 s6, 14 1745; GCN-NEXT: v_readlane_b32 s7, v6, 31 1746; GCN-NEXT: s_cselect_b32 s7, s7, 1 1747; GCN-NEXT: s_and_b32 s7, s7, 1 1748; GCN-NEXT: s_lshl_b32 s7, s7, 2 1749; GCN-NEXT: s_or_b32 s5, s5, s7 1750; GCN-NEXT: s_cmp_lg_u32 s6, 13 1751; GCN-NEXT: v_readlane_b32 s7, v6, 30 1752; GCN-NEXT: s_cselect_b32 s7, s7, 1 1753; GCN-NEXT: s_lshl_b32 s7, s7, 1 1754; GCN-NEXT: s_cmp_lg_u32 s6, 12 1755; GCN-NEXT: v_readlane_b32 s8, v6, 29 1756; GCN-NEXT: s_cselect_b32 s8, s8, 1 1757; GCN-NEXT: s_and_b32 s8, s8, 1 1758; GCN-NEXT: s_or_b32 s7, s8, s7 1759; GCN-NEXT: s_and_b32 s7, s7, 3 1760; GCN-NEXT: s_or_b32 s5, s7, s5 1761; GCN-NEXT: s_lshl_b32 s5, s5, 12 1762; GCN-NEXT: s_cmp_lg_u32 s6, 11 1763; GCN-NEXT: v_readlane_b32 s7, v6, 28 1764; GCN-NEXT: s_cselect_b32 s7, s7, 1 1765; GCN-NEXT: s_lshl_b32 s7, s7, 3 1766; GCN-NEXT: s_cmp_lg_u32 s6, 10 1767; GCN-NEXT: v_readlane_b32 s8, v6, 27 1768; GCN-NEXT: s_cselect_b32 s8, s8, 1 1769; GCN-NEXT: s_and_b32 s8, s8, 1 1770; GCN-NEXT: s_lshl_b32 s8, s8, 2 1771; GCN-NEXT: s_or_b32 s7, s7, s8 1772; GCN-NEXT: s_cmp_lg_u32 s6, 9 1773; GCN-NEXT: v_readlane_b32 s8, v6, 26 1774; GCN-NEXT: 
s_cselect_b32 s8, s8, 1 1775; GCN-NEXT: s_lshl_b32 s8, s8, 1 1776; GCN-NEXT: s_cmp_lg_u32 s6, 8 1777; GCN-NEXT: v_readlane_b32 s9, v6, 25 1778; GCN-NEXT: s_cselect_b32 s9, s9, 1 1779; GCN-NEXT: s_and_b32 s9, s9, 1 1780; GCN-NEXT: s_or_b32 s8, s9, s8 1781; GCN-NEXT: s_and_b32 s8, s8, 3 1782; GCN-NEXT: s_or_b32 s7, s8, s7 1783; GCN-NEXT: s_and_b32 s7, s7, 15 1784; GCN-NEXT: s_lshl_b32 s7, s7, 8 1785; GCN-NEXT: s_or_b32 s5, s5, s7 1786; GCN-NEXT: s_cmp_lg_u32 s6, 7 1787; GCN-NEXT: v_readlane_b32 s7, v6, 24 1788; GCN-NEXT: s_cselect_b32 s7, s7, 1 1789; GCN-NEXT: s_lshl_b32 s7, s7, 3 1790; GCN-NEXT: s_cmp_lg_u32 s6, 6 1791; GCN-NEXT: v_readlane_b32 s8, v6, 23 1792; GCN-NEXT: s_cselect_b32 s8, s8, 1 1793; GCN-NEXT: s_and_b32 s8, s8, 1 1794; GCN-NEXT: s_lshl_b32 s8, s8, 2 1795; GCN-NEXT: s_or_b32 s7, s7, s8 1796; GCN-NEXT: s_cmp_lg_u32 s6, 5 1797; GCN-NEXT: v_readlane_b32 s8, v6, 22 1798; GCN-NEXT: s_cselect_b32 s8, s8, 1 1799; GCN-NEXT: s_lshl_b32 s8, s8, 1 1800; GCN-NEXT: s_cmp_lg_u32 s6, 4 1801; GCN-NEXT: v_readlane_b32 s9, v6, 21 1802; GCN-NEXT: s_cselect_b32 s9, s9, 1 1803; GCN-NEXT: s_and_b32 s9, s9, 1 1804; GCN-NEXT: s_or_b32 s8, s9, s8 1805; GCN-NEXT: s_and_b32 s8, s8, 3 1806; GCN-NEXT: s_or_b32 s7, s8, s7 1807; GCN-NEXT: s_lshl_b32 s7, s7, 4 1808; GCN-NEXT: s_cmp_lg_u32 s6, 3 1809; GCN-NEXT: v_readlane_b32 s8, v6, 20 1810; GCN-NEXT: s_cselect_b32 s8, s8, 1 1811; GCN-NEXT: s_lshl_b32 s8, s8, 3 1812; GCN-NEXT: s_cmp_lg_u32 s6, 2 1813; GCN-NEXT: v_readlane_b32 s9, v6, 19 1814; GCN-NEXT: s_cselect_b32 s9, s9, 1 1815; GCN-NEXT: s_and_b32 s9, s9, 1 1816; GCN-NEXT: s_lshl_b32 s9, s9, 2 1817; GCN-NEXT: s_or_b32 s8, s8, s9 1818; GCN-NEXT: s_cmp_lg_u32 s6, 0 1819; GCN-NEXT: s_cselect_b32 s0, s0, 1 1820; GCN-NEXT: s_and_b32 s0, s0, 1 1821; GCN-NEXT: s_cmp_lg_u32 s6, 1 1822; GCN-NEXT: v_readlane_b32 s6, v6, 18 1823; GCN-NEXT: s_cselect_b32 s6, s6, 1 1824; GCN-NEXT: s_lshl_b32 s6, s6, 1 1825; GCN-NEXT: s_or_b32 s0, s0, s6 1826; GCN-NEXT: s_and_b32 s0, s0, 3 1827; GCN-NEXT: 
s_or_b32 s0, s0, s8 1828; GCN-NEXT: s_and_b32 s0, s0, 15 1829; GCN-NEXT: s_or_b32 s0, s0, s7 1830; GCN-NEXT: s_and_b32 s0, s0, 0xff 1831; GCN-NEXT: s_or_b32 s0, s0, s5 1832; GCN-NEXT: s_and_b32 s0, s0, 0xffff 1833; GCN-NEXT: s_or_b32 s0, s0, s4 1834; GCN-NEXT: v_mov_b32_e32 v0, s0 1835; GCN-NEXT: v_mov_b32_e32 v1, s1 1836; GCN-NEXT: v_readlane_b32 s0, v6, 0 1837; GCN-NEXT: v_readlane_b32 s1, v6, 1 1838; GCN-NEXT: v_mov_b32_e32 v5, s1 1839; GCN-NEXT: v_mov_b32_e32 v2, s2 1840; GCN-NEXT: v_mov_b32_e32 v3, s3 1841; GCN-NEXT: v_mov_b32_e32 v4, s0 1842; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1843; GCN-NEXT: s_endpgm 1844entry: 1845 %v = insertelement <128 x i1> %vec, i1 1, i32 %sel 1846 store <128 x i1> %v, ptr addrspace(1) %out 1847 ret void 1848} 1849
; float32_inselt_vec: inserts the constant 1.0 into a <32 x float> function
; argument at a run-time index (%sel) inside an amdgpu_ps function and returns
; the updated vector.
; The expected code is a compare/select chain: one v_cmp_ne_u32 of the index
; (v32) against each constant 0..31, followed by one v_cndmask_b32 per element
; (v0..v31) that keeps the original value where the compare holds and writes 1.0
; for the one element whose position equals the index.
; The GCN lines are autogenerated (see the NOTE on the first line of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1850define amdgpu_ps <32 x float> @float32_inselt_vec(<32 x float> %vec, i32 %sel) { 1851; GCN-LABEL: float32_inselt_vec: 1852; GCN: ; %bb.0: ; %entry 1853; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v32 1854; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 2, v32 1855; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 3, v32 1856; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 4, v32 1857; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 5, v32 1858; GCN-NEXT: v_cmp_ne_u32_e64 s[8:9], 6, v32 1859; GCN-NEXT: v_cmp_ne_u32_e64 s[10:11], 7, v32 1860; GCN-NEXT: v_cmp_ne_u32_e64 s[12:13], 8, v32 1861; GCN-NEXT: v_cmp_ne_u32_e64 s[14:15], 9, v32 1862; GCN-NEXT: v_cmp_ne_u32_e64 s[16:17], 10, v32 1863; GCN-NEXT: v_cmp_ne_u32_e64 s[18:19], 11, v32 1864; GCN-NEXT: v_cmp_ne_u32_e64 s[20:21], 12, v32 1865; GCN-NEXT: v_cmp_ne_u32_e64 s[22:23], 13, v32 1866; GCN-NEXT: v_cmp_ne_u32_e64 s[24:25], 14, v32 1867; GCN-NEXT: v_cmp_ne_u32_e64 s[26:27], 15, v32 1868; GCN-NEXT: v_cmp_ne_u32_e64 s[28:29], 16, v32 1869; GCN-NEXT: v_cmp_ne_u32_e64 s[30:31], 17, v32 1870; GCN-NEXT: v_cmp_ne_u32_e64 s[34:35], 18, v32 1871; GCN-NEXT: v_cmp_ne_u32_e64 s[36:37], 19, v32 1872; GCN-NEXT: v_cmp_ne_u32_e64 s[38:39], 20, v32 1873; GCN-NEXT: v_cmp_ne_u32_e64 s[40:41], 21, v32 1874; GCN-NEXT: v_cmp_ne_u32_e64 
s[42:43], 22, v32 1875; GCN-NEXT: v_cmp_ne_u32_e64 s[44:45], 23, v32 1876; GCN-NEXT: v_cmp_ne_u32_e64 s[46:47], 24, v32 1877; GCN-NEXT: v_cmp_ne_u32_e64 s[48:49], 25, v32 1878; GCN-NEXT: v_cmp_ne_u32_e64 s[50:51], 26, v32 1879; GCN-NEXT: v_cmp_ne_u32_e64 s[52:53], 27, v32 1880; GCN-NEXT: v_cmp_ne_u32_e64 s[54:55], 28, v32 1881; GCN-NEXT: v_cmp_ne_u32_e64 s[56:57], 29, v32 1882; GCN-NEXT: v_cmp_ne_u32_e64 s[58:59], 30, v32 1883; GCN-NEXT: v_cmp_ne_u32_e64 s[60:61], 31, v32 1884; GCN-NEXT: v_cmp_ne_u32_e64 s[62:63], 0, v32 1885; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, v0, s[62:63] 1886; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc 1887; GCN-NEXT: v_cndmask_b32_e64 v2, 1.0, v2, s[0:1] 1888; GCN-NEXT: v_cndmask_b32_e64 v3, 1.0, v3, s[2:3] 1889; GCN-NEXT: v_cndmask_b32_e64 v4, 1.0, v4, s[4:5] 1890; GCN-NEXT: v_cndmask_b32_e64 v5, 1.0, v5, s[6:7] 1891; GCN-NEXT: v_cndmask_b32_e64 v6, 1.0, v6, s[8:9] 1892; GCN-NEXT: v_cndmask_b32_e64 v7, 1.0, v7, s[10:11] 1893; GCN-NEXT: v_cndmask_b32_e64 v8, 1.0, v8, s[12:13] 1894; GCN-NEXT: v_cndmask_b32_e64 v9, 1.0, v9, s[14:15] 1895; GCN-NEXT: v_cndmask_b32_e64 v10, 1.0, v10, s[16:17] 1896; GCN-NEXT: v_cndmask_b32_e64 v11, 1.0, v11, s[18:19] 1897; GCN-NEXT: v_cndmask_b32_e64 v12, 1.0, v12, s[20:21] 1898; GCN-NEXT: v_cndmask_b32_e64 v13, 1.0, v13, s[22:23] 1899; GCN-NEXT: v_cndmask_b32_e64 v14, 1.0, v14, s[24:25] 1900; GCN-NEXT: v_cndmask_b32_e64 v15, 1.0, v15, s[26:27] 1901; GCN-NEXT: v_cndmask_b32_e64 v16, 1.0, v16, s[28:29] 1902; GCN-NEXT: v_cndmask_b32_e64 v17, 1.0, v17, s[30:31] 1903; GCN-NEXT: v_cndmask_b32_e64 v18, 1.0, v18, s[34:35] 1904; GCN-NEXT: v_cndmask_b32_e64 v19, 1.0, v19, s[36:37] 1905; GCN-NEXT: v_cndmask_b32_e64 v20, 1.0, v20, s[38:39] 1906; GCN-NEXT: v_cndmask_b32_e64 v21, 1.0, v21, s[40:41] 1907; GCN-NEXT: v_cndmask_b32_e64 v22, 1.0, v22, s[42:43] 1908; GCN-NEXT: v_cndmask_b32_e64 v23, 1.0, v23, s[44:45] 1909; GCN-NEXT: v_cndmask_b32_e64 v24, 1.0, v24, s[46:47] 1910; GCN-NEXT: v_cndmask_b32_e64 v25, 1.0, v25, s[48:49] 
1911; GCN-NEXT: v_cndmask_b32_e64 v26, 1.0, v26, s[50:51] 1912; GCN-NEXT: v_cndmask_b32_e64 v27, 1.0, v27, s[52:53] 1913; GCN-NEXT: v_cndmask_b32_e64 v28, 1.0, v28, s[54:55] 1914; GCN-NEXT: v_cndmask_b32_e64 v29, 1.0, v29, s[56:57] 1915; GCN-NEXT: v_cndmask_b32_e64 v30, 1.0, v30, s[58:59] 1916; GCN-NEXT: v_cndmask_b32_e64 v31, 1.0, v31, s[60:61] 1917; GCN-NEXT: ; return to shader part epilog 1918entry: 1919 %v = insertelement <32 x float> %vec, float 1.000000e+00, i32 %sel 1920 ret <32 x float> %v 1921} 1922
; double8_inselt_vec: inserts double 1.0 into an <8 x double> function argument
; at a run-time index (%sel) and returns the updated vector. The define carries
; no amdgpu_* calling-convention keyword, and the checks end with
; s_setpc_b64 s[30:31] rather than s_endpgm or a shader epilog.
; Expected code: for each element i in 0..7, a v_cmp_eq_u32 of the index (v16)
; against i, then two v_cndmask_b32 on the element's register pair. When the
; index matches, the even register receives 0 and the odd register receives
; 0x3ff00000 (materialized once in v17); 0x3ff00000:00000000 is the IEEE-754
; bit pattern of double 1.0. Otherwise both registers keep their input value.
; The GCN lines are autogenerated (see the NOTE on the first line of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1923define <8 x double> @double8_inselt_vec(<8 x double> %vec, i32 %sel) { 1924; GCN-LABEL: double8_inselt_vec: 1925; GCN: ; %bb.0: ; %entry 1926; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1927; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 1928; GCN-NEXT: v_mov_b32_e32 v17, 0x3ff00000 1929; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 1930; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc 1931; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 1932; GCN-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc 1933; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc 1934; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 1935; GCN-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc 1936; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v17, vcc 1937; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 1938; GCN-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 1939; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc 1940; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 1941; GCN-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc 1942; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v17, vcc 1943; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 1944; GCN-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc 1945; GCN-NEXT: v_cndmask_b32_e32 v11, v11, v17, vcc 1946; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16 1947; GCN-NEXT: v_cndmask_b32_e64 v12, v12, 0, vcc 1948; GCN-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc 1949; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16 1950; GCN-NEXT: v_cndmask_b32_e64 v14, v14, 0, vcc 1951; GCN-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc 1952; GCN-NEXT: s_setpc_b64 s[30:31] 1953entry: 1954 %v = 
insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel 1955 ret <8 x double> %v 1956} 1957