; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s

; Tests for extractelement of i8 vectors with constant and dynamic indices.
; Constant-index extracts should lower to scalar shifts of the packed kernel
; argument dword(s) rather than going through memory.

; Extract the only element of a <1 x i8> argument and store it.
define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v1i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_vector_elt_v1i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_endpgm
  %p0 = extractelement <1 x i8> %foo, i32 0
  store i8 %p0, ptr addrspace(1) %out
  ret void
}

; Extract elements 0 and 1 of <2 x i8>; element 1 comes from a shift-right by 8.
define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v2i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s3, s2, 8
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    s_add_u32 s0, s0, 1
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    s_addc_u32 s1, s1, 0
; SI-NEXT:    flat_store_byte v[0:1], v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_vector_elt_v2i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 8
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_add_u32 s0, s0, 1
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %p0 = extractelement <2 x i8> %foo, i32 0
  %p1 = extractelement <2 x i8> %foo, i32 1
  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
  store volatile i8 %p1, ptr addrspace(1) %out
  store volatile i8 %p0, ptr addrspace(1) %out1
  ret void
}

; Extract elements 0 and 2 of <3 x i8>; element 2 comes from a shift-right by 16.
define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v3i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s3, s2, 16
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    s_add_u32 s0, s0, 1
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    s_addc_u32 s1, s1, 0
; SI-NEXT:    flat_store_byte v[0:1], v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_vector_elt_v3i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_add_u32 s0, s0, 1
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %p0 = extractelement <3 x i8> %foo, i32 0
  %p1 = extractelement <3 x i8> %foo, i32 2
  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
  store volatile i8 %p1, ptr addrspace(1) %out
  store volatile i8 %p0, ptr addrspace(1) %out1
  ret void
}

; Extract elements 0 and 2 of <4 x i8> packed into a single dword argument.
define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v4i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s3, s2, 16
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    s_add_u32 s0, s0, 1
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    s_addc_u32 s1, s1, 0
; SI-NEXT:    flat_store_byte v[0:1], v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_vector_elt_v4i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_add_u32 s0, s0, 1
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %p0 = extractelement <4 x i8> %foo, i32 0
  %p1 = extractelement <4 x i8> %foo, i32 2
  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
  store volatile i8 %p1, ptr addrspace(1) %out
  store volatile i8 %p0, ptr addrspace(1) %out1
  ret void
}

; Only the first dword of the <8 x i8> argument is needed; the load should be
; shrunk to a single s_load_dword.
define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v8i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s0, 16
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    flat_store_byte v[0:1], v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_vector_elt_v8i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s0, s[8:9], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s0, 16
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_byte v[0:1], v3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %p0 = extractelement <8 x i8> %foo, i32 0
  %p1 = extractelement <8 x i8> %foo, i32 2
  store volatile i8 %p1, ptr addrspace(1) null
  store volatile i8 %p0, ptr addrspace(1) null
  ret void
}

; Elements 0 and 2 of <16 x i8> live in the first argument dword; the rest of
; the vector should not be loaded.
define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v16i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_load_dword s2, s[8:9], 0x4
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s3, s2, 16
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    s_add_u32 s0, s0, 1
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    s_addc_u32 s1, s1, 0
; SI-NEXT:    flat_store_byte v[0:1], v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_vector_elt_v16i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x10
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_add_u32 s0, s0, 1
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %p0 = extractelement <16 x i8> %foo, i32 0
  %p1 = extractelement <16 x i8> %foo, i32 2
  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
  store volatile i8 %p1, ptr addrspace(1) %out
  store volatile i8 %p0, ptr addrspace(1) %out1
  ret void
}

; Same as the v8i8 case but with a <32 x i8> argument; still only one dword
; should be loaded.
define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v32i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s0, 16
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    flat_store_byte v[0:1], v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_vector_elt_v32i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s0, s[8:9], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s0, 16
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_byte v[0:1], v3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %p0 = extractelement <32 x i8> %foo, i32 0
  %p1 = extractelement <32 x i8> %foo, i32 2
  store volatile i8 %p1, ptr addrspace(1) null
  store volatile i8 %p0, ptr addrspace(1) null
  ret void
}

; Elements 0 and 2 of <64 x i8>; only the first dword of the vector argument
; should be read.
define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v64i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_load_dword s2, s[8:9], 0x10
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s3, s2, 16
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    s_add_u32 s0, s0, 1
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    s_addc_u32 s1, s1, 0
; SI-NEXT:    flat_store_byte v[0:1], v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_vector_elt_v64i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x40
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_add_u32 s0, s0, 1
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %p0 = extractelement <64 x i8> %foo, i32 0
  %p1 = extractelement <64 x i8> %foo, i32 2
  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
  store volatile i8 %p1, ptr addrspace(1) %out
  store volatile i8 %p0, ptr addrspace(1) %out1
  ret void
}

; FIXME: SI generates much worse code that's a pain to match

; FIXME: 16-bit and 32-bit shift not combined after legalize to
; isTypeDesirableForOp in SimplifyDemandedBits

; Dynamic index into <2 x i8>: lowered as a variable right-shift of the packed
; 16-bit value by (idx * 8).
define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v2i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0xa
; SI-NEXT:    s_load_dword s3, s[8:9], 0x13
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s2, s2, 0xffff
; SI-NEXT:    s_lshl_b32 s3, s3, 3
; SI-NEXT:    s_lshr_b32 s2, s2, s3
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_extract_vector_elt_v2i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x4c
; VI-NEXT:    s_load_dword s3, s[8:9], 0x28
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s2, s2, 3
; VI-NEXT:    s_and_b32 s3, s3, 0xffff
; VI-NEXT:    s_lshr_b32 s2, s3, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %elt = extractelement <2 x i8> %foo, i32 %idx
  store volatile i8 %elt, ptr addrspace(1) %out
  ret void
}

; Dynamic index into <3 x i8>: variable shift of the packed argument dword.
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v3i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0x13
; SI-NEXT:    s_load_dword s3, s[8:9], 0xa
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s2, s2, 3
; SI-NEXT:    s_lshr_b32 s2, s3, s2
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_extract_vector_elt_v3i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x4c
; VI-NEXT:    s_load_dword s3, s[8:9], 0x28
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s2, s2, 3
; VI-NEXT:    s_lshr_b32 s2, s3, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %p0 = extractelement <3 x i8> %foo, i32 %idx
  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
  store volatile i8 %p0, ptr addrspace(1) %out
  ret void
}

; Dynamic index into a <4 x i8> loaded from constant memory.
define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v4i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_load_dword s4, s[8:9], 0xc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
; SI-NEXT:    s_lshl_b32 s3, s4, 3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s2, s2, s3
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_extract_vector_elt_v4i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x30
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_lshl_b32 s0, s4, 3
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s0, s2, s0
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %vec = load <4 x i8>, ptr addrspace(4) %vec.ptr
  %p0 = extractelement <4 x i8> %vec, i32 %idx
  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
  store volatile i8 %p0, ptr addrspace(1) %out
  ret void
}

; Dynamic index into a loaded <8 x i8>: 64-bit variable shift selects the byte.
define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v8i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_load_dword s4, s[8:9], 0x4
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; SI-NEXT:    s_lshl_b32 s4, s4, 3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_extract_vector_elt_v8i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_lshl_b32 s0, s4, 3
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s0
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %vec = load <8 x i8>, ptr addrspace(4) %vec.ptr
  %p0 = extractelement <8 x i8> %vec, i32 %idx
  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
  store volatile i8 %p0, ptr addrspace(1) %out
  ret void
}

; Only bytes 0-3 of the <8 x i8> are used, so the load should shrink to one dword.
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_0123:
; SI:       ; %bb.0:
; SI-NEXT:    s_mov_b64 s[0:1], 0
; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s0, 8
; SI-NEXT:    s_lshr_b32 s2, s0, 16
; SI-NEXT:    s_lshr_b32 s3, s0, 24
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: reduce_load_vector_v8i8_extract_0123:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b64 s[0:1], 0
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s0, 8
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_lshr_b32 s2, s0, 16
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    s_lshr_b32 s3, s0, 24
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %load = load <8 x i8>, ptr addrspace(4) null
  %elt0 = extractelement <8 x i8> %load, i32 0
  %elt1 = extractelement <8 x i8> %load, i32 1
  %elt2 = extractelement <8 x i8> %load, i32 2
  %elt3 = extractelement <8 x i8> %load, i32 3
  store volatile i8 %elt0, ptr addrspace(1) undef, align 1
  store volatile i8 %elt1, ptr addrspace(1) undef, align 1
  store volatile i8 %elt2, ptr addrspace(1) undef, align 1
  store volatile i8 %elt3, ptr addrspace(1) undef, align 1
  ret void
}

; Elements 0, 1, 4 and 5 need both dwords of the <8 x i8>, loaded as dwordx2.
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_0145:
; SI:       ; %bb.0:
; SI-NEXT:    s_mov_b64 s[0:1], 0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s2, s0, 8
; SI-NEXT:    s_lshr_b32 s3, s1, 8
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    flat_store_byte v[0:1], v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: reduce_load_vector_v8i8_extract_0145:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b64 s[0:1], 0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s2, s0, 8
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_lshr_b32 s3, s1, 8
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_byte v[0:1], v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %load = load <8 x i8>, ptr addrspace(4) null
  %elt0 = extractelement <8 x i8> %load, i32 0
  %elt1 = extractelement <8 x i8> %load, i32 1
  %elt4 = extractelement <8 x i8> %load, i32 4
  %elt5 = extractelement <8 x i8> %load, i32 5
  store volatile i8 %elt0, ptr addrspace(1) undef, align 1
  store volatile i8 %elt1, ptr addrspace(1) undef, align 1
  store volatile i8 %elt4, ptr addrspace(1) undef, align 1
  store volatile i8 %elt5, ptr addrspace(1) undef, align 1
  ret void
}

; Only bytes 4-5 are used, so the load shrinks to one dword at offset 4.
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_45:
; SI:       ; %bb.0:
; SI-NEXT:    s_mov_b64 s[0:1], 4
; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s0, 8
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: reduce_load_vector_v8i8_extract_45:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b64 s[0:1], 4
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s0, 8
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %load = load <8 x i8>, ptr addrspace(4) null
  %elt4 = extractelement <8 x i8> %load, i32 4
  %elt5 = extractelement <8 x i8> %load, i32 5
  store volatile i8 %elt4, ptr addrspace(1) undef, align 1
  store volatile i8 %elt5, ptr addrspace(1) undef, align 1
  ret void
}

; FIXME: ought to be able to eliminate high half of load
define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
; SI-LABEL: reduce_load_vector_v16i8_extract_0145:
; SI:       ; %bb.0:
; SI-NEXT:    s_mov_b64 s[0:1], 0
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s2, s0, 8
; SI-NEXT:    s_lshr_b32 s3, s1, 8
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    flat_store_byte v[0:1], v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: reduce_load_vector_v16i8_extract_0145:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b64 s[0:1], 0
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s2, s0, 8
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_lshr_b32 s3, s1, 8
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_byte v[0:1], v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %load = load <16 x i8>, ptr addrspace(4) null
  %elt0 = extractelement <16 x i8> %load, i32 0
  %elt1 = extractelement <16 x i8> %load, i32 1
  %elt4 = extractelement <16 x i8> %load, i32 4
  %elt5 = extractelement <16 x i8> %load, i32 5
  store volatile i8 %elt0, ptr addrspace(1) undef, align 1
  store volatile i8 %elt1, ptr addrspace(1) undef, align 1
  store volatile i8 %elt4, ptr addrspace(1) undef, align 1
  store volatile i8 %elt5, ptr addrspace(1) undef, align 1
  ret void
}

attributes #0 = { nounwind }