; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=SI
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=VI

define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i32
  store i32 %sext, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @test_s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: test_s_sext_i32_to_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mul_i32 s0, s0, s1
; SI-NEXT:    s_add_i32 s0, s0, s2
; SI-NEXT:    s_ashr_i32 s1, s0, 31
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_s_sext_i32_to_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mul_i32 s0, s0, s1
; VI-NEXT:    s_add_i32 s0, s0, s2
; VI-NEXT:    s_ashr_i32 s1, s0, 31
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
entry:
  %mul = mul i32 %a, %b
  %add = add i32 %mul, %c
  %sext = sext i32 %add to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) nounwind {
; SI-LABEL: s_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s4, s6, 31
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_mov_b32_e32 v1, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s4, s6, 31
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %sext = sext i32 %a to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: v_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load i32, ptr addrspace(1) %in, align 4
  %sext = sext i32 %val to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) nounwind {
; SI-LABEL: s_sext_i16_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_i64 s[4:5], s[6:7], 0x100000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i16_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_i64 s[4:5], s[6:7], 0x100000
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %sext = sext i16 %a to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i16
  store i16 %sext, ptr addrspace(1) %out
  ret void
}

; The purpose of this test is to make sure the i16 = sign_extend i1 node
; makes it all the way through the legalizer/optimizer, so we can check
; that it is selected correctly. In s_sext_i1_to_i16, the sign_extend node
; is optimized to a select very early.
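; (Illustrative sketch of that early optimization, not a FileCheck line:
; the combiner rewrites the sext of the compare roughly as
;   t: i1 = setcc %a, %b, seteq
;   t: i16 = select t, Constant:i16<-1>, Constant:i16<0>
; so s_sext_i1_to_i16 never reaches selection with an i16 = sign_extend i1.)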
define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; SI-LABEL: s_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s0, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s0, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp0 = icmp eq i32 %a, %b
  %cmp1 = icmp eq i32 %c, %d
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: v_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s1, s2
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s1, s2
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %cmp0 = icmp eq i32 %a, %tid
  %cmp1 = icmp eq i32 %b, %c
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, ptr addrspace(1) %out
  ret void
}

; FIXME: We end up with a v_bfe instruction, because the i16 srl
; gets selected to a v_lshrrev_b16 instruction, so the input to
; the bfe is a vector register. To fix this we need to be able to
; optimize:
; t29: i16 = truncate t10
; t55: i16 = srl t29, Constant:i32<8>
; t63: i32 = any_extend t55
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
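; (Illustrative sketch of the desired fold, with made-up node numbers:
; performing the shift at i32 on the original value, e.g.
;   tN: i32 = srl t10, Constant:i32<8>
;   tM: i32 = sign_extend_inreg tN, ValueType:ch:i8
; would avoid the i16 srl and could select to a single scalar bfe.)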
define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) nounwind {
; SI-LABEL: s_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s4, s6, 24
; SI-NEXT:    s_bfe_i32 s5, s6, 0x80010
; SI-NEXT:    s_bfe_i32 s7, s6, 0x80008
; SI-NEXT:    s_sext_i32_i8 s6, s6
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s4, s6, 24
; VI-NEXT:    s_bfe_i32 s5, s6, 0x80010
; VI-NEXT:    s_bfe_i32 s7, s6, 0x80008
; VI-NEXT:    s_sext_i32_i8 s6, s6
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

; FIXME: need to optimize the same sequence as in the above test to
; avoid this shift.
define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: v_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 24, v0
; SI-NEXT:    v_bfe_i32 v2, v0, 16, 8
; SI-NEXT:    v_bfe_i32 v3, v0, 8, 8
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 24, v0
; VI-NEXT:    v_bfe_i32 v2, v0, 16, 8
; VI-NEXT:    v_bfe_i32 v3, v0, 8, 8
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %a = load i32, ptr addrspace(1) %in
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

; FIXME: s_bfe_i64, same on SI and VI
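; (For context, and purely illustrative: the s_bfe_i64 form referred to is
; the one s_sext_i16_to_i64 above already selects, e.g.
;   s_bfe_i64 s[4:5], s[6:7], 0x100000
; where 0x100000 encodes a 16-bit-wide field starting at bit offset 0.)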
define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) nounwind {
; SI-LABEL: s_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_ashr_i64 s[0:1], s[2:3], 48
; SI-NEXT:    s_ashr_i32 s1, s2, 16
; SI-NEXT:    s_sext_i32_i16 s2, s2
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    s_sext_i32_i16 s3, s3
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_ashr_i32 s1, s2, 16
; VI-NEXT:    s_sext_i32_i16 s2, s2
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    s_ashr_i32 s0, s3, 16
; VI-NEXT:    s_sext_i32_i16 s3, s3
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: v_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[2:3], v[0:1], 48
; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
; VI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %a = load i64, ptr addrspace(1) %in
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #1 = { nounwind readnone }