; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=SI-NOHSA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCNX3-HSA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNX3-NOHSA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN-HSA,GCN-GFX900-HSA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN-HSA,GCN-GFX908-HSA %s

; Every kernel in this file loads one i32 or <N x i32> value through the
; addrspace(1) pointer %in and stores it unchanged through the addrspace(1)
; pointer %out. The autogenerated assertions record the instruction sequence
; llc emits for each of the RUN configurations above; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; Load a single i32 from %in and store it to %out.
define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_i32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NOHSA-NEXT: s_endpgm
;
; GCNX3-HSA-LABEL: global_load_i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: flat_load_dword v2, v[0:1]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-HSA-NEXT: flat_store_dword v[0:1], v2
; GCNX3-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GCN-HSA-LABEL: global_load_i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, 0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: global_load_dword v1, v0, s[2:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-HSA-NEXT: s_endpgm
entry:
  %ld = load i32, ptr addrspace(1) %in
  store i32 %ld, ptr addrspace(1) %out
  ret void
}

; Load a <2 x i32> from %in and store it to %out.
define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v2i32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NOHSA-NEXT: s_endpgm
;
; GCNX3-HSA-LABEL: global_load_v2i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s1
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCNX3-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v2i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v2i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GCN-HSA-LABEL: global_load_v2i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GCN-HSA-NEXT: s_endpgm
entry:
  %ld = load <2 x i32>, ptr addrspace(1) %in
  store <2 x i32> %ld, ptr addrspace(1) %out
  ret void
}

; Load a <3 x i32> from %in and store it to %out.
define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v3i32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NOHSA-NEXT: s_endpgm
;
; GCNX3-HSA-LABEL: global_load_v3i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s1
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-HSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GCNX3-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v3i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v3i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT: MOV * T2.X, T0.Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GCN-HSA-LABEL: global_load_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
; GCN-HSA-NEXT: s_endpgm
entry:
  %ld = load <3 x i32>, ptr addrspace(1) %in
  store <3 x i32> %ld, ptr addrspace(1) %out
  ret void
}

; Load a <4 x i32> from %in and store it to %out.
define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v4i32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NOHSA-NEXT: s_endpgm
;
; GCNX3-HSA-LABEL: global_load_v4i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCNX3-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v4i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v4i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GCN-HSA-LABEL: global_load_v4i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GCN-HSA-NEXT: s_endpgm
entry:
  %ld = load <4 x i32>, ptr addrspace(1) %in
  store <4 x i32> %ld, ptr addrspace(1) %out
  ret void
}

; Load an <8 x i32> from %in and store it to %out.
define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v8i32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; SI-NOHSA-NEXT: s_endpgm
;
; GCNX3-HSA-LABEL: global_load_v8i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 16
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCNX3-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v8i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v8i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T2.X, PV.W, literal.x,
; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GCN-HSA-LABEL: global_load_v8i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
; GCN-HSA-NEXT: s_endpgm
entry:
  %ld = load <8 x i32>, ptr addrspace(1) %in
  store <8 x i32> %ld, ptr addrspace(1) %out
  ret void
}

; Load a <9 x i32> from %in and store it to %out.
define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v9i32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32
; SI-NOHSA-NEXT: s_endpgm
;
; GCNX3-HSA-LABEL: global_load_v9i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCNX3-HSA-NEXT: flat_load_dword v14, v[8:9]
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-HSA-NEXT: flat_store_dword v[12:13], v14
; GCNX3-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v9i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v9i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 8, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 2 @8
; EG-NEXT: ALU 1, @23, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_128 T4.XYZW, T2.X, 0, #1
; EG-NEXT: VTX_READ_128 T2.XYZW, T2.X, 16, #1
; EG-NEXT: VTX_READ_32 T3.X, T3.X, 32, #1
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T1.X, PV.W, literal.x,
; EG-NEXT: MOV * T2.X, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV * T3.X, PS,
; EG-NEXT: ALU clause starting at 23:
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GCN-HSA-LABEL: global_load_v9i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3]
; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
; GCN-HSA-NEXT: global_load_dword v9, v8, s[2:3] offset:32
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: global_store_dword v8, v9, s[0:1] offset:32
; GCN-HSA-NEXT: s_endpgm
entry:
  %ld = load <9 x i32>, ptr addrspace(1) %in
  store <9 x i32> %ld, ptr addrspace(1) %out
  ret void
}

; Load a <10 x i32> from %in and store it to %out.
define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v10i32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
; SI-NOHSA-NEXT: s_endpgm
;
; GCNX3-HSA-LABEL: global_load_v10i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCNX3-HSA-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, s0
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-HSA-NEXT: flat_store_dwordx2 v[14:15], v[8:9]
; GCNX3-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v10i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v10i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 2 @8
; EG-NEXT: ALU 7, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T5.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T4.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T3.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1
; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 32, #1
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T3.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T4.X, PV.W, literal.x,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GCN-HSA-LABEL: global_load_v10i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v10, s[2:3]
; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v10, s[2:3] offset:16
; GCN-HSA-NEXT: global_load_dwordx2 v[8:9], v10, s[2:3] offset:32
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: global_store_dwordx2 v10, v[8:9], s[0:1] offset:32
; GCN-HSA-NEXT: s_endpgm
entry:
  %ld = load <10 x i32>, ptr addrspace(1) %in
  store <10 x i32> %ld, ptr addrspace(1) %out
  ret void
}

; Load an <11 x i32> from %in and store it to %out.
define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v11i32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dword v10, off, s[4:7], 0 offset:40
; SI-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
; SI-NOHSA-NEXT: s_endpgm
;
; GCNX3-HSA-LABEL: global_load_v11i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCNX3-HSA-NEXT: flat_load_dwordx3 v[8:10], v[8:9]
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s0
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[0:3]
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[13:14], v[4:7]
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-HSA-NEXT: flat_store_dwordx3 v[15:16], v[8:10]
; GCNX3-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v11i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[8:10], off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[8:10], off, s[4:7], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v11i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 2 @8
; EG-NEXT: ALU 12, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T7.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T6.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T3.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1
; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 32, #1
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T3.X, PV.W, literal.x,
; EG-NEXT: MOV * T4.X, T0.Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T5.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T6.X, PV.W, literal.x,
; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GCN-HSA-LABEL: global_load_v11i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v11, s[2:3]
; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v11, s[2:3] offset:16
; GCN-HSA-NEXT: global_load_dwordx3 v[8:10], v11, s[2:3] offset:32
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: global_store_dwordx4 v11, v[0:3], s[0:1]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: global_store_dwordx4 v11, v[4:7], s[0:1] offset:16
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: global_store_dwordx3 v11, v[8:10], s[0:1] offset:32
; GCN-HSA-NEXT: s_endpgm
entry:
  %ld = load <11 x i32>, ptr addrspace(1) %in
  store <11 x i32> %ld, ptr addrspace(1) %out
  ret void
}


define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v12i32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; SI-NOHSA-NEXT: s_endpgm
;
; GCNX3-HSA-LABEL: global_load_v12i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32
; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-HSA-NEXT:
flat_store_dwordx4 v[14:15], v[4:7] 885; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2) 886; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] 887; GCNX3-HSA-NEXT: s_endpgm 888; 889; GCNX3-NOHSA-LABEL: global_load_v12i32: 890; GCNX3-NOHSA: ; %bb.0: ; %entry 891; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 892; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 893; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 894; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 895; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 896; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 897; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 898; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 899; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 900; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 901; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 902; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 903; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 904; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) 905; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 906; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) 907; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 908; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) 909; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 910; GCNX3-NOHSA-NEXT: s_endpgm 911; 912; EG-LABEL: global_load_v12i32: 913; EG: ; %bb.0: ; %entry 914; EG-NEXT: ALU 7, @14, KC0[CB0:0-32], KC1[] 915; EG-NEXT: TEX 2 @8 916; EG-NEXT: ALU 1, @22, KC0[CB0:0-32], KC1[] 917; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T5.X, 0 918; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 0 919; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1 920; EG-NEXT: CF_END 921; EG-NEXT: PAD 922; EG-NEXT: Fetch clause starting at 8: 923; EG-NEXT: VTX_READ_128 T3.XYZW, T2.X, 0, #1 924; EG-NEXT: VTX_READ_128 T4.XYZW, T2.X, 16, #1 925; EG-NEXT: VTX_READ_128 T2.XYZW, T2.X, 32, #1 926; EG-NEXT: ALU clause starting at 14: 927; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 928; EG-NEXT: 32(4.484155e-44), 
0(0.000000e+00) 929; EG-NEXT: LSHR T0.X, PV.W, literal.x, 930; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 931; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) 932; EG-NEXT: LSHR T1.X, PV.W, literal.x, 933; EG-NEXT: MOV * T2.X, KC0[2].Z, 934; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 935; EG-NEXT: ALU clause starting at 22: 936; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 937; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 938; 939; GCN-HSA-LABEL: global_load_v12i32: 940; GCN-HSA: ; %bb.0: ; %entry 941; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 942; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 943; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) 944; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3] 945; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:16 946; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v12, s[2:3] offset:32 947; GCN-HSA-NEXT: s_waitcnt vmcnt(2) 948; GCN-HSA-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] 949; GCN-HSA-NEXT: s_waitcnt vmcnt(2) 950; GCN-HSA-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16 951; GCN-HSA-NEXT: s_waitcnt vmcnt(2) 952; GCN-HSA-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:32 953; GCN-HSA-NEXT: s_endpgm 954entry: 955 %ld = load <12 x i32>, ptr addrspace(1) %in 956 store <12 x i32> %ld, ptr addrspace(1) %out 957 ret void 958} 959 960define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 961; SI-NOHSA-LABEL: global_load_v16i32: 962; SI-NOHSA: ; %bb.0: ; %entry 963; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 964; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 965; SI-NOHSA-NEXT: s_mov_b32 s6, -1 966; SI-NOHSA-NEXT: s_mov_b32 s10, s6 967; SI-NOHSA-NEXT: s_mov_b32 s11, s7 968; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 969; SI-NOHSA-NEXT: s_mov_b32 s4, s0 970; SI-NOHSA-NEXT: s_mov_b32 s5, s1 971; SI-NOHSA-NEXT: s_mov_b32 s8, s2 972; SI-NOHSA-NEXT: s_mov_b32 s9, s3 973; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 974; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, 
s[8:11], 0 offset:48 975; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 976; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16 977; SI-NOHSA-NEXT: s_waitcnt vmcnt(3) 978; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32 979; SI-NOHSA-NEXT: s_waitcnt vmcnt(3) 980; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48 981; SI-NOHSA-NEXT: s_waitcnt vmcnt(3) 982; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 983; SI-NOHSA-NEXT: s_waitcnt vmcnt(3) 984; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16 985; SI-NOHSA-NEXT: s_endpgm 986; 987; GCNX3-HSA-LABEL: global_load_v16i32: 988; GCNX3-HSA: ; %bb.0: ; %entry 989; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 990; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) 991; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 992; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 993; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5 994; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4 995; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 996; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 997; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 998; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 999; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 32 1000; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 1001; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4 1002; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 1003; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5 1004; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 1005; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1006; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 1007; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] 1008; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] 1009; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 1010; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 1011; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, s3 1012; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, s2 1013; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48 1014; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1 1015; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 1016; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0 1017; 
GCNX3-HSA-NEXT: s_add_u32 s0, s0, 16 1018; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 1019; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 1020; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s1 1021; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 1022; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s0 1023; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) 1024; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] 1025; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) 1026; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] 1027; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) 1028; GCNX3-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] 1029; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) 1030; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15] 1031; GCNX3-HSA-NEXT: s_endpgm 1032; 1033; GCNX3-NOHSA-LABEL: global_load_v16i32: 1034; GCNX3-NOHSA: ; %bb.0: ; %entry 1035; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1036; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 1037; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 1038; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 1039; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 1040; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1041; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 1042; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 1043; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 1044; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 1045; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 1046; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16 1047; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 1048; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 1049; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3) 1050; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32 1051; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3) 1052; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48 1053; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3) 1054; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 1055; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3) 1056; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16 
1057; GCNX3-NOHSA-NEXT: s_endpgm 1058; 1059; EG-LABEL: global_load_v16i32: 1060; EG: ; %bb.0: ; %entry 1061; EG-NEXT: ALU 11, @16, KC0[CB0:0-32], KC1[] 1062; EG-NEXT: TEX 3 @8 1063; EG-NEXT: ALU 1, @28, KC0[], KC1[] 1064; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T7.X, 0 1065; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T2.X, 0 1066; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T1.X, 0 1067; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1 1068; EG-NEXT: CF_END 1069; EG-NEXT: Fetch clause starting at 8: 1070; EG-NEXT: VTX_READ_128 T4.XYZW, T3.X, 32, #1 1071; EG-NEXT: VTX_READ_128 T5.XYZW, T3.X, 48, #1 1072; EG-NEXT: VTX_READ_128 T6.XYZW, T3.X, 0, #1 1073; EG-NEXT: VTX_READ_128 T3.XYZW, T3.X, 16, #1 1074; EG-NEXT: ALU clause starting at 16: 1075; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1076; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1077; EG-NEXT: LSHR T0.X, PV.W, literal.x, 1078; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1079; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1080; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1081; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) 1082; EG-NEXT: LSHR T2.X, PV.W, literal.x, 1083; EG-NEXT: MOV * T3.X, KC0[2].Z, 1084; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1085; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1086; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1087; EG-NEXT: ALU clause starting at 28: 1088; EG-NEXT: LSHR * T7.X, T0.W, literal.x, 1089; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1090; 1091; GCN-HSA-LABEL: global_load_v16i32: 1092; GCN-HSA: ; %bb.0: ; %entry 1093; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1094; GCN-HSA-NEXT: v_mov_b32_e32 v16, 0 1095; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) 1096; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] offset:32 1097; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:48 1098; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v16, s[2:3] 1099; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:16 1100; GCN-HSA-NEXT: s_waitcnt vmcnt(3) 
1101; GCN-HSA-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:32 1102; GCN-HSA-NEXT: s_waitcnt vmcnt(3) 1103; GCN-HSA-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:48 1104; GCN-HSA-NEXT: s_waitcnt vmcnt(3) 1105; GCN-HSA-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] 1106; GCN-HSA-NEXT: s_waitcnt vmcnt(3) 1107; GCN-HSA-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:16 1108; GCN-HSA-NEXT: s_endpgm 1109entry: 1110 %ld = load <16 x i32>, ptr addrspace(1) %in 1111 store <16 x i32> %ld, ptr addrspace(1) %out 1112 ret void 1113} 1114 1115define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1116; SI-NOHSA-LABEL: global_zextload_i32_to_i64: 1117; SI-NOHSA: ; %bb.0: 1118; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1119; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 1120; SI-NOHSA-NEXT: s_mov_b32 s6, -1 1121; SI-NOHSA-NEXT: s_mov_b32 s10, s6 1122; SI-NOHSA-NEXT: s_mov_b32 s11, s7 1123; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1124; SI-NOHSA-NEXT: s_mov_b32 s8, s2 1125; SI-NOHSA-NEXT: s_mov_b32 s9, s3 1126; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0 1127; SI-NOHSA-NEXT: s_mov_b32 s4, s0 1128; SI-NOHSA-NEXT: s_mov_b32 s5, s1 1129; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0 1130; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) 1131; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1132; SI-NOHSA-NEXT: s_endpgm 1133; 1134; GCNX3-HSA-LABEL: global_zextload_i32_to_i64: 1135; GCNX3-HSA: ; %bb.0: 1136; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1137; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) 1138; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 1139; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 1140; GCNX3-HSA-NEXT: flat_load_dword v0, v[0:1] 1141; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s0 1142; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s1 1143; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0 1144; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) 1145; GCNX3-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1146; GCNX3-HSA-NEXT: s_endpgm 1147; 1148; GCNX3-NOHSA-LABEL: 
global_zextload_i32_to_i64: 1149; GCNX3-NOHSA: ; %bb.0: 1150; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1151; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 1152; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 1153; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 1154; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 1155; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1156; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 1157; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 1158; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0 1159; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 1160; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 1161; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0 1162; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) 1163; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1164; GCNX3-NOHSA-NEXT: s_endpgm 1165; 1166; EG-LABEL: global_zextload_i32_to_i64: 1167; EG: ; %bb.0: 1168; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1169; EG-NEXT: TEX 0 @6 1170; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 1171; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1172; EG-NEXT: CF_END 1173; EG-NEXT: PAD 1174; EG-NEXT: Fetch clause starting at 6: 1175; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1176; EG-NEXT: ALU clause starting at 8: 1177; EG-NEXT: MOV * T0.X, KC0[2].Z, 1178; EG-NEXT: ALU clause starting at 9: 1179; EG-NEXT: MOV * T0.Y, 0.0, 1180; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1181; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1182; 1183; GCN-HSA-LABEL: global_zextload_i32_to_i64: 1184; GCN-HSA: ; %bb.0: 1185; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1186; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 1187; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) 1188; GCN-HSA-NEXT: global_load_dword v0, v1, s[2:3] 1189; GCN-HSA-NEXT: s_waitcnt vmcnt(0) 1190; GCN-HSA-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 1191; GCN-HSA-NEXT: s_endpgm 1192 %ld = load i32, ptr addrspace(1) %in 1193 %ext = zext i32 %ld to i64 1194 store i64 %ext, ptr addrspace(1) %out 1195 ret void 1196} 1197 1198define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr 
addrspace(1) %in) #0 { 1199; SI-NOHSA-LABEL: global_sextload_i32_to_i64: 1200; SI-NOHSA: ; %bb.0: 1201; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1202; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 1203; SI-NOHSA-NEXT: s_mov_b32 s6, -1 1204; SI-NOHSA-NEXT: s_mov_b32 s10, s6 1205; SI-NOHSA-NEXT: s_mov_b32 s11, s7 1206; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1207; SI-NOHSA-NEXT: s_mov_b32 s8, s2 1208; SI-NOHSA-NEXT: s_mov_b32 s9, s3 1209; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0 1210; SI-NOHSA-NEXT: s_mov_b32 s4, s0 1211; SI-NOHSA-NEXT: s_mov_b32 s5, s1 1212; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) 1213; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1214; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1215; SI-NOHSA-NEXT: s_endpgm 1216; 1217; GCNX3-HSA-LABEL: global_sextload_i32_to_i64: 1218; GCNX3-HSA: ; %bb.0: 1219; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1220; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) 1221; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 1222; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 1223; GCNX3-HSA-NEXT: flat_load_dword v0, v[0:1] 1224; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s0 1225; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s1 1226; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) 1227; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1228; GCNX3-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1229; GCNX3-HSA-NEXT: s_endpgm 1230; 1231; GCNX3-NOHSA-LABEL: global_sextload_i32_to_i64: 1232; GCNX3-NOHSA: ; %bb.0: 1233; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1234; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 1235; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 1236; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 1237; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 1238; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1239; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 1240; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 1241; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0 1242; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 1243; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 1244; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) 1245; GCNX3-NOHSA-NEXT: 
v_ashrrev_i32_e32 v1, 31, v0 1246; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1247; GCNX3-NOHSA-NEXT: s_endpgm 1248; 1249; EG-LABEL: global_sextload_i32_to_i64: 1250; EG: ; %bb.0: 1251; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1252; EG-NEXT: TEX 0 @6 1253; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 1254; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1255; EG-NEXT: CF_END 1256; EG-NEXT: PAD 1257; EG-NEXT: Fetch clause starting at 6: 1258; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1259; EG-NEXT: ALU clause starting at 8: 1260; EG-NEXT: MOV * T0.X, KC0[2].Z, 1261; EG-NEXT: ALU clause starting at 9: 1262; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 1263; EG-NEXT: ASHR * T0.Y, T0.X, literal.y, 1264; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) 1265; 1266; GCN-HSA-LABEL: global_sextload_i32_to_i64: 1267; GCN-HSA: ; %bb.0: 1268; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1269; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0 1270; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) 1271; GCN-HSA-NEXT: global_load_dword v0, v2, s[2:3] 1272; GCN-HSA-NEXT: s_waitcnt vmcnt(0) 1273; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1274; GCN-HSA-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1275; GCN-HSA-NEXT: s_endpgm 1276 %ld = load i32, ptr addrspace(1) %in 1277 %ext = sext i32 %ld to i64 1278 store i64 %ext, ptr addrspace(1) %out 1279 ret void 1280} 1281 1282define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1283; SI-NOHSA-LABEL: global_zextload_v1i32_to_v1i64: 1284; SI-NOHSA: ; %bb.0: 1285; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1286; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 1287; SI-NOHSA-NEXT: s_mov_b32 s6, -1 1288; SI-NOHSA-NEXT: s_mov_b32 s10, s6 1289; SI-NOHSA-NEXT: s_mov_b32 s11, s7 1290; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1291; SI-NOHSA-NEXT: s_mov_b32 s8, s2 1292; SI-NOHSA-NEXT: s_mov_b32 s9, s3 1293; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0 1294; SI-NOHSA-NEXT: s_mov_b32 s4, s0 1295; SI-NOHSA-NEXT: 
s_mov_b32 s5, s1 1296; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0 1297; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) 1298; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1299; SI-NOHSA-NEXT: s_endpgm 1300; 1301; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64: 1302; GCNX3-HSA: ; %bb.0: 1303; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1304; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) 1305; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 1306; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 1307; GCNX3-HSA-NEXT: flat_load_dword v0, v[0:1] 1308; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s0 1309; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s1 1310; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0 1311; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) 1312; GCNX3-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1313; GCNX3-HSA-NEXT: s_endpgm 1314; 1315; GCNX3-NOHSA-LABEL: global_zextload_v1i32_to_v1i64: 1316; GCNX3-NOHSA: ; %bb.0: 1317; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1318; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 1319; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 1320; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 1321; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 1322; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1323; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 1324; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 1325; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0 1326; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 1327; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 1328; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0 1329; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) 1330; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1331; GCNX3-NOHSA-NEXT: s_endpgm 1332; 1333; EG-LABEL: global_zextload_v1i32_to_v1i64: 1334; EG: ; %bb.0: 1335; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1336; EG-NEXT: TEX 0 @6 1337; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 1338; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1339; EG-NEXT: CF_END 1340; EG-NEXT: PAD 1341; EG-NEXT: Fetch clause starting at 6: 1342; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1343; EG-NEXT: ALU clause starting at 8: 1344; EG-NEXT: MOV * T0.X, 
KC0[2].Z, 1345; EG-NEXT: ALU clause starting at 9: 1346; EG-NEXT: MOV * T0.Y, 0.0, 1347; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1348; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1349; 1350; GCN-HSA-LABEL: global_zextload_v1i32_to_v1i64: 1351; GCN-HSA: ; %bb.0: 1352; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1353; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 1354; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) 1355; GCN-HSA-NEXT: global_load_dword v0, v1, s[2:3] 1356; GCN-HSA-NEXT: s_waitcnt vmcnt(0) 1357; GCN-HSA-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 1358; GCN-HSA-NEXT: s_endpgm 1359 %ld = load <1 x i32>, ptr addrspace(1) %in 1360 %ext = zext <1 x i32> %ld to <1 x i64> 1361 store <1 x i64> %ext, ptr addrspace(1) %out 1362 ret void 1363} 1364 1365define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1366; SI-NOHSA-LABEL: global_sextload_v1i32_to_v1i64: 1367; SI-NOHSA: ; %bb.0: 1368; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1369; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 1370; SI-NOHSA-NEXT: s_mov_b32 s6, -1 1371; SI-NOHSA-NEXT: s_mov_b32 s10, s6 1372; SI-NOHSA-NEXT: s_mov_b32 s11, s7 1373; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1374; SI-NOHSA-NEXT: s_mov_b32 s8, s2 1375; SI-NOHSA-NEXT: s_mov_b32 s9, s3 1376; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0 1377; SI-NOHSA-NEXT: s_mov_b32 s4, s0 1378; SI-NOHSA-NEXT: s_mov_b32 s5, s1 1379; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) 1380; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1381; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1382; SI-NOHSA-NEXT: s_endpgm 1383; 1384; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64: 1385; GCNX3-HSA: ; %bb.0: 1386; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1387; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) 1388; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 1389; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 1390; GCNX3-HSA-NEXT: flat_load_dword v0, v[0:1] 1391; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s0 1392; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s1 
1393; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) 1394; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1395; GCNX3-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1396; GCNX3-HSA-NEXT: s_endpgm 1397; 1398; GCNX3-NOHSA-LABEL: global_sextload_v1i32_to_v1i64: 1399; GCNX3-NOHSA: ; %bb.0: 1400; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1401; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 1402; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 1403; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 1404; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 1405; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1406; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 1407; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 1408; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0 1409; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 1410; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 1411; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) 1412; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1413; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1414; GCNX3-NOHSA-NEXT: s_endpgm 1415; 1416; EG-LABEL: global_sextload_v1i32_to_v1i64: 1417; EG: ; %bb.0: 1418; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1419; EG-NEXT: TEX 0 @6 1420; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 1421; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1422; EG-NEXT: CF_END 1423; EG-NEXT: PAD 1424; EG-NEXT: Fetch clause starting at 6: 1425; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1426; EG-NEXT: ALU clause starting at 8: 1427; EG-NEXT: MOV * T0.X, KC0[2].Z, 1428; EG-NEXT: ALU clause starting at 9: 1429; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 1430; EG-NEXT: ASHR * T0.Y, T0.X, literal.y, 1431; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) 1432; 1433; GCN-HSA-LABEL: global_sextload_v1i32_to_v1i64: 1434; GCN-HSA: ; %bb.0: 1435; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1436; GCN-HSA-NEXT: v_mov_b32_e32 v2, 0 1437; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) 1438; GCN-HSA-NEXT: global_load_dword v0, v2, s[2:3] 1439; GCN-HSA-NEXT: s_waitcnt vmcnt(0) 1440; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1441; GCN-HSA-NEXT: 
global_store_dwordx2 v2, v[0:1], s[0:1] 1442; GCN-HSA-NEXT: s_endpgm 1443 %ld = load <1 x i32>, ptr addrspace(1) %in 1444 %ext = sext <1 x i32> %ld to <1 x i64> 1445 store <1 x i64> %ext, ptr addrspace(1) %out 1446 ret void 1447} 1448 1449define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1450; SI-NOHSA-LABEL: global_zextload_v2i32_to_v2i64: 1451; SI-NOHSA: ; %bb.0: 1452; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1453; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 1454; SI-NOHSA-NEXT: s_mov_b32 s6, -1 1455; SI-NOHSA-NEXT: s_mov_b32 s10, s6 1456; SI-NOHSA-NEXT: s_mov_b32 s11, s7 1457; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1458; SI-NOHSA-NEXT: s_mov_b32 s8, s2 1459; SI-NOHSA-NEXT: s_mov_b32 s9, s3 1460; SI-NOHSA-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 1461; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0 1462; SI-NOHSA-NEXT: v_mov_b32_e32 v3, v1 1463; SI-NOHSA-NEXT: s_mov_b32 s4, s0 1464; SI-NOHSA-NEXT: s_mov_b32 s5, s1 1465; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) 1466; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4 1467; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5 1468; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1469; SI-NOHSA-NEXT: s_endpgm 1470; 1471; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64: 1472; GCNX3-HSA: ; %bb.0: 1473; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1474; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) 1475; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 1476; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 1477; GCNX3-HSA-NEXT: flat_load_dwordx2 v[2:3], v[0:1] 1478; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0 1479; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s0 1480; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1 1481; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) 1482; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v2 1483; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v3 1484; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v1 1485; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1486; GCNX3-HSA-NEXT: s_endpgm 1487; 1488; GCNX3-NOHSA-LABEL: global_zextload_v2i32_to_v2i64: 1489; GCNX3-NOHSA: ; 
%bb.0: 1490; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1491; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 1492; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 1493; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 1494; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 1495; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1496; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 1497; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 1498; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 1499; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0 1500; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 1501; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 1502; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) 1503; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v2 1504; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v3 1505; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v3, v1 1506; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1507; GCNX3-NOHSA-NEXT: s_endpgm 1508; 1509; EG-LABEL: global_zextload_v2i32_to_v2i64: 1510; EG: ; %bb.0: 1511; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1512; EG-NEXT: TEX 0 @6 1513; EG-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[] 1514; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 1515; EG-NEXT: CF_END 1516; EG-NEXT: PAD 1517; EG-NEXT: Fetch clause starting at 6: 1518; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 1519; EG-NEXT: ALU clause starting at 8: 1520; EG-NEXT: MOV * T0.X, KC0[2].Z, 1521; EG-NEXT: ALU clause starting at 9: 1522; EG-NEXT: MOV T1.X, T0.X, 1523; EG-NEXT: MOV T1.Y, 0.0, 1524; EG-NEXT: MOV T1.Z, T0.Y, 1525; EG-NEXT: MOV T1.W, 0.0, 1526; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 1527; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1528; 1529; GCN-HSA-LABEL: global_zextload_v2i32_to_v2i64: 1530; GCN-HSA: ; %bb.0: 1531; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1532; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 1533; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) 1534; GCN-HSA-NEXT: global_load_dwordx2 v[2:3], v1, s[2:3] 1535; GCN-HSA-NEXT: s_waitcnt vmcnt(0) 1536; GCN-HSA-NEXT: v_mov_b32_e32 v0, v2 1537; GCN-HSA-NEXT: v_mov_b32_e32 v2, v3 1538; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 
1539; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] 1540; GCN-HSA-NEXT: s_endpgm 1541 %ld = load <2 x i32>, ptr addrspace(1) %in 1542 %ext = zext <2 x i32> %ld to <2 x i64> 1543 store <2 x i64> %ext, ptr addrspace(1) %out 1544 ret void 1545} 1546 1547define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1548; SI-NOHSA-LABEL: global_sextload_v2i32_to_v2i64: 1549; SI-NOHSA: ; %bb.0: 1550; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1551; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 1552; SI-NOHSA-NEXT: s_mov_b32 s6, -1 1553; SI-NOHSA-NEXT: s_mov_b32 s10, s6 1554; SI-NOHSA-NEXT: s_mov_b32 s11, s7 1555; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1556; SI-NOHSA-NEXT: s_mov_b32 s8, s2 1557; SI-NOHSA-NEXT: s_mov_b32 s9, s3 1558; SI-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1559; SI-NOHSA-NEXT: s_mov_b32 s4, s0 1560; SI-NOHSA-NEXT: s_mov_b32 s5, s1 1561; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) 1562; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v1 1563; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v1 1564; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1565; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1566; SI-NOHSA-NEXT: s_endpgm 1567; 1568; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64: 1569; GCNX3-HSA: ; %bb.0: 1570; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1571; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) 1572; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 1573; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 1574; GCNX3-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1575; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s0 1576; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1 1577; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) 1578; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v1 1579; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v1 1580; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1581; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1582; GCNX3-HSA-NEXT: s_endpgm 1583; 1584; GCNX3-NOHSA-LABEL: global_sextload_v2i32_to_v2i64: 1585; GCNX3-NOHSA: ; %bb.0: 1586; GCNX3-NOHSA-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x24 1587; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 1588; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 1589; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 1590; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 1591; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1592; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 1593; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 1594; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1595; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 1596; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 1597; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) 1598; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v1 1599; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v1 1600; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1601; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1602; GCNX3-NOHSA-NEXT: s_endpgm 1603; 1604; EG-LABEL: global_sextload_v2i32_to_v2i64: 1605; EG: ; %bb.0: 1606; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1607; EG-NEXT: TEX 0 @6 1608; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[] 1609; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 1610; EG-NEXT: CF_END 1611; EG-NEXT: PAD 1612; EG-NEXT: Fetch clause starting at 6: 1613; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 1614; EG-NEXT: ALU clause starting at 8: 1615; EG-NEXT: MOV * T0.X, KC0[2].Z, 1616; EG-NEXT: ALU clause starting at 9: 1617; EG-NEXT: ASHR * T1.W, T0.Y, literal.x, 1618; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1619; EG-NEXT: ASHR * T1.Y, T0.X, literal.x, 1620; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1621; EG-NEXT: MOV T1.X, T0.X, 1622; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 1623; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1624; EG-NEXT: MOV * T1.Z, T0.Y, 1625; 1626; GCN-HSA-LABEL: global_sextload_v2i32_to_v2i64: 1627; GCN-HSA: ; %bb.0: 1628; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1629; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0 1630; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) 1631; GCN-HSA-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] 1632; GCN-HSA-NEXT: s_waitcnt vmcnt(0) 1633; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v1 1634; 
GCN-HSA-NEXT: v_mov_b32_e32 v2, v1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GCN-HSA-NEXT: s_endpgm
  %ld = load <2 x i32>, ptr addrspace(1) %in
  %ext = sext <2 x i32> %ld to <2 x i64>
  store <2 x i64> %ext, ptr addrspace(1) %out
  ret void
}

; Kernel under test: loads a <4 x i32> vector through the addrspace(1) pointer
; %in, zero-extends every element to i64, and stores the resulting <4 x i64>
; through the addrspace(1) pointer %out.
; The check lines inside the function body are autogenerated (see the note on
; the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_zextload_v4i32_to_v4i64:
; SI-NOHSA: ; %bb.0:
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: v_mov_b32_e32 v5, 0
; SI-NOHSA-NEXT: v_mov_b32_e32 v7, v5
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v2
; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v3
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt expcnt(0)
; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v0
; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v1
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; SI-NOHSA-NEXT: s_endpgm
;
; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3,
s1, 0 1681; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 1682; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 1683; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) 1684; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2 1685; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v3 1686; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 1687; GCNX3-HSA-NEXT: s_nop 0 1688; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v0 1689; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v1 1690; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0 1691; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1 1692; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] 1693; GCNX3-HSA-NEXT: s_endpgm 1694; 1695; GCNX3-NOHSA-LABEL: global_zextload_v4i32_to_v4i64: 1696; GCNX3-NOHSA: ; %bb.0: 1697; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1698; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 1699; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 1700; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 1701; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 1702; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1703; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 1704; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 1705; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 1706; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v5, 0 1707; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v7, v5 1708; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 1709; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 1710; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) 1711; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v2 1712; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v3 1713; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 1714; GCNX3-NOHSA-NEXT: s_nop 0 1715; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v0 1716; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v1 1717; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 1718; GCNX3-NOHSA-NEXT: s_endpgm 1719; 1720; EG-LABEL: global_zextload_v4i32_to_v4i64: 1721; EG: ; %bb.0: 1722; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1723; EG-NEXT: TEX 0 @6 1724; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1725; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 1726; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1 
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV T1.X, T0.Z,
; EG-NEXT: MOV T1.Y, 0.0,
; EG-NEXT: MOV * T2.X, T0.X,
; EG-NEXT: MOV T2.Y, 0.0,
; EG-NEXT: MOV T1.Z, T0.W,
; EG-NEXT: MOV T1.W, 0.0,
; EG-NEXT: MOV * T2.Z, T0.Y,
; EG-NEXT: MOV * T2.W, 0.0,
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GCN-HSA-LABEL: global_zextload_v4i32_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, v6
; GCN-HSA-NEXT: v_mov_b32_e32 v2, v7
; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
; GCN-HSA-NEXT: s_nop 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v2, v5
; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
; GCN-HSA-NEXT: s_endpgm
  %ld = load <4 x i32>, ptr addrspace(1) %in
  %ext = zext <4 x i32> %ld to <4 x i64>
  store <4 x i64> %ext, ptr addrspace(1) %out
  ret void
}

; Kernel under test: loads a <4 x i32> vector through the addrspace(1) pointer
; %in, sign-extends every element to i64, and stores the resulting <4 x i64>
; through the addrspace(1) pointer %out.
; The check lines inside the function body are autogenerated (see the note on
; the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_sextload_v4i32_to_v4i64:
; SI-NOHSA: ; %bb.0:
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32
s11, s7 1777; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1778; SI-NOHSA-NEXT: s_mov_b32 s8, s2 1779; SI-NOHSA-NEXT: s_mov_b32 s9, s3 1780; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 1781; SI-NOHSA-NEXT: s_mov_b32 s4, s0 1782; SI-NOHSA-NEXT: s_mov_b32 s5, s1 1783; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) 1784; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v6, 31, v1 1785; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 1786; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3 1787; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2 1788; SI-NOHSA-NEXT: v_mov_b32_e32 v7, v2 1789; SI-NOHSA-NEXT: v_mov_b32_e32 v9, v3 1790; SI-NOHSA-NEXT: v_mov_b32_e32 v3, v0 1791; SI-NOHSA-NEXT: v_mov_b32_e32 v5, v1 1792; SI-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16 1793; SI-NOHSA-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 1794; SI-NOHSA-NEXT: s_endpgm 1795; 1796; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64: 1797; GCNX3-HSA: ; %bb.0: 1798; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1799; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) 1800; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 1801; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 1802; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1803; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 1804; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 1805; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s3 1806; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s1 1807; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s2 1808; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, s0 1809; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0) 1810; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3 1811; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2 1812; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v2 1813; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, v3 1814; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v1 1815; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 1816; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v0 1817; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, v1 1818; GCNX3-HSA-NEXT: flat_store_dwordx4 v[13:14], v[7:10] 1819; GCNX3-HSA-NEXT: flat_store_dwordx4 v[11:12], v[3:6] 1820; GCNX3-HSA-NEXT: 
s_endpgm 1821; 1822; GCNX3-NOHSA-LABEL: global_sextload_v4i32_to_v4i64: 1823; GCNX3-NOHSA: ; %bb.0: 1824; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1825; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 1826; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 1827; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 1828; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 1829; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1830; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 1831; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 1832; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 1833; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 1834; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 1835; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) 1836; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3 1837; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2 1838; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v7, v2 1839; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v9, v3 1840; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v6, 31, v1 1841; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 1842; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v3, v0 1843; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v5, v1 1844; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16 1845; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 1846; GCNX3-NOHSA-NEXT: s_endpgm 1847; 1848; EG-LABEL: global_sextload_v4i32_to_v4i64: 1849; EG: ; %bb.0: 1850; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1851; EG-NEXT: TEX 0 @6 1852; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[] 1853; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 0 1854; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 1 1855; EG-NEXT: CF_END 1856; EG-NEXT: Fetch clause starting at 6: 1857; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 1858; EG-NEXT: ALU clause starting at 8: 1859; EG-NEXT: MOV * T0.X, KC0[2].Z, 1860; EG-NEXT: ALU clause starting at 9: 1861; EG-NEXT: ASHR * T1.W, T0.Y, literal.x, 1862; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1863; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 1864; EG-NEXT: ASHR T1.Y, T0.X, literal.y, 1865; EG-NEXT: ASHR T3.W, T0.W, literal.y, 
; EG-NEXT: MOV * T1.X, T0.X,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
; EG-NEXT: ASHR * T3.Y, T0.Z, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: MOV T3.X, T0.Z,
; EG-NEXT: MOV T1.Z, T0.Y,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, PV.W, literal.x,
; EG-NEXT: MOV * T3.Z, T0.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GCN-HSA-LABEL: global_sextload_v4i32_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v11, s[2:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2
; GCN-HSA-NEXT: v_mov_b32_e32 v7, v2
; GCN-HSA-NEXT: v_mov_b32_e32 v9, v3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1
; GCN-HSA-NEXT: global_store_dwordx4 v11, v[7:10], s[0:1] offset:16
; GCN-HSA-NEXT: global_store_dwordx4 v11, v[3:6], s[0:1]
; GCN-HSA-NEXT: s_endpgm
  %ld = load <4 x i32>, ptr addrspace(1) %in
  %ext = sext <4 x i32> %ld to <4 x i64>
  store <4 x i64> %ext, ptr addrspace(1) %out
  ret void
}

; Kernel under test: loads an <8 x i32> vector through the addrspace(1) pointer
; %in, zero-extends every element to i64, and stores the resulting <8 x i64>
; through the addrspace(1) pointer %out.
; The check lines inside the function body are autogenerated (see the note on
; the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_zextload_v8i32_to_v8i64:
; SI-NOHSA: ; %bb.0:
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
;
SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 1914; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 1915; SI-NOHSA-NEXT: v_mov_b32_e32 v9, 0 1916; SI-NOHSA-NEXT: v_mov_b32_e32 v11, v9 1917; SI-NOHSA-NEXT: s_mov_b32 s4, s0 1918; SI-NOHSA-NEXT: s_mov_b32 s5, s1 1919; SI-NOHSA-NEXT: s_waitcnt vmcnt(1) 1920; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v2 1921; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v3 1922; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 1923; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 1924; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v0 1925; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v1 1926; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 1927; SI-NOHSA-NEXT: s_waitcnt vmcnt(2) expcnt(0) 1928; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v6 1929; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v7 1930; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16 1931; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 1932; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v4 1933; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v5 1934; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 1935; SI-NOHSA-NEXT: s_endpgm 1936; 1937; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64: 1938; GCNX3-HSA: ; %bb.0: 1939; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1940; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, 0 1941; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9 1942; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) 1943; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 1944; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 1945; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 1946; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 1947; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1948; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 1949; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 1950; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 1951; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 1952; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 1953; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3 1954; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2 1955; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48 1956; 
GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1 1957; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 1958; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0 1959; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s3 1960; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32 1961; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s2 1962; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 1963; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1) 1964; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v2 1965; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v3 1966; GCNX3-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] 1967; GCNX3-HSA-NEXT: s_nop 0 1968; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v0 1969; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v1 1970; GCNX3-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] 1971; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0 1972; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2) 1973; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6 1974; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7 1975; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] 1976; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1 1977; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v4 1978; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v5 1979; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] 1980; GCNX3-HSA-NEXT: s_endpgm 1981; 1982; GCNX3-NOHSA-LABEL: global_zextload_v8i32_to_v8i64: 1983; GCNX3-NOHSA: ; %bb.0: 1984; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1985; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 1986; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 1987; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 1988; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 1989; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 1990; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 1991; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 1992; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 1993; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 1994; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v9, 0 1995; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v11, v9 1996; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 1997; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 1998; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1) 1999; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v2 2000; GCNX3-NOHSA-NEXT: 
v_mov_b32_e32 v10, v3 2001; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 2002; GCNX3-NOHSA-NEXT: s_nop 0 2003; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v0 2004; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v1 2005; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 2006; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) 2007; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v6 2008; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v7 2009; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16 2010; GCNX3-NOHSA-NEXT: s_nop 0 2011; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v4 2012; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v5 2013; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 2014; GCNX3-NOHSA-NEXT: s_endpgm 2015; 2016; EG-LABEL: global_zextload_v8i32_to_v8i64: 2017; EG: ; %bb.0: 2018; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] 2019; EG-NEXT: TEX 1 @8 2020; EG-NEXT: ALU 26, @13, KC0[CB0:0-32], KC1[] 2021; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T7.X, 0 2022; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0 2023; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T1.X, 0 2024; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T0.X, 1 2025; EG-NEXT: CF_END 2026; EG-NEXT: Fetch clause starting at 8: 2027; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 2028; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 2029; EG-NEXT: ALU clause starting at 12: 2030; EG-NEXT: MOV * T0.X, KC0[2].Z, 2031; EG-NEXT: ALU clause starting at 13: 2032; EG-NEXT: MOV T2.X, T1.Z, 2033; EG-NEXT: MOV T2.Y, 0.0, 2034; EG-NEXT: MOV * T3.X, T1.X, 2035; EG-NEXT: MOV * T3.Y, 0.0, 2036; EG-NEXT: MOV T4.X, T0.Z, 2037; EG-NEXT: MOV T4.Y, 0.0, 2038; EG-NEXT: MOV * T5.X, T0.X, 2039; EG-NEXT: MOV T5.Y, 0.0, 2040; EG-NEXT: MOV T2.Z, T1.W, 2041; EG-NEXT: MOV T2.W, 0.0, 2042; EG-NEXT: MOV * T3.Z, T1.Y, 2043; EG-NEXT: MOV * T3.W, 0.0, 2044; EG-NEXT: MOV T4.Z, T0.W, 2045; EG-NEXT: MOV T4.W, 0.0, 2046; EG-NEXT: MOV * T5.Z, T0.Y, 2047; EG-NEXT: MOV * T5.W, 0.0, 2048; EG-NEXT: LSHR T0.X, KC0[2].Y, 
literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T1.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT: LSHR T6.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GCN-HSA-LABEL: global_zextload_v8i32_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3] offset:16
; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v1, s[2:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, v6
; GCN-HSA-NEXT: v_mov_b32_e32 v2, v7
; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:48
; GCN-HSA-NEXT: s_nop 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v2, v5
; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:32
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10
; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11
; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:16
; GCN-HSA-NEXT: s_nop 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v2, v9
; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
; GCN-HSA-NEXT: s_endpgm
  %ld = load <8 x i32>, ptr addrspace(1) %in
  %ext = zext <8 x i32> %ld to <8 x i64>
  store <8 x i64> %ext, ptr addrspace(1) %out
  ret void
}

; Kernel under test: loads an <8 x i32> vector through the addrspace(1) pointer
; %in, sign-extends every element to i64, and stores the resulting <8 x i64>
; through the addrspace(1) pointer %out.
; The check lines inside the function body are autogenerated (see the note on
; the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_sextload_v8i32_to_v8i64:
; SI-NOHSA: ;
%bb.0: 2094; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2095; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 2096; SI-NOHSA-NEXT: s_mov_b32 s6, -1 2097; SI-NOHSA-NEXT: s_mov_b32 s10, s6 2098; SI-NOHSA-NEXT: s_mov_b32 s11, s7 2099; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 2100; SI-NOHSA-NEXT: s_mov_b32 s8, s2 2101; SI-NOHSA-NEXT: s_mov_b32 s9, s3 2102; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 2103; SI-NOHSA-NEXT: s_mov_b32 s4, s0 2104; SI-NOHSA-NEXT: s_mov_b32 s5, s1 2105; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 2106; SI-NOHSA-NEXT: s_waitcnt vmcnt(1) 2107; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 2108; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v0 2109; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v14, 31, v3 2110; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v12, 31, v2 2111; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) 2112; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v18, 31, v5 2113; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v16, 31, v4 2114; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v22, 31, v7 2115; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v6 2116; SI-NOHSA-NEXT: v_mov_b32_e32 v19, v6 2117; SI-NOHSA-NEXT: v_mov_b32_e32 v21, v7 2118; SI-NOHSA-NEXT: v_mov_b32_e32 v15, v4 2119; SI-NOHSA-NEXT: v_mov_b32_e32 v17, v5 2120; SI-NOHSA-NEXT: v_mov_b32_e32 v11, v2 2121; SI-NOHSA-NEXT: v_mov_b32_e32 v13, v3 2122; SI-NOHSA-NEXT: v_mov_b32_e32 v7, v0 2123; SI-NOHSA-NEXT: v_mov_b32_e32 v9, v1 2124; SI-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[4:7], 0 offset:48 2125; SI-NOHSA-NEXT: buffer_store_dwordx4 v[15:18], off, s[4:7], 0 offset:32 2126; SI-NOHSA-NEXT: buffer_store_dwordx4 v[11:14], off, s[4:7], 0 offset:16 2127; SI-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 2128; SI-NOHSA-NEXT: s_endpgm 2129; 2130; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64: 2131; GCNX3-HSA: ; %bb.0: 2132; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2133; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) 2134; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 2135; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 2136; 
GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 2137; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 2138; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 2139; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2140; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 2141; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 2142; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 2143; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 2144; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, s3 2145; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, s2 2146; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48 2147; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s1 2148; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 2149; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s0 2150; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 32 2151; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 2152; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 2153; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 2154; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s1 2155; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s0 2156; GCNX3-HSA-NEXT: s_waitcnt vmcnt(1) 2157; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1 2158; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v0 2159; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v3 2160; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v2 2161; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v2 2162; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v3 2163; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v0 2164; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v1 2165; GCNX3-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] 2166; GCNX3-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] 2167; GCNX3-HSA-NEXT: s_waitcnt vmcnt(2) 2168; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v5 2169; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7 2170; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6 2171; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6 2172; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7 2173; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v4 2174; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v4 2175; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v5 2176; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[8:11] 2177; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] 2178; GCNX3-HSA-NEXT: s_endpgm 
2179; 2180; GCNX3-NOHSA-LABEL: global_sextload_v8i32_to_v8i64: 2181; GCNX3-NOHSA: ; %bb.0: 2182; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 2183; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 2184; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 2185; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 2186; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3 2187; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 2188; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6 2189; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7 2190; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 2191; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 2192; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4 2193; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5 2194; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1) 2195; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 2196; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) 2197; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v22, 31, v7 2198; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v6 2199; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v19, v6 2200; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v21, v7 2201; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v0 2202; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v14, 31, v3 2203; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v12, 31, v2 2204; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v18, 31, v5 2205; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v16, 31, v4 2206; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v15, v4 2207; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v17, v5 2208; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v11, v2 2209; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v13, v3 2210; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v7, v0 2211; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v9, v1 2212; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:48 2213; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:32 2214; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16 2215; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 2216; GCNX3-NOHSA-NEXT: s_endpgm 2217; 2218; EG-LABEL: global_sextload_v8i32_to_v8i64: 2219; EG: ; %bb.0: 2220; EG-NEXT: ALU 0, 
@12, KC0[CB0:0-32], KC1[] 2221; EG-NEXT: TEX 1 @8 2222; EG-NEXT: ALU 31, @13, KC0[CB0:0-32], KC1[] 2223; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T0.X, 0 2224; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T5.X, 0 2225; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T3.X, 0 2226; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T2.X, 1 2227; EG-NEXT: CF_END 2228; EG-NEXT: Fetch clause starting at 8: 2229; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 2230; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 2231; EG-NEXT: ALU clause starting at 12: 2232; EG-NEXT: MOV * T0.X, KC0[2].Z, 2233; EG-NEXT: ALU clause starting at 13: 2234; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 2235; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, 2236; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) 2237; EG-NEXT: LSHR T3.X, PV.W, literal.x, 2238; EG-NEXT: ADD_INT T2.W, KC0[2].Y, literal.y, 2239; EG-NEXT: ASHR * T4.W, T0.Y, literal.z, 2240; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 2241; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 2242; EG-NEXT: LSHR T5.X, PV.W, literal.x, 2243; EG-NEXT: ASHR T4.Y, T0.X, literal.y, 2244; EG-NEXT: ASHR T6.W, T0.W, literal.y, 2245; EG-NEXT: MOV * T4.X, T0.X, 2246; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) 2247; EG-NEXT: ASHR T6.Y, T0.Z, literal.x, 2248; EG-NEXT: ASHR * T7.W, T1.Y, literal.x, 2249; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 2250; EG-NEXT: MOV T6.X, T0.Z, 2251; EG-NEXT: ASHR T7.Y, T1.X, literal.x, 2252; EG-NEXT: MOV T4.Z, T0.Y, 2253; EG-NEXT: ASHR T8.W, T1.W, literal.x, 2254; EG-NEXT: MOV * T7.X, T1.X, 2255; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 2256; EG-NEXT: ASHR T8.Y, T1.Z, literal.x, 2257; EG-NEXT: MOV * T6.Z, T0.W, 2258; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 2259; EG-NEXT: MOV T8.X, T1.Z, 2260; EG-NEXT: MOV T7.Z, T1.Y, 2261; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 2262; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) 2263; EG-NEXT: LSHR T0.X, PV.W, literal.x, 2264; EG-NEXT: MOV * T8.Z, T1.W, 2265; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 
2266; 2267; GCN-HSA-LABEL: global_sextload_v8i32_to_v8i64: 2268; GCN-HSA: ; %bb.0: 2269; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2270; GCN-HSA-NEXT: v_mov_b32_e32 v23, 0 2271; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) 2272; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v23, s[2:3] 2273; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v23, s[2:3] offset:16 2274; GCN-HSA-NEXT: s_waitcnt vmcnt(1) 2275; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v1 2276; GCN-HSA-NEXT: s_waitcnt vmcnt(0) 2277; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v7 2278; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v6 2279; GCN-HSA-NEXT: v_mov_b32_e32 v19, v6 2280; GCN-HSA-NEXT: v_mov_b32_e32 v21, v7 2281; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v0 2282; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v3 2283; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v2 2284; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v5 2285; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v4 2286; GCN-HSA-NEXT: v_mov_b32_e32 v15, v4 2287; GCN-HSA-NEXT: v_mov_b32_e32 v17, v5 2288; GCN-HSA-NEXT: v_mov_b32_e32 v11, v2 2289; GCN-HSA-NEXT: v_mov_b32_e32 v13, v3 2290; GCN-HSA-NEXT: v_mov_b32_e32 v7, v0 2291; GCN-HSA-NEXT: v_mov_b32_e32 v9, v1 2292; GCN-HSA-NEXT: global_store_dwordx4 v23, v[19:22], s[0:1] offset:48 2293; GCN-HSA-NEXT: global_store_dwordx4 v23, v[15:18], s[0:1] offset:32 2294; GCN-HSA-NEXT: global_store_dwordx4 v23, v[11:14], s[0:1] offset:16 2295; GCN-HSA-NEXT: global_store_dwordx4 v23, v[7:10], s[0:1] 2296; GCN-HSA-NEXT: s_endpgm 2297 %ld = load <8 x i32>, ptr addrspace(1) %in 2298 %ext = sext <8 x i32> %ld to <8 x i64> 2299 store <8 x i64> %ext, ptr addrspace(1) %out 2300 ret void 2301} 2302 2303define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 2304; SI-NOHSA-LABEL: global_sextload_v16i32_to_v16i64: 2305; SI-NOHSA: ; %bb.0: 2306; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 2307; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 2308; SI-NOHSA-NEXT: s_mov_b32 s2, -1 2309; SI-NOHSA-NEXT: s_mov_b32 
s10, s2 2310; SI-NOHSA-NEXT: s_mov_b32 s11, s3 2311; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 2312; SI-NOHSA-NEXT: s_mov_b32 s8, s6 2313; SI-NOHSA-NEXT: s_mov_b32 s9, s7 2314; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 2315; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 2316; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 2317; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 2318; SI-NOHSA-NEXT: s_waitcnt vmcnt(3) 2319; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3 2320; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2 2321; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1 2322; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v0 2323; SI-NOHSA-NEXT: v_mov_b32_e32 v20, v0 2324; SI-NOHSA-NEXT: v_mov_b32_e32 v22, v1 2325; SI-NOHSA-NEXT: v_mov_b32_e32 v16, v2 2326; SI-NOHSA-NEXT: v_mov_b32_e32 v18, v3 2327; SI-NOHSA-NEXT: s_waitcnt vmcnt(2) 2328; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7 2329; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v6 2330; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5 2331; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4 2332; SI-NOHSA-NEXT: v_mov_b32_e32 v24, v4 2333; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v5 2334; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6 2335; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7 2336; SI-NOHSA-NEXT: s_waitcnt vmcnt(1) 2337; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11 2338; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v10 2339; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9 2340; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8 2341; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v8 2342; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v9 2343; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v10 2344; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v11 2345; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) 2346; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 2347; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14 2348; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13 2349; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12 2350; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v12 2351; 
SI-NOHSA-NEXT: v_mov_b32_e32 v34, v13 2352; SI-NOHSA-NEXT: v_mov_b32_e32 v8, v14 2353; SI-NOHSA-NEXT: v_mov_b32_e32 v10, v15 2354; SI-NOHSA-NEXT: s_mov_b32 s0, s4 2355; SI-NOHSA-NEXT: s_mov_b32 s1, s5 2356; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 2357; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 2358; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64 2359; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 2360; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32 2361; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48 2362; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 2363; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 2364; SI-NOHSA-NEXT: s_endpgm 2365; 2366; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64: 2367; GCNX3-HSA: ; %bb.0: 2368; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2369; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) 2370; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 2371; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 2372; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] 2373; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 2374; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 2375; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 2376; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 2377; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 2378; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] 2379; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 2380; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 2381; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 2382; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 2383; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] 2384; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 2385; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 2386; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 2387; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2388; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 2389; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 2390; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 2391; GCNX3-HSA-NEXT: 
v_mov_b32_e32 v22, s2 2392; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 2393; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 2394; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 2395; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2 2396; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 2397; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 2398; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 2399; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 2400; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 2401; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 2402; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0 2403; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 2404; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) 2405; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v9 2406; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v8 2407; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8 2408; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9 2409; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] 2410; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 2411; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 2412; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 2413; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 2414; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 2415; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 2416; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 2417; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11 2418; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v10 2419; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10 2420; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11 2421; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 2422; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] 2423; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4) 2424; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 2425; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14 2426; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13 2427; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v12 2428; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12 2429; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13 2430; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v14 2431; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v15 2432; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 2433; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 2434; GCNX3-HSA-NEXT: s_addc_u32 
s1, s1, 0 2435; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] 2436; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] 2437; GCNX3-HSA-NEXT: s_waitcnt vmcnt(5) 2438; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5 2439; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7 2440; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6 2441; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v4 2442; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4 2443; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5 2444; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v6 2445; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v7 2446; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2 2447; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s1 2448; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] 2449; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[8:11] 2450; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s0 2451; GCNX3-HSA-NEXT: s_waitcnt vmcnt(6) 2452; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v1 2453; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v0 2454; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, v0 2455; GCNX3-HSA-NEXT: v_mov_b32_e32 v10, v1 2456; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 2457; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 2458; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2 2459; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v3 2460; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[8:11] 2461; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[4:7] 2462; GCNX3-HSA-NEXT: s_endpgm 2463; 2464; GCNX3-NOHSA-LABEL: global_sextload_v16i32_to_v16i64: 2465; GCNX3-NOHSA: ; %bb.0: 2466; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 2467; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 2468; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 2469; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 2470; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3 2471; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 2472; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6 2473; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7 2474; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 2475; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 2476; GCNX3-NOHSA-NEXT: 
buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 2477; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 2478; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4 2479; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5 2480; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3) 2481; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3 2482; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) 2483; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5 2484; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4 2485; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v24, v4 2486; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v26, v5 2487; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2 2488; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7 2489; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6 2490; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v20, v6 2491; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v22, v7 2492; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 2493; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 2494; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v0 2495; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v1 2496; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v2 2497; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v3 2498; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1) 2499; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11 2500; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10 2501; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9 2502; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8 2503; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v8 2504; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v9 2505; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v10 2506; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v11 2507; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0) 2508; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 2509; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14 2510; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13 2511; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12 2512; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v12 2513; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v13 2514; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v14 2515; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v15 2516; 
GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96 2517; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112 2518; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 2519; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 2520; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32 2521; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 2522; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 2523; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 2524; GCNX3-NOHSA-NEXT: s_endpgm 2525; 2526; EG-LABEL: global_sextload_v16i32_to_v16i64: 2527; EG: ; %bb.0: 2528; EG-NEXT: ALU 0, @20, KC0[CB0:0-32], KC1[] 2529; EG-NEXT: TEX 3 @12 2530; EG-NEXT: ALU 64, @21, KC0[CB0:0-32], KC1[] 2531; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T1.X, 0 2532; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T11.X, 0 2533; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T9.X, 0 2534; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T8.X, 0 2535; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T7.X, 0 2536; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T6.X, 0 2537; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T5.X, 0 2538; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T4.X, 1 2539; EG-NEXT: CF_END 2540; EG-NEXT: Fetch clause starting at 12: 2541; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1 2542; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1 2543; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 16, #1 2544; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 2545; EG-NEXT: ALU clause starting at 20: 2546; EG-NEXT: MOV * T0.X, KC0[2].Z, 2547; EG-NEXT: ALU clause starting at 21: 2548; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.x, 2549; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2550; EG-NEXT: LSHR T4.X, PV.W, literal.x, 2551; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 2552; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2553; EG-NEXT: ADD_INT * T4.W, 
KC0[2].Y, literal.x, 2554; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) 2555; EG-NEXT: LSHR T6.X, PV.W, literal.x, 2556; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, 2557; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 2558; EG-NEXT: LSHR T7.X, PV.W, literal.x, 2559; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, 2560; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) 2561; EG-NEXT: LSHR T8.X, PV.W, literal.x, 2562; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y, 2563; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44) 2564; EG-NEXT: LSHR T9.X, PV.W, literal.x, 2565; EG-NEXT: ADD_INT T4.W, KC0[2].Y, literal.y, 2566; EG-NEXT: ASHR * T10.W, T0.W, literal.z, 2567; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43) 2568; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 2569; EG-NEXT: LSHR T11.X, PV.W, literal.x, 2570; EG-NEXT: ASHR T10.Y, T0.Z, literal.y, 2571; EG-NEXT: ASHR T12.W, T0.Y, literal.y, 2572; EG-NEXT: MOV * T10.X, T0.Z, 2573; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) 2574; EG-NEXT: ASHR T12.Y, T0.X, literal.x, 2575; EG-NEXT: ASHR * T13.W, T3.W, literal.x, 2576; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 2577; EG-NEXT: MOV T12.X, T0.X, 2578; EG-NEXT: ASHR T13.Y, T3.Z, literal.x, 2579; EG-NEXT: MOV T10.Z, T0.W, 2580; EG-NEXT: ASHR T14.W, T3.Y, literal.x, 2581; EG-NEXT: MOV * T13.X, T3.Z, 2582; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 2583; EG-NEXT: ASHR T14.Y, T3.X, literal.x, 2584; EG-NEXT: MOV T12.Z, T0.Y, 2585; EG-NEXT: ASHR * T0.W, T2.W, literal.x, 2586; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 2587; EG-NEXT: MOV T14.X, T3.X, 2588; EG-NEXT: ASHR T0.Y, T2.Z, literal.x, 2589; EG-NEXT: MOV T13.Z, T3.W, 2590; EG-NEXT: ASHR T15.W, T2.Y, literal.x, 2591; EG-NEXT: MOV * T0.X, T2.Z, 2592; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 2593; EG-NEXT: ASHR T15.Y, T2.X, literal.x, 2594; EG-NEXT: MOV T14.Z, T3.Y, 2595; EG-NEXT: ASHR * T3.W, T1.W, literal.x, 2596; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 2597; EG-NEXT: MOV T15.X, T2.X, 2598; EG-NEXT: ASHR T3.Y, T1.Z, literal.x, 2599; EG-NEXT: MOV 
T0.Z, T2.W, 2600; EG-NEXT: ASHR T16.W, T1.Y, literal.x, 2601; EG-NEXT: MOV * T3.X, T1.Z, 2602; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 2603; EG-NEXT: ASHR T16.Y, T1.X, literal.x, 2604; EG-NEXT: MOV * T15.Z, T2.Y, 2605; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 2606; EG-NEXT: MOV T16.X, T1.X, 2607; EG-NEXT: MOV T3.Z, T1.W, 2608; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 2609; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) 2610; EG-NEXT: LSHR T1.X, PV.W, literal.x, 2611; EG-NEXT: MOV * T16.Z, T1.Y, 2612; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2613; 2614; GCN-HSA-LABEL: global_sextload_v16i32_to_v16i64: 2615; GCN-HSA: ; %bb.0: 2616; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2617; GCN-HSA-NEXT: v_mov_b32_e32 v36, 0 2618; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) 2619; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v36, s[2:3] offset:32 2620; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v36, s[2:3] offset:48 2621; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v36, s[2:3] offset:16 2622; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v36, s[2:3] 2623; GCN-HSA-NEXT: s_waitcnt vmcnt(3) 2624; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3 2625; GCN-HSA-NEXT: s_waitcnt vmcnt(2) 2626; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v5 2627; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v4 2628; GCN-HSA-NEXT: v_mov_b32_e32 v24, v4 2629; GCN-HSA-NEXT: v_mov_b32_e32 v26, v5 2630; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v2 2631; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v7 2632; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v6 2633; GCN-HSA-NEXT: v_mov_b32_e32 v20, v6 2634; GCN-HSA-NEXT: v_mov_b32_e32 v22, v7 2635; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 2636; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 2637; GCN-HSA-NEXT: v_mov_b32_e32 v4, v0 2638; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1 2639; GCN-HSA-NEXT: v_mov_b32_e32 v16, v2 2640; GCN-HSA-NEXT: v_mov_b32_e32 v18, v3 2641; GCN-HSA-NEXT: s_waitcnt vmcnt(1) 2642; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v11 2643; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v10 
2644; GCN-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9 2645; GCN-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8 2646; GCN-HSA-NEXT: v_mov_b32_e32 v28, v8 2647; GCN-HSA-NEXT: v_mov_b32_e32 v30, v9 2648; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10 2649; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11 2650; GCN-HSA-NEXT: s_waitcnt vmcnt(0) 2651; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v15 2652; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v14 2653; GCN-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v13 2654; GCN-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v12 2655; GCN-HSA-NEXT: v_mov_b32_e32 v32, v12 2656; GCN-HSA-NEXT: v_mov_b32_e32 v34, v13 2657; GCN-HSA-NEXT: v_mov_b32_e32 v8, v14 2658; GCN-HSA-NEXT: v_mov_b32_e32 v10, v15 2659; GCN-HSA-NEXT: global_store_dwordx4 v36, v[24:27], s[0:1] offset:96 2660; GCN-HSA-NEXT: global_store_dwordx4 v36, v[20:23], s[0:1] offset:112 2661; GCN-HSA-NEXT: global_store_dwordx4 v36, v[4:7], s[0:1] offset:64 2662; GCN-HSA-NEXT: global_store_dwordx4 v36, v[16:19], s[0:1] offset:80 2663; GCN-HSA-NEXT: global_store_dwordx4 v36, v[28:31], s[0:1] offset:32 2664; GCN-HSA-NEXT: global_store_dwordx4 v36, v[0:3], s[0:1] offset:48 2665; GCN-HSA-NEXT: global_store_dwordx4 v36, v[32:35], s[0:1] 2666; GCN-HSA-NEXT: global_store_dwordx4 v36, v[8:11], s[0:1] offset:16 2667; GCN-HSA-NEXT: s_endpgm 2668 %ld = load <16 x i32>, ptr addrspace(1) %in 2669 %ext = sext <16 x i32> %ld to <16 x i64> 2670 store <16 x i64> %ext, ptr addrspace(1) %out 2671 ret void 2672} 2673 2674define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 2675; SI-NOHSA-LABEL: global_zextload_v16i32_to_v16i64: 2676; SI-NOHSA: ; %bb.0: 2677; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 2678; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 2679; SI-NOHSA-NEXT: s_mov_b32 s2, -1 2680; SI-NOHSA-NEXT: s_mov_b32 s10, s2 2681; SI-NOHSA-NEXT: s_mov_b32 s11, s3 2682; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 2683; SI-NOHSA-NEXT: s_mov_b32 s8, s6 2684; SI-NOHSA-NEXT: s_mov_b32 s9, s7 2685; SI-NOHSA-NEXT: 
buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 2686; SI-NOHSA-NEXT: v_mov_b32_e32 v5, 0 2687; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 2688; SI-NOHSA-NEXT: v_mov_b32_e32 v7, v5 2689; SI-NOHSA-NEXT: s_mov_b32 s0, s4 2690; SI-NOHSA-NEXT: s_mov_b32 s1, s5 2691; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 2692; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16 2693; SI-NOHSA-NEXT: s_waitcnt vmcnt(3) 2694; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v0 2695; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v1 2696; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 2697; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 2698; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v2 2699; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v3 2700; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:112 2701; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) expcnt(0) 2702; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v8 2703; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v9 2704; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 2705; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 2706; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v10 2707; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v11 2708; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 2709; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) expcnt(0) 2710; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v16 2711; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v17 2712; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 2713; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 2714; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v18 2715; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v19 2716; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48 2717; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 2718; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v12 2719; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v13 2720; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 2721; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 2722; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v14 2723; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v15 2724; SI-NOHSA-NEXT: 
buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 2725; SI-NOHSA-NEXT: s_endpgm 2726; 2727; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64: 2728; GCNX3-HSA: ; %bb.0: 2729; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2730; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, 0 2731; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17 2732; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) 2733; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 2734; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 2735; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32 2736; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 2737; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 2738; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 2739; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 48 2740; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 2741; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2742; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 2743; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 2744; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 2745; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7 2746; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6 2747; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] 2748; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5 2749; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4 2750; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] 2751; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 2752; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 2753; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, s3 2754; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, s2 2755; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 2756; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 2757; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 2758; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2 2759; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 2760; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 2761; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 2762; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1 2763; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 2764; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 2765; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0 2766; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 2767; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3) 2768; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v0 2769; GCNX3-HSA-NEXT: 
v_mov_b32_e32 v18, v1 2770; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 2771; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] 2772; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 2773; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 2774; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v2 2775; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v3 2776; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 2777; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s2 2778; GCNX3-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] 2779; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s3 2780; GCNX3-HSA-NEXT: s_waitcnt vmcnt(4) 2781; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v4 2782; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v5 2783; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 2784; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] 2785; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 2786; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v6 2787; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v7 2788; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19] 2789; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s3 2790; GCNX3-HSA-NEXT: s_waitcnt vmcnt(5) 2791; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v8 2792; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v9 2793; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 2794; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] 2795; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s2 2796; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v10 2797; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v11 2798; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 2799; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[16:19] 2800; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0 2801; GCNX3-HSA-NEXT: s_waitcnt vmcnt(6) 2802; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v12 2803; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v13 2804; GCNX3-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] 2805; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1 2806; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, v14 2807; GCNX3-HSA-NEXT: v_mov_b32_e32 v18, v15 2808; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] 2809; GCNX3-HSA-NEXT: s_endpgm 2810; 2811; GCNX3-NOHSA-LABEL: global_zextload_v16i32_to_v16i64: 2812; GCNX3-NOHSA: ; %bb.0: 2813; GCNX3-NOHSA-NEXT: 
s_load_dwordx4 s[4:7], s[4:5], 0x24 2814; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 2815; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 2816; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 2817; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3 2818; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 2819; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6 2820; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7 2821; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 2822; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 2823; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 2824; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 2825; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v17, 0 2826; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v19, v17 2827; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4 2828; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5 2829; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3) 2830; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v0 2831; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v1 2832; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 2833; GCNX3-NOHSA-NEXT: s_nop 0 2834; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v2 2835; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v3 2836; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 2837; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(4) 2838; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v4 2839; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v5 2840; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64 2841; GCNX3-NOHSA-NEXT: s_nop 0 2842; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v6 2843; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v7 2844; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 2845; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(5) 2846; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v8 2847; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v9 2848; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 2849; GCNX3-NOHSA-NEXT: s_nop 0 2850; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v10 2851; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v11 2852; 
GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48 2853; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6) 2854; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v12 2855; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v13 2856; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 2857; GCNX3-NOHSA-NEXT: s_nop 0 2858; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v14 2859; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v15 2860; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 2861; GCNX3-NOHSA-NEXT: s_endpgm 2862; 2863; EG-LABEL: global_zextload_v16i32_to_v16i64: 2864; EG: ; %bb.0: 2865; EG-NEXT: ALU 0, @20, KC0[CB0:0-32], KC1[] 2866; EG-NEXT: TEX 3 @12 2867; EG-NEXT: ALU 55, @21, KC0[CB0:0-32], KC1[] 2868; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T15.X, 0 2869; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T14.X, 0 2870; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T13.X, 0 2871; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T12.X, 0 2872; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T3.X, 0 2873; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T2.X, 0 2874; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T1.X, 0 2875; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T0.X, 1 2876; EG-NEXT: CF_END 2877; EG-NEXT: Fetch clause starting at 12: 2878; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1 2879; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 0, #1 2880; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 16, #1 2881; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 32, #1 2882; EG-NEXT: ALU clause starting at 20: 2883; EG-NEXT: MOV * T0.X, KC0[2].Z, 2884; EG-NEXT: ALU clause starting at 21: 2885; EG-NEXT: MOV T4.X, T1.X, 2886; EG-NEXT: MOV T4.Y, 0.0, 2887; EG-NEXT: MOV * T5.X, T1.Z, 2888; EG-NEXT: MOV * T5.Y, 0.0, 2889; EG-NEXT: MOV T6.X, T0.X, 2890; EG-NEXT: MOV T6.Y, 0.0, 2891; EG-NEXT: MOV * T7.X, T0.Z, 2892; EG-NEXT: MOV * T7.Y, 0.0, 2893; EG-NEXT: MOV T8.X, T3.X, 2894; EG-NEXT: MOV T8.Y, 0.0, 2895; EG-NEXT: MOV * T9.X, T3.Z, 2896; EG-NEXT: MOV * T9.Y, 0.0, 2897; EG-NEXT: MOV T10.X, T2.X, 
2898; EG-NEXT: MOV T10.Y, 0.0, 2899; EG-NEXT: MOV * T11.X, T2.Z, 2900; EG-NEXT: MOV T11.Y, 0.0, 2901; EG-NEXT: MOV T4.Z, T1.Y, 2902; EG-NEXT: MOV T4.W, 0.0, 2903; EG-NEXT: MOV * T5.Z, T1.W, 2904; EG-NEXT: MOV * T5.W, 0.0, 2905; EG-NEXT: MOV T6.Z, T0.Y, 2906; EG-NEXT: MOV T6.W, 0.0, 2907; EG-NEXT: MOV * T7.Z, T0.W, 2908; EG-NEXT: MOV * T7.W, 0.0, 2909; EG-NEXT: MOV T8.Z, T3.Y, 2910; EG-NEXT: MOV T8.W, 0.0, 2911; EG-NEXT: MOV * T9.Z, T3.W, 2912; EG-NEXT: MOV * T9.W, 0.0, 2913; EG-NEXT: MOV T10.Z, T2.Y, 2914; EG-NEXT: MOV T10.W, 0.0, 2915; EG-NEXT: MOV * T11.Z, T2.W, 2916; EG-NEXT: MOV T11.W, 0.0, 2917; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 2918; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2919; EG-NEXT: LSHR T0.X, PS, literal.x, 2920; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2921; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2922; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 2923; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) 2924; EG-NEXT: LSHR T2.X, PV.W, literal.x, 2925; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 2926; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 2927; EG-NEXT: LSHR T3.X, PV.W, literal.x, 2928; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 2929; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) 2930; EG-NEXT: LSHR T12.X, PV.W, literal.x, 2931; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 2932; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44) 2933; EG-NEXT: LSHR T13.X, PV.W, literal.x, 2934; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 2935; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43) 2936; EG-NEXT: LSHR T14.X, PV.W, literal.x, 2937; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 2938; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43) 2939; EG-NEXT: LSHR * T15.X, PV.W, literal.x, 2940; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2941; 2942; GCN-HSA-LABEL: global_zextload_v16i32_to_v16i64: 2943; GCN-HSA: ; %bb.0: 2944; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2945; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 2946; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 2947; 
GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) 2948; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3] offset:48 2949; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v1, s[2:3] offset:32 2950; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v1, s[2:3] offset:16 2951; GCN-HSA-NEXT: global_load_dwordx4 v[16:19], v1, s[2:3] 2952; GCN-HSA-NEXT: s_waitcnt vmcnt(3) 2953; GCN-HSA-NEXT: v_mov_b32_e32 v0, v4 2954; GCN-HSA-NEXT: v_mov_b32_e32 v2, v5 2955; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:96 2956; GCN-HSA-NEXT: s_nop 0 2957; GCN-HSA-NEXT: v_mov_b32_e32 v0, v6 2958; GCN-HSA-NEXT: v_mov_b32_e32 v2, v7 2959; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:112 2960; GCN-HSA-NEXT: s_waitcnt vmcnt(4) 2961; GCN-HSA-NEXT: v_mov_b32_e32 v0, v8 2962; GCN-HSA-NEXT: v_mov_b32_e32 v2, v9 2963; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:64 2964; GCN-HSA-NEXT: s_nop 0 2965; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10 2966; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11 2967; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:80 2968; GCN-HSA-NEXT: s_waitcnt vmcnt(5) 2969; GCN-HSA-NEXT: v_mov_b32_e32 v0, v12 2970; GCN-HSA-NEXT: v_mov_b32_e32 v2, v13 2971; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:32 2972; GCN-HSA-NEXT: s_nop 0 2973; GCN-HSA-NEXT: v_mov_b32_e32 v0, v14 2974; GCN-HSA-NEXT: v_mov_b32_e32 v2, v15 2975; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:48 2976; GCN-HSA-NEXT: s_waitcnt vmcnt(6) 2977; GCN-HSA-NEXT: v_mov_b32_e32 v0, v16 2978; GCN-HSA-NEXT: v_mov_b32_e32 v2, v17 2979; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] 2980; GCN-HSA-NEXT: s_nop 0 2981; GCN-HSA-NEXT: v_mov_b32_e32 v0, v18 2982; GCN-HSA-NEXT: v_mov_b32_e32 v2, v19 2983; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:16 2984; GCN-HSA-NEXT: s_endpgm 2985 %ld = load <16 x i32>, ptr addrspace(1) %in 2986 %ext = zext <16 x i32> %ld to <16 x i64> 2987 store <16 x i64> %ext, ptr addrspace(1) %out 2988 ret void 2989} 2990 
2991define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 2992; SI-NOHSA-LABEL: global_sextload_v32i32_to_v32i64: 2993; SI-NOHSA: ; %bb.0: 2994; SI-NOHSA-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 2995; SI-NOHSA-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 2996; SI-NOHSA-NEXT: s_mov_b32 s14, -1 2997; SI-NOHSA-NEXT: s_mov_b32 s15, 0xe8f000 2998; SI-NOHSA-NEXT: s_add_u32 s12, s12, s11 2999; SI-NOHSA-NEXT: s_addc_u32 s13, s13, 0 3000; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 3001; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 3002; SI-NOHSA-NEXT: s_mov_b32 s2, -1 3003; SI-NOHSA-NEXT: s_mov_b32 s10, s2 3004; SI-NOHSA-NEXT: s_mov_b32 s11, s3 3005; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 3006; SI-NOHSA-NEXT: s_mov_b32 s8, s6 3007; SI-NOHSA-NEXT: s_mov_b32 s9, s7 3008; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:96 3009; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112 3010; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80 3011; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64 3012; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48 3013; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32 3014; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16 3015; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 3016; SI-NOHSA-NEXT: s_waitcnt vmcnt(7) 3017; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v31 3018; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v30 3019; SI-NOHSA-NEXT: s_waitcnt vmcnt(6) 3020; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v15 3021; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v14 3022; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v13 3023; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v12 3024; SI-NOHSA-NEXT: v_mov_b32_e32 v40, v12 3025; SI-NOHSA-NEXT: v_mov_b32_e32 v42, v13 3026; SI-NOHSA-NEXT: v_mov_b32_e32 v36, v14 3027; SI-NOHSA-NEXT: v_mov_b32_e32 v38, v15 3028; 
SI-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29 3029; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v28 3030; SI-NOHSA-NEXT: v_mov_b32_e32 v32, v28 3031; SI-NOHSA-NEXT: v_mov_b32_e32 v34, v29 3032; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v30 3033; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v31 3034; SI-NOHSA-NEXT: buffer_store_dword v44, off, s[12:15], 0 ; 4-byte Folded Spill 3035; SI-NOHSA-NEXT: buffer_store_dword v45, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill 3036; SI-NOHSA-NEXT: buffer_store_dword v46, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill 3037; SI-NOHSA-NEXT: buffer_store_dword v47, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill 3038; SI-NOHSA-NEXT: s_waitcnt vmcnt(9) 3039; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v7 3040; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v6 3041; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 3042; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v5 3043; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v4 3044; SI-NOHSA-NEXT: v_mov_b32_e32 v44, v4 3045; SI-NOHSA-NEXT: v_mov_b32_e32 v46, v5 3046; SI-NOHSA-NEXT: v_mov_b32_e32 v12, v6 3047; SI-NOHSA-NEXT: v_mov_b32_e32 v14, v7 3048; SI-NOHSA-NEXT: s_waitcnt vmcnt(8) 3049; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 3050; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 3051; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v1 3052; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v0 3053; SI-NOHSA-NEXT: v_mov_b32_e32 v48, v0 3054; SI-NOHSA-NEXT: v_mov_b32_e32 v50, v1 3055; SI-NOHSA-NEXT: v_mov_b32_e32 v4, v2 3056; SI-NOHSA-NEXT: v_mov_b32_e32 v6, v3 3057; SI-NOHSA-NEXT: s_waitcnt vmcnt(7) 3058; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v19 3059; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v18 3060; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v17 3061; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v16 3062; SI-NOHSA-NEXT: v_mov_b32_e32 v52, v16 3063; SI-NOHSA-NEXT: v_mov_b32_e32 v54, v17 3064; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v18 3065; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v19 3066; SI-NOHSA-NEXT: s_waitcnt vmcnt(6) 3067; SI-NOHSA-NEXT: 
v_ashrrev_i32_e32 v19, 31, v23 3068; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v22 3069; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v21 3070; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v20 3071; SI-NOHSA-NEXT: v_mov_b32_e32 v56, v20 3072; SI-NOHSA-NEXT: v_mov_b32_e32 v58, v21 3073; SI-NOHSA-NEXT: v_mov_b32_e32 v16, v22 3074; SI-NOHSA-NEXT: v_mov_b32_e32 v18, v23 3075; SI-NOHSA-NEXT: s_waitcnt vmcnt(5) 3076; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v23, 31, v27 3077; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v21, 31, v26 3078; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v63, 31, v25 3079; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v61, 31, v24 3080; SI-NOHSA-NEXT: v_mov_b32_e32 v60, v24 3081; SI-NOHSA-NEXT: v_mov_b32_e32 v62, v25 3082; SI-NOHSA-NEXT: v_mov_b32_e32 v20, v26 3083; SI-NOHSA-NEXT: v_mov_b32_e32 v22, v27 3084; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) 3085; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v27, 31, v11 3086; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v25, 31, v10 3087; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v31, 31, v9 3088; SI-NOHSA-NEXT: v_ashrrev_i32_e32 v29, 31, v8 3089; SI-NOHSA-NEXT: v_mov_b32_e32 v28, v8 3090; SI-NOHSA-NEXT: v_mov_b32_e32 v30, v9 3091; SI-NOHSA-NEXT: v_mov_b32_e32 v24, v10 3092; SI-NOHSA-NEXT: v_mov_b32_e32 v26, v11 3093; SI-NOHSA-NEXT: s_mov_b32 s0, s4 3094; SI-NOHSA-NEXT: s_mov_b32 s1, s5 3095; SI-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224 3096; SI-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240 3097; SI-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:192 3098; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[12:15], 0 ; 4-byte Folded Reload 3099; SI-NOHSA-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload 3100; SI-NOHSA-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload 3101; SI-NOHSA-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload 3102; SI-NOHSA-NEXT: s_waitcnt vmcnt(0) 3103; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 
3104; SI-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160 3105; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176 3106; SI-NOHSA-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128 3107; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144 3108; SI-NOHSA-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96 3109; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 3110; SI-NOHSA-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64 3111; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 3112; SI-NOHSA-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:32 3113; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48 3114; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 3115; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:16 3116; SI-NOHSA-NEXT: s_endpgm 3117; 3118; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64: 3119; GCNX3-HSA: ; %bb.0: 3120; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 3121; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) 3122; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 3123; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 3124; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1] 3125; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x70 3126; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 3127; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 3128; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 3129; GCNX3-HSA-NEXT: flat_load_dwordx4 v[24:27], v[0:1] 3130; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x60 3131; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 3132; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 3133; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 3134; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x50 3135; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] 3136; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 3137; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 3138; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 3139; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64 3140; GCNX3-HSA-NEXT: flat_load_dwordx4 
v[12:15], v[0:1] 3141; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 3142; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s4 3143; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s5 3144; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] 3145; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 48 3146; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 3147; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 3148; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 3149; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[8:9] 3150; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32 3151; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 3152; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 16 3153; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 3154; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 3155; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7 3156; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 3157; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6 3158; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 3159; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] 3160; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 3161; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3162; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s1 3163; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s0 3164; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) 3165; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v29 3166; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v28 3167; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, v28 3168; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, v29 3169; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 3170; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 3171; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0 3172; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3173; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[32:35] 3174; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 3175; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2 3176; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0 3177; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v35, 31, v31 3178; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v33, 31, v30 3179; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, v30 3180; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, v31 3181; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3182; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[32:35] 3183; GCNX3-HSA-NEXT: s_waitcnt 
vmcnt(8) 3184; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v25 3185; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 3186; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 3187; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0 3188; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3189; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 3190; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2 3191; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0 3192; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v24 3193; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v24 3194; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v25 3195; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3196; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] 3197; GCNX3-HSA-NEXT: v_mov_b32_e32 v37, s3 3198; GCNX3-HSA-NEXT: v_mov_b32_e32 v36, s2 3199; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0 3200; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v27 3201; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v26 3202; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v26 3203; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v27 3204; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3205; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[28:31] 3206; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 3207; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 3208; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xb0 3209; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3210; GCNX3-HSA-NEXT: v_mov_b32_e32 v39, s3 3211; GCNX3-HSA-NEXT: v_mov_b32_e32 v38, s2 3212; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x80 3213; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) 3214; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v21 3215; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v20 3216; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v20 3217; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v21 3218; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3219; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v23 3220; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v22 3221; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, v22 3222; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, v23 3223; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[24:27] 3224; GCNX3-HSA-NEXT: flat_store_dwordx4 v[36:37], v[28:31] 3225; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10) 
3226; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v15 3227; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v14 3228; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v13 3229; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v12 3230; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, v12 3231; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, v13 3232; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, v14 3233; GCNX3-HSA-NEXT: v_mov_b32_e32 v22, v15 3234; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) 3235; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v5 3236; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v4 3237; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, v4 3238; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, v5 3239; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 3240; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 3241; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90 3242; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3243; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[24:27] 3244; GCNX3-HSA-NEXT: flat_store_dwordx4 v[38:39], v[20:23] 3245; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] 3246; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 3247; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 3248; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 3249; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v7 3250; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v6 3251; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v6 3252; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v7 3253; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3254; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[23:26] 3255; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) 3256; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v16 3257; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v16 3258; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3 3259; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2 3260; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 3261; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v17 3262; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v17 3263; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3264; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[4:7] 3265; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s3 3266; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s2 3267; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 
3268; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v19 3269; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v18 3270; GCNX3-HSA-NEXT: v_mov_b32_e32 v23, v18 3271; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, v19 3272; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3273; GCNX3-HSA-NEXT: flat_store_dwordx4 v[15:16], v[23:26] 3274; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) 3275; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v9 3276; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v8 3277; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, v8 3278; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, v9 3279; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 3280; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 3281; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 3282; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3283; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[15:18] 3284; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 3285; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 3286; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 3287; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v1 3288; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v0 3289; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v11 3290; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, v11 3291; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v0 3292; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, v1 3293; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3294; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 3295; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 3296; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 3297; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] 3298; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 3299; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s0 3300; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v3 3301; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v2 3302; GCNX3-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v10 3303; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v10 3304; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v2 3305; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, v3 3306; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s1 3307; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 3308; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] 3309; GCNX3-HSA-NEXT: s_endpgm 3310; 3311; 
GCNX3-NOHSA-LABEL: global_sextload_v32i32_to_v32i64: 3312; GCNX3-NOHSA: ; %bb.0: 3313; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 3314; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 3315; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 3316; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 3317; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3 3318; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 3319; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6 3320; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7 3321; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96 3322; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:112 3323; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80 3324; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:64 3325; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48 3326; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32 3327; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 3328; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16 3329; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4 3330; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5 3331; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) 3332; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v35, 31, v11 3333; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v33, 31, v10 3334; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6) 3335; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v39, 31, v15 3336; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v37, 31, v14 3337; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v43, 31, v13 3338; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v41, 31, v12 3339; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v40, v12 3340; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v42, v13 3341; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v36, v14 3342; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v38, v15 3343; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9 3344; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v8 3345; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v8 3346; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v9 3347; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v32, v10 
3348; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v34, v11 3349; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(5) 3350; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7 3351; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v9, 31, v6 3352; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v47, 31, v5 3353; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v45, 31, v4 3354; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v44, v4 3355; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v46, v5 3356; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v6 3357; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v7 3358; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(4) 3359; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 3360; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v5, 31, v2 3361; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v2 3362; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v3 3363; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3) 3364; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v19 3365; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v19 3366; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) 3367; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v19, 31, v23 3368; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v51, 31, v1 3369; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v49, 31, v0 3370; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v48, v0 3371; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v50, v1 3372; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v18 3373; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v55, 31, v17 3374; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v53, 31, v16 3375; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v52, v16 3376; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v54, v17 3377; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v18 3378; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v17, 31, v22 3379; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v59, 31, v21 3380; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v57, 31, v20 3381; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v56, v20 3382; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v58, v21 3383; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v16, v22 3384; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v18, v23 3385; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1) 3386; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v22, 31, v27 3387; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v20, 31, v26 3388; 
GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:224 3389; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:240 3390; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v42, 31, v25 3391; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v40, 31, v24 3392; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2) 3393; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v38, 31, v31 3394; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v36, 31, v30 3395; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:192 3396; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v37, v31 3397; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v29 3398; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v13, 31, v28 3399; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v12, v28 3400; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v14, v29 3401; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208 3402; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160 3403; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 3404; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128 3405; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144 3406; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:96 3407; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 3408; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:64 3409; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80 3410; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 3411; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v35, v30 3412; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v39, v24 3413; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v41, v25 3414; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v19, v26 3415; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v21, v27 3416; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:48 3417; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 3418; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[19:22], off, 
s[0:3], 0 offset:16 3419; GCNX3-NOHSA-NEXT: s_endpgm 3420; 3421; EG-LABEL: global_sextload_v32i32_to_v32i64: 3422; EG: ; %bb.0: 3423; EG-NEXT: ALU 33, @36, KC0[CB0:0-32], KC1[] 3424; EG-NEXT: TEX 7 @20 3425; EG-NEXT: ALU 96, @70, KC0[CB0:0-32], KC1[] 3426; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T12.X, 0 3427; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T23.X, 0 3428; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T21.X, 0 3429; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T20.X, 0 3430; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T19.X, 0 3431; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T10.X, 0 3432; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T9.X, 0 3433; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T8.X, 0 3434; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T7.X, 0 3435; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T6.X, 0 3436; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T5.X, 0 3437; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T4.X, 0 3438; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T3.X, 0 3439; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T2.X, 0 3440; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T1.X, 0 3441; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T0.X, 1 3442; EG-NEXT: CF_END 3443; EG-NEXT: Fetch clause starting at 20: 3444; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 112, #1 3445; EG-NEXT: VTX_READ_128 T13.XYZW, T11.X, 96, #1 3446; EG-NEXT: VTX_READ_128 T14.XYZW, T11.X, 80, #1 3447; EG-NEXT: VTX_READ_128 T15.XYZW, T11.X, 64, #1 3448; EG-NEXT: VTX_READ_128 T16.XYZW, T11.X, 48, #1 3449; EG-NEXT: VTX_READ_128 T17.XYZW, T11.X, 32, #1 3450; EG-NEXT: VTX_READ_128 T18.XYZW, T11.X, 16, #1 3451; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1 3452; EG-NEXT: ALU clause starting at 36: 3453; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 3454; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3455; EG-NEXT: LSHR T0.X, PV.W, literal.x, 3456; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 3457; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 
3458; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 3459; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) 3460; EG-NEXT: LSHR T2.X, PV.W, literal.x, 3461; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 3462; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 3463; EG-NEXT: LSHR T3.X, PV.W, literal.x, 3464; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 3465; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) 3466; EG-NEXT: LSHR T4.X, PV.W, literal.x, 3467; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 3468; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44) 3469; EG-NEXT: LSHR T5.X, PV.W, literal.x, 3470; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 3471; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43) 3472; EG-NEXT: LSHR T6.X, PV.W, literal.x, 3473; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 3474; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43) 3475; EG-NEXT: LSHR T7.X, PV.W, literal.x, 3476; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 3477; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43) 3478; EG-NEXT: LSHR T8.X, PV.W, literal.x, 3479; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 3480; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43) 3481; EG-NEXT: LSHR T9.X, PV.W, literal.x, 3482; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 3483; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43) 3484; EG-NEXT: LSHR T10.X, PV.W, literal.x, 3485; EG-NEXT: MOV * T11.X, KC0[2].Z, 3486; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3487; EG-NEXT: ALU clause starting at 70: 3488; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 3489; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00) 3490; EG-NEXT: LSHR T19.X, PV.W, literal.x, 3491; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 3492; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43) 3493; EG-NEXT: LSHR T20.X, PV.W, literal.x, 3494; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 3495; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43) 3496; EG-NEXT: LSHR T21.X, PV.W, literal.x, 3497; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y, 3498; EG-NEXT: ASHR * T22.W, T11.W, literal.z, 3499; EG-NEXT: 2(2.802597e-45), 
240(3.363116e-43) 3500; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 3501; EG-NEXT: LSHR T23.X, PV.W, literal.x, 3502; EG-NEXT: ASHR T22.Y, T11.Z, literal.y, 3503; EG-NEXT: ASHR T24.W, T11.Y, literal.y, 3504; EG-NEXT: MOV * T22.X, T11.Z, 3505; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44) 3506; EG-NEXT: ASHR T24.Y, T11.X, literal.x, 3507; EG-NEXT: ASHR * T25.W, T18.W, literal.x, 3508; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 3509; EG-NEXT: MOV T24.X, T11.X, 3510; EG-NEXT: ASHR T25.Y, T18.Z, literal.x, 3511; EG-NEXT: MOV T22.Z, T11.W, 3512; EG-NEXT: ASHR T26.W, T18.Y, literal.x, 3513; EG-NEXT: MOV * T25.X, T18.Z, 3514; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 3515; EG-NEXT: ASHR T26.Y, T18.X, literal.x, 3516; EG-NEXT: MOV T24.Z, T11.Y, 3517; EG-NEXT: ASHR * T11.W, T17.W, literal.x, 3518; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 3519; EG-NEXT: MOV T26.X, T18.X, 3520; EG-NEXT: ASHR T11.Y, T17.Z, literal.x, 3521; EG-NEXT: MOV T25.Z, T18.W, 3522; EG-NEXT: ASHR T27.W, T17.Y, literal.x, 3523; EG-NEXT: MOV * T11.X, T17.Z, 3524; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 3525; EG-NEXT: ASHR T27.Y, T17.X, literal.x, 3526; EG-NEXT: MOV T26.Z, T18.Y, 3527; EG-NEXT: ASHR * T18.W, T16.W, literal.x, 3528; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 3529; EG-NEXT: MOV T27.X, T17.X, 3530; EG-NEXT: ASHR T18.Y, T16.Z, literal.x, 3531; EG-NEXT: MOV T11.Z, T17.W, 3532; EG-NEXT: ASHR T28.W, T16.Y, literal.x, 3533; EG-NEXT: MOV * T18.X, T16.Z, 3534; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 3535; EG-NEXT: ASHR T28.Y, T16.X, literal.x, 3536; EG-NEXT: MOV T27.Z, T17.Y, 3537; EG-NEXT: ASHR * T17.W, T15.W, literal.x, 3538; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 3539; EG-NEXT: MOV T28.X, T16.X, 3540; EG-NEXT: ASHR T17.Y, T15.Z, literal.x, 3541; EG-NEXT: MOV T18.Z, T16.W, 3542; EG-NEXT: ASHR T29.W, T15.Y, literal.x, 3543; EG-NEXT: MOV * T17.X, T15.Z, 3544; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 3545; EG-NEXT: ASHR T29.Y, T15.X, literal.x, 3546; EG-NEXT: MOV T28.Z, T16.Y, 3547; 
EG-NEXT: ASHR * T16.W, T14.W, literal.x, 3548; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 3549; EG-NEXT: MOV T29.X, T15.X, 3550; EG-NEXT: ASHR T16.Y, T14.Z, literal.x, 3551; EG-NEXT: MOV T17.Z, T15.W, 3552; EG-NEXT: ASHR T30.W, T14.Y, literal.x, 3553; EG-NEXT: MOV * T16.X, T14.Z, 3554; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 3555; EG-NEXT: ASHR T30.Y, T14.X, literal.x, 3556; EG-NEXT: MOV T29.Z, T15.Y, 3557; EG-NEXT: ASHR * T15.W, T13.W, literal.x, 3558; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 3559; EG-NEXT: MOV T30.X, T14.X, 3560; EG-NEXT: ASHR T15.Y, T13.Z, literal.x, 3561; EG-NEXT: MOV T16.Z, T14.W, 3562; EG-NEXT: ASHR T31.W, T13.Y, literal.x, 3563; EG-NEXT: MOV * T15.X, T13.Z, 3564; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 3565; EG-NEXT: ASHR T31.Y, T13.X, literal.x, 3566; EG-NEXT: MOV T30.Z, T14.Y, 3567; EG-NEXT: ASHR * T14.W, T12.W, literal.x, 3568; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 3569; EG-NEXT: MOV T31.X, T13.X, 3570; EG-NEXT: ASHR T14.Y, T12.Z, literal.x, 3571; EG-NEXT: MOV T15.Z, T13.W, 3572; EG-NEXT: ASHR T32.W, T12.Y, literal.x, 3573; EG-NEXT: MOV * T14.X, T12.Z, 3574; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 3575; EG-NEXT: ASHR T32.Y, T12.X, literal.x, 3576; EG-NEXT: MOV * T31.Z, T13.Y, 3577; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 3578; EG-NEXT: MOV T32.X, T12.X, 3579; EG-NEXT: MOV T14.Z, T12.W, 3580; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 3581; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00) 3582; EG-NEXT: LSHR T12.X, PV.W, literal.x, 3583; EG-NEXT: MOV * T32.Z, T12.Y, 3584; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3585; 3586; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64: 3587; GCN-GFX900-HSA: ; %bb.0: 3588; GCN-GFX900-HSA-NEXT: s_mov_b64 s[18:19], s[2:3] 3589; GCN-GFX900-HSA-NEXT: s_mov_b64 s[16:17], s[0:1] 3590; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 3591; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, 0 3592; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s15 3593; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0 
3594; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0) 3595; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96 3596; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:112 3597; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[9:12], v8, s[2:3] offset:80 3598; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[13:16], v8, s[2:3] offset:64 3599; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[17:20], v8, s[2:3] offset:48 3600; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v8, s[2:3] offset:32 3601; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5) 3602; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v3 3603; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v2 3604; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v25, v2 3605; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v3 3606; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(4) 3607; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v32, 31, v7 3608; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6 3609; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v5 3610; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v4 3611; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v33, v4 3612; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v35, v5 3613; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v29, v6 3614; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v31, v7 3615; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 3616; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 3617; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0 3618; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1 3619; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[16:19], 0 ; 4-byte Folded Spill 3620; GCN-GFX900-HSA-NEXT: s_nop 0 3621; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill 3622; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill 3623; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill 3624; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7) 3625; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12 3626; 
GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11 3627; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v10 3628; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v9 3629; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v37, v9 3630; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v39, v10 3631; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v25, v11 3632; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v27, v12 3633; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(6) 3634; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v16 3635; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v15 3636; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v14 3637; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v13 3638; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v41, v13 3639; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v43, v14 3640; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v9, v15 3641; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v11, v16 3642; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5) 3643; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v20 3644; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v19 3645; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v48, 31, v18 3646; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v46, 31, v17 3647; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v45, v17 3648; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v47, v18 3649; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v13, v19 3650; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[49:52], v8, s[2:3] offset:16 3651; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v15, v20 3652; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(5) 3653; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v24 3654; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v23 3655; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v56, 31, v22 3656; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v54, 31, v21 3657; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v53, v21 3658; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v55, v22 3659; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v17, v23 3660; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v19, v24 3661; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[21:24], v8, s[2:3] 3662; GCN-GFX900-HSA-NEXT: s_nop 0 3663; 
GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224 3664; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240 3665; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192 3666; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[16:19], 0 ; 4-byte Folded Reload 3667; GCN-GFX900-HSA-NEXT: s_nop 0 3668; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload 3669; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload 3670; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload 3671; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8) 3672; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52 3673; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51 3674; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v50 3675; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v49 3676; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v0, v49 3677; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v2, v50 3678; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v57, v51 3679; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v59, v52 3680; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7) 3681; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v24 3682; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v23 3683; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v22 3684; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v21 3685; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v21 3686; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v22 3687; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(0) 3688; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[32:35], s[0:1] offset:208 3689; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[37:40], s[0:1] offset:160 3690; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[25:28], s[0:1] offset:176 3691; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[41:44], s[0:1] offset:128 3692; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[9:12], s[0:1] offset:144 3693; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, 
v[45:48], s[0:1] offset:96 3694; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[13:16], s[0:1] offset:112 3695; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[53:56], s[0:1] offset:64 3696; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[17:20], s[0:1] offset:80 3697; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 3698; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[57:60], s[0:1] offset:48 3699; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] 3700; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v28, v23 3701; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v30, v24 3702; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[28:31], s[0:1] offset:16 3703; GCN-GFX900-HSA-NEXT: s_endpgm 3704; 3705; GCN-GFX908-HSA-LABEL: global_sextload_v32i32_to_v32i64: 3706; GCN-GFX908-HSA: ; %bb.0: 3707; GCN-GFX908-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 3708; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v8, 0 3709; GCN-GFX908-HSA-NEXT: s_waitcnt lgkmcnt(0) 3710; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96 3711; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:112 3712; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[9:12], v8, s[2:3] offset:80 3713; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[13:16], v8, s[2:3] offset:64 3714; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[17:20], v8, s[2:3] offset:48 3715; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[21:24], v8, s[2:3] offset:32 3716; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[49:52], v8, s[2:3] offset:16 3717; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(6) 3718; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v25, v2 3719; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v3 3720; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v2 3721; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v27, v3 3722; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a0, v25 3723; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a1, v26 3724; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a2, v27 3725; GCN-GFX908-HSA-NEXT: v_accvgpr_write_b32 a3, v28 3726; 
GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(4) 3727; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12 3728; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11 3729; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v40, 31, v10 3730; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v38, 31, v9 3731; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v37, v9 3732; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v39, v10 3733; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v25, v11 3734; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v27, v12 3735; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3) 3736; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v16 3737; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v15 3738; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v44, 31, v14 3739; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v42, 31, v13 3740; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v41, v13 3741; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v43, v14 3742; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v9, v15 3743; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v11, v16 3744; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(2) 3745; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v20 3746; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v19 3747; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v48, 31, v18 3748; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v46, 31, v17 3749; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v45, v17 3750; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v47, v18 3751; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v13, v19 3752; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v15, v20 3753; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(1) 3754; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v24 3755; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v23 3756; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v56, 31, v22 3757; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v54, 31, v21 3758; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v53, v21 3759; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v55, v22 3760; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v17, v23 3761; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v19, v24 3762; GCN-GFX908-HSA-NEXT: global_load_dwordx4 v[21:24], v8, s[2:3] 3763; GCN-GFX908-HSA-NEXT: 
v_ashrrev_i32_e32 v32, 31, v7 3764; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v36, 31, v5 3765; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v34, 31, v4 3766; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v33, v4 3767; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v35, v5 3768; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v30, 31, v6 3769; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v29, v6 3770; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v31, v7 3771; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224 3772; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240 3773; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v35, a3 3774; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 3775; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0 3776; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v4, v0 3777; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v6, v1 3778; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v34, a2 3779; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v33, a1 3780; GCN-GFX908-HSA-NEXT: v_accvgpr_read_b32 v32, a0 3781; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3) 3782; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52 3783; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51 3784; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v50 3785; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v49 3786; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v0, v49 3787; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v2, v50 3788; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v57, v51 3789; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v59, v52 3790; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192 3791; GCN-GFX908-HSA-NEXT: s_waitcnt vmcnt(3) 3792; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v31, 31, v24 3793; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v29, 31, v23 3794; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v22 3795; GCN-GFX908-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v21 3796; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v4, v21 3797; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v6, v22 3798; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[32:35], s[0:1] offset:208 3799; 
GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[37:40], s[0:1] offset:160 3800; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[25:28], s[0:1] offset:176 3801; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[41:44], s[0:1] offset:128 3802; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[9:12], s[0:1] offset:144 3803; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[45:48], s[0:1] offset:96 3804; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[13:16], s[0:1] offset:112 3805; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[53:56], s[0:1] offset:64 3806; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[17:20], s[0:1] offset:80 3807; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 3808; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[57:60], s[0:1] offset:48 3809; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] 3810; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v28, v23 3811; GCN-GFX908-HSA-NEXT: v_mov_b32_e32 v30, v24 3812; GCN-GFX908-HSA-NEXT: global_store_dwordx4 v8, v[28:31], s[0:1] offset:16 3813; GCN-GFX908-HSA-NEXT: s_endpgm 3814 %ld = load <32 x i32>, ptr addrspace(1) %in 3815 %ext = sext <32 x i32> %ld to <32 x i64> 3816 store <32 x i64> %ext, ptr addrspace(1) %out 3817 ret void 3818} 3819 3820define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 3821; SI-NOHSA-LABEL: global_zextload_v32i32_to_v32i64: 3822; SI-NOHSA: ; %bb.0: 3823; SI-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 3824; SI-NOHSA-NEXT: s_mov_b32 s3, 0xf000 3825; SI-NOHSA-NEXT: s_mov_b32 s2, -1 3826; SI-NOHSA-NEXT: v_mov_b32_e32 v1, 0 3827; SI-NOHSA-NEXT: s_mov_b32 s10, s2 3828; SI-NOHSA-NEXT: s_mov_b32 s11, s3 3829; SI-NOHSA-NEXT: v_mov_b32_e32 v3, v1 3830; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 3831; SI-NOHSA-NEXT: s_mov_b32 s8, s6 3832; SI-NOHSA-NEXT: s_mov_b32 s9, s7 3833; SI-NOHSA-NEXT: s_mov_b32 s0, s4 3834; SI-NOHSA-NEXT: s_mov_b32 s1, s5 3835; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:112 
3836; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:96 3837; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 3838; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16 3839; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32 3840; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48 3841; SI-NOHSA-NEXT: s_waitcnt vmcnt(5) 3842; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v4 3843; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v5 3844; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:64 3845; SI-NOHSA-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:80 3846; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 3847; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 3848; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v6 3849; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v7 3850; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 3851; SI-NOHSA-NEXT: s_waitcnt vmcnt(8) expcnt(0) 3852; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v8 3853; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v9 3854; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 3855; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 3856; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v10 3857; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v11 3858; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 3859; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) expcnt(0) 3860; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v32 3861; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v33 3862; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 3863; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 3864; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v34 3865; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v35 3866; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 3867; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 3868; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v28 3869; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v29 3870; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 3871; SI-NOHSA-NEXT: s_waitcnt 
expcnt(0) 3872; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v30 3873; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v31 3874; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 3875; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 3876; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v24 3877; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v25 3878; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 3879; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 3880; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v26 3881; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v27 3882; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 3883; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 3884; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v20 3885; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v21 3886; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 3887; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 3888; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v22 3889; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v23 3890; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 3891; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 3892; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v16 3893; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v17 3894; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 3895; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 3896; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v18 3897; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v19 3898; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 3899; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 3900; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v12 3901; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v13 3902; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3903; SI-NOHSA-NEXT: s_waitcnt expcnt(0) 3904; SI-NOHSA-NEXT: v_mov_b32_e32 v0, v14 3905; SI-NOHSA-NEXT: v_mov_b32_e32 v2, v15 3906; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 3907; SI-NOHSA-NEXT: s_endpgm 3908; 3909; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64: 3910; GCNX3-HSA: ; %bb.0: 3911; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 3912; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) 
3913; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 3914; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 3915; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 32 3916; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 3917; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 3918; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 3919; GCNX3-HSA-NEXT: s_add_u32 s8, s2, 48 3920; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[0:1] 3921; GCNX3-HSA-NEXT: s_addc_u32 s9, s3, 0 3922; GCNX3-HSA-NEXT: s_add_u32 s10, s2, 64 3923; GCNX3-HSA-NEXT: s_addc_u32 s11, s3, 0 3924; GCNX3-HSA-NEXT: s_add_u32 s12, s2, 0x50 3925; GCNX3-HSA-NEXT: s_addc_u32 s13, s3, 0 3926; GCNX3-HSA-NEXT: s_add_u32 s14, s2, 0x60 3927; GCNX3-HSA-NEXT: s_addc_u32 s15, s3, 0 3928; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 0x70 3929; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 3930; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 3931; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 3932; GCNX3-HSA-NEXT: flat_load_dwordx4 v[32:35], v[0:1] 3933; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s14 3934; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s15 3935; GCNX3-HSA-NEXT: flat_load_dwordx4 v[24:27], v[0:1] 3936; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s12 3937; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s13 3938; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[0:1] 3939; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s10 3940; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s11 3941; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[0:1] 3942; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s8 3943; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s9 3944; GCNX3-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] 3945; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s4 3946; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s7 3947; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s5 3948; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s6 3949; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 3950; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] 3951; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, 0 3952; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16 3953; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, v1 3954; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3955; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) 3956; GCNX3-HSA-NEXT: 
v_mov_b32_e32 v0, v28 3957; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v29 3958; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s1 3959; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s0 3960; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] 3961; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 3962; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 3963; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xe0 3964; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3965; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v30 3966; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v31 3967; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 3968; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 3969; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xf0 3970; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3971; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] 3972; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 3973; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 3974; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xc0 3975; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3976; GCNX3-HSA-NEXT: s_waitcnt vmcnt(8) 3977; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v32 3978; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v33 3979; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] 3980; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 3981; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 3982; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xd0 3983; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3984; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s3 3985; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s2 3986; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xa0 3987; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3988; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v34 3989; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v35 3990; GCNX3-HSA-NEXT: v_mov_b32_e32 v35, s3 3991; GCNX3-HSA-NEXT: v_mov_b32_e32 v34, s2 3992; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0xb0 3993; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3994; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] 3995; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s3 3996; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s2 3997; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x80 3998; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 3999; GCNX3-HSA-NEXT: s_waitcnt vmcnt(9) 4000; GCNX3-HSA-NEXT: v_mov_b32_e32 
v0, v24 4001; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v25 4002; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 4003; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2 4004; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x90 4005; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 4006; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] 4007; GCNX3-HSA-NEXT: s_nop 0 4008; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v26 4009; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v27 4010; GCNX3-HSA-NEXT: v_mov_b32_e32 v27, s3 4011; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2 4012; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 4013; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 4014; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3] 4015; GCNX3-HSA-NEXT: v_mov_b32_e32 v31, s3 4016; GCNX3-HSA-NEXT: s_waitcnt vmcnt(10) 4017; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v20 4018; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v21 4019; GCNX3-HSA-NEXT: flat_store_dwordx4 v[34:35], v[0:3] 4020; GCNX3-HSA-NEXT: v_mov_b32_e32 v30, s2 4021; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v22 4022; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v23 4023; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x70 4024; GCNX3-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] 4025; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 4026; GCNX3-HSA-NEXT: s_waitcnt vmcnt(11) 4027; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v16 4028; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v17 4029; GCNX3-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] 4030; GCNX3-HSA-NEXT: s_nop 0 4031; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v18 4032; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v19 4033; GCNX3-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] 4034; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) 4035; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v12 4036; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v13 4037; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s3 4038; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s2 4039; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 4040; GCNX3-HSA-NEXT: flat_store_dwordx4 v[30:31], v[0:3] 4041; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 4042; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v14 4043; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v15 4044; GCNX3-HSA-NEXT: 
flat_store_dwordx4 v[12:13], v[0:3] 4045; GCNX3-HSA-NEXT: s_waitcnt vmcnt(12) 4046; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v8 4047; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v9 4048; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 4049; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 4050; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x50 4051; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 4052; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 4053; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3 4054; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2 4055; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 32 4056; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v10 4057; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v11 4058; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 4059; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 4060; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 48 4061; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v4 4062; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v5 4063; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 4064; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 4065; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 4066; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4067; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s1 4068; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, v6 4069; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, v7 4070; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s0 4071; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4072; GCNX3-HSA-NEXT: s_endpgm 4073; 4074; GCNX3-NOHSA-LABEL: global_zextload_v32i32_to_v32i64: 4075; GCNX3-NOHSA: ; %bb.0: 4076; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 4077; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000 4078; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1 4079; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2 4080; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3 4081; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 4082; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6 4083; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7 4084; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112 4085; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96 4086; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80 4087; GCNX3-NOHSA-NEXT: 
buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64 4088; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:48 4089; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32 4090; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16 4091; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 4092; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v29, 0 4093; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v31, v29 4094; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4 4095; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5 4096; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) 4097; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v0 4098; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v1 4099; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:224 4100; GCNX3-NOHSA-NEXT: s_nop 0 4101; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v2 4102; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v3 4103; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:240 4104; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(8) 4105; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v4 4106; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v5 4107; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:192 4108; GCNX3-NOHSA-NEXT: s_nop 0 4109; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v6 4110; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v7 4111; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:208 4112; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(9) 4113; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v8 4114; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v9 4115; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:160 4116; GCNX3-NOHSA-NEXT: s_nop 0 4117; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v10 4118; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v11 4119; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:176 4120; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(10) 4121; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v12 4122; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v13 4123; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], 
off, s[0:3], 0 offset:128 4124; GCNX3-NOHSA-NEXT: s_nop 0 4125; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v14 4126; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v15 4127; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144 4128; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(11) 4129; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v16 4130; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v17 4131; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96 4132; GCNX3-NOHSA-NEXT: s_nop 0 4133; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v18 4134; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v19 4135; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112 4136; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(12) 4137; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v20 4138; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v21 4139; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64 4140; GCNX3-NOHSA-NEXT: s_nop 0 4141; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v22 4142; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v23 4143; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80 4144; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(13) 4145; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v24 4146; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v25 4147; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:32 4148; GCNX3-NOHSA-NEXT: s_nop 0 4149; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v26 4150; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v27 4151; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:48 4152; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(14) 4153; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v32 4154; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v33 4155; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 4156; GCNX3-NOHSA-NEXT: s_nop 0 4157; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v28, v34 4158; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v30, v35 4159; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:16 4160; GCNX3-NOHSA-NEXT: s_endpgm 4161; 4162; EG-LABEL: 
global_zextload_v32i32_to_v32i64: 4163; EG: ; %bb.0: 4164; EG-NEXT: ALU 0, @38, KC0[CB0:0-32], KC1[] 4165; EG-NEXT: TEX 2 @22 4166; EG-NEXT: ALU 10, @39, KC0[], KC1[] 4167; EG-NEXT: TEX 4 @28 4168; EG-NEXT: ALU 100, @50, KC0[CB0:0-32], KC1[] 4169; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T31.X, 0 4170; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T30.X, 0 4171; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T29.X, 0 4172; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T28.X, 0 4173; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T27.X, 0 4174; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T26.X, 0 4175; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T25.X, 0 4176; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T24.X, 0 4177; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T13.X, 0 4178; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T12.X, 0 4179; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T11.X, 0 4180; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T10.X, 0 4181; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T3.X, 0 4182; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T2.X, 0 4183; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T1.X, 0 4184; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T0.X, 1 4185; EG-NEXT: CF_END 4186; EG-NEXT: Fetch clause starting at 22: 4187; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 112, #1 4188; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 80, #1 4189; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 96, #1 4190; EG-NEXT: Fetch clause starting at 28: 4191; EG-NEXT: VTX_READ_128 T10.XYZW, T0.X, 0, #1 4192; EG-NEXT: VTX_READ_128 T11.XYZW, T0.X, 16, #1 4193; EG-NEXT: VTX_READ_128 T12.XYZW, T0.X, 32, #1 4194; EG-NEXT: VTX_READ_128 T13.XYZW, T0.X, 48, #1 4195; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 64, #1 4196; EG-NEXT: ALU clause starting at 38: 4197; EG-NEXT: MOV * T0.X, KC0[2].Z, 4198; EG-NEXT: ALU clause starting at 39: 4199; EG-NEXT: MOV T4.X, T1.X, 4200; EG-NEXT: MOV T4.Y, 0.0, 4201; EG-NEXT: MOV * T5.X, T1.Z, 4202; EG-NEXT: MOV * T5.Y, 0.0, 4203; EG-NEXT: 
MOV T6.X, T3.X, 4204; EG-NEXT: MOV T6.Y, 0.0, 4205; EG-NEXT: MOV * T7.X, T3.Z, 4206; EG-NEXT: MOV * T7.Y, 0.0, 4207; EG-NEXT: MOV T8.X, T2.X, 4208; EG-NEXT: MOV T8.Y, 0.0, 4209; EG-NEXT: MOV * T9.X, T2.Z, 4210; EG-NEXT: ALU clause starting at 50: 4211; EG-NEXT: MOV * T9.Y, 0.0, 4212; EG-NEXT: MOV T14.X, T0.X, 4213; EG-NEXT: MOV T14.Y, 0.0, 4214; EG-NEXT: MOV * T15.X, T0.Z, 4215; EG-NEXT: MOV * T15.Y, 0.0, 4216; EG-NEXT: MOV T16.X, T13.X, 4217; EG-NEXT: MOV T16.Y, 0.0, 4218; EG-NEXT: MOV * T17.X, T13.Z, 4219; EG-NEXT: MOV * T17.Y, 0.0, 4220; EG-NEXT: MOV T18.X, T12.X, 4221; EG-NEXT: MOV T18.Y, 0.0, 4222; EG-NEXT: MOV * T19.X, T12.Z, 4223; EG-NEXT: MOV * T19.Y, 0.0, 4224; EG-NEXT: MOV T20.X, T11.X, 4225; EG-NEXT: MOV T20.Y, 0.0, 4226; EG-NEXT: MOV * T21.X, T11.Z, 4227; EG-NEXT: MOV * T21.Y, 0.0, 4228; EG-NEXT: MOV T22.X, T10.X, 4229; EG-NEXT: MOV T22.Y, 0.0, 4230; EG-NEXT: MOV * T23.X, T10.Z, 4231; EG-NEXT: MOV T23.Y, 0.0, 4232; EG-NEXT: MOV T4.Z, T1.Y, 4233; EG-NEXT: MOV T4.W, 0.0, 4234; EG-NEXT: MOV * T5.Z, T1.W, 4235; EG-NEXT: MOV * T5.W, 0.0, 4236; EG-NEXT: MOV T6.Z, T3.Y, 4237; EG-NEXT: MOV T6.W, 0.0, 4238; EG-NEXT: MOV * T7.Z, T3.W, 4239; EG-NEXT: MOV * T7.W, 0.0, 4240; EG-NEXT: MOV T8.Z, T2.Y, 4241; EG-NEXT: MOV T8.W, 0.0, 4242; EG-NEXT: MOV * T9.Z, T2.W, 4243; EG-NEXT: MOV * T9.W, 0.0, 4244; EG-NEXT: MOV T14.Z, T0.Y, 4245; EG-NEXT: MOV T14.W, 0.0, 4246; EG-NEXT: MOV * T15.Z, T0.W, 4247; EG-NEXT: MOV * T15.W, 0.0, 4248; EG-NEXT: MOV T16.Z, T13.Y, 4249; EG-NEXT: MOV T16.W, 0.0, 4250; EG-NEXT: MOV * T17.Z, T13.W, 4251; EG-NEXT: MOV * T17.W, 0.0, 4252; EG-NEXT: MOV T18.Z, T12.Y, 4253; EG-NEXT: MOV T18.W, 0.0, 4254; EG-NEXT: MOV * T19.Z, T12.W, 4255; EG-NEXT: MOV * T19.W, 0.0, 4256; EG-NEXT: MOV T20.Z, T11.Y, 4257; EG-NEXT: MOV T20.W, 0.0, 4258; EG-NEXT: MOV * T21.Z, T11.W, 4259; EG-NEXT: MOV * T21.W, 0.0, 4260; EG-NEXT: MOV T22.Z, T10.Y, 4261; EG-NEXT: MOV T22.W, 0.0, 4262; EG-NEXT: MOV * T23.Z, T10.W, 4263; EG-NEXT: MOV T23.W, 0.0, 4264; EG-NEXT: ADD_INT * T0.W, 
KC0[2].Y, literal.x, 4265; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 4266; EG-NEXT: LSHR T0.X, PS, literal.x, 4267; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4268; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4269; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 4270; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4271; EG-NEXT: LSHR T2.X, PV.W, literal.x, 4272; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4273; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 4274; EG-NEXT: LSHR T3.X, PV.W, literal.x, 4275; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4276; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) 4277; EG-NEXT: LSHR T10.X, PV.W, literal.x, 4278; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4279; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44) 4280; EG-NEXT: LSHR T11.X, PV.W, literal.x, 4281; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4282; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43) 4283; EG-NEXT: LSHR T12.X, PV.W, literal.x, 4284; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4285; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43) 4286; EG-NEXT: LSHR T13.X, PV.W, literal.x, 4287; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4288; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43) 4289; EG-NEXT: LSHR T24.X, PV.W, literal.x, 4290; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4291; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43) 4292; EG-NEXT: LSHR T25.X, PV.W, literal.x, 4293; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4294; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43) 4295; EG-NEXT: LSHR T26.X, PV.W, literal.x, 4296; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4297; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43) 4298; EG-NEXT: LSHR T27.X, PV.W, literal.x, 4299; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4300; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43) 4301; EG-NEXT: LSHR T28.X, PV.W, literal.x, 4302; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4303; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43) 4304; EG-NEXT: LSHR T29.X, PV.W, literal.x, 4305; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, 
literal.y, 4306; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43) 4307; EG-NEXT: LSHR T30.X, PV.W, literal.x, 4308; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4309; EG-NEXT: 2(2.802597e-45), 224(3.138909e-43) 4310; EG-NEXT: LSHR * T31.X, PV.W, literal.x, 4311; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4312; 4313; GCN-HSA-LABEL: global_zextload_v32i32_to_v32i64: 4314; GCN-HSA: ; %bb.0: 4315; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 4316; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 4317; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 4318; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) 4319; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v1, s[2:3] offset:112 4320; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v1, s[2:3] offset:96 4321; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v1, s[2:3] offset:80 4322; GCN-HSA-NEXT: global_load_dwordx4 v[16:19], v1, s[2:3] offset:64 4323; GCN-HSA-NEXT: global_load_dwordx4 v[20:23], v1, s[2:3] offset:48 4324; GCN-HSA-NEXT: global_load_dwordx4 v[24:27], v1, s[2:3] offset:32 4325; GCN-HSA-NEXT: global_load_dwordx4 v[28:31], v1, s[2:3] offset:16 4326; GCN-HSA-NEXT: global_load_dwordx4 v[32:35], v1, s[2:3] 4327; GCN-HSA-NEXT: s_waitcnt vmcnt(7) 4328; GCN-HSA-NEXT: v_mov_b32_e32 v0, v4 4329; GCN-HSA-NEXT: v_mov_b32_e32 v2, v5 4330; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:224 4331; GCN-HSA-NEXT: s_nop 0 4332; GCN-HSA-NEXT: v_mov_b32_e32 v0, v6 4333; GCN-HSA-NEXT: v_mov_b32_e32 v2, v7 4334; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:240 4335; GCN-HSA-NEXT: s_waitcnt vmcnt(8) 4336; GCN-HSA-NEXT: v_mov_b32_e32 v0, v8 4337; GCN-HSA-NEXT: v_mov_b32_e32 v2, v9 4338; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:192 4339; GCN-HSA-NEXT: s_nop 0 4340; GCN-HSA-NEXT: v_mov_b32_e32 v0, v10 4341; GCN-HSA-NEXT: v_mov_b32_e32 v2, v11 4342; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:208 4343; GCN-HSA-NEXT: s_waitcnt vmcnt(9) 4344; GCN-HSA-NEXT: v_mov_b32_e32 v0, v12 4345; GCN-HSA-NEXT: v_mov_b32_e32 v2, v13 4346; 
GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:160 4347; GCN-HSA-NEXT: s_nop 0 4348; GCN-HSA-NEXT: v_mov_b32_e32 v0, v14 4349; GCN-HSA-NEXT: v_mov_b32_e32 v2, v15 4350; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:176 4351; GCN-HSA-NEXT: s_waitcnt vmcnt(10) 4352; GCN-HSA-NEXT: v_mov_b32_e32 v0, v16 4353; GCN-HSA-NEXT: v_mov_b32_e32 v2, v17 4354; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:128 4355; GCN-HSA-NEXT: s_nop 0 4356; GCN-HSA-NEXT: v_mov_b32_e32 v0, v18 4357; GCN-HSA-NEXT: v_mov_b32_e32 v2, v19 4358; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:144 4359; GCN-HSA-NEXT: s_waitcnt vmcnt(11) 4360; GCN-HSA-NEXT: v_mov_b32_e32 v0, v20 4361; GCN-HSA-NEXT: v_mov_b32_e32 v2, v21 4362; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:96 4363; GCN-HSA-NEXT: s_nop 0 4364; GCN-HSA-NEXT: v_mov_b32_e32 v0, v22 4365; GCN-HSA-NEXT: v_mov_b32_e32 v2, v23 4366; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:112 4367; GCN-HSA-NEXT: s_waitcnt vmcnt(12) 4368; GCN-HSA-NEXT: v_mov_b32_e32 v0, v24 4369; GCN-HSA-NEXT: v_mov_b32_e32 v2, v25 4370; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:64 4371; GCN-HSA-NEXT: s_nop 0 4372; GCN-HSA-NEXT: v_mov_b32_e32 v0, v26 4373; GCN-HSA-NEXT: v_mov_b32_e32 v2, v27 4374; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:80 4375; GCN-HSA-NEXT: s_waitcnt vmcnt(13) 4376; GCN-HSA-NEXT: v_mov_b32_e32 v0, v28 4377; GCN-HSA-NEXT: v_mov_b32_e32 v2, v29 4378; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:32 4379; GCN-HSA-NEXT: s_nop 0 4380; GCN-HSA-NEXT: v_mov_b32_e32 v0, v30 4381; GCN-HSA-NEXT: v_mov_b32_e32 v2, v31 4382; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:48 4383; GCN-HSA-NEXT: s_waitcnt vmcnt(14) 4384; GCN-HSA-NEXT: v_mov_b32_e32 v0, v32 4385; GCN-HSA-NEXT: v_mov_b32_e32 v2, v33 4386; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] 4387; GCN-HSA-NEXT: s_nop 0 4388; GCN-HSA-NEXT: 
v_mov_b32_e32 v0, v34 4389; GCN-HSA-NEXT: v_mov_b32_e32 v2, v35 4390; GCN-HSA-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1] offset:16 4391; GCN-HSA-NEXT: s_endpgm 4392 %ld = load <32 x i32>, ptr addrspace(1) %in 4393 %ext = zext <32 x i32> %ld to <32 x i64> 4394 store <32 x i64> %ext, ptr addrspace(1) %out 4395 ret void 4396} 4397 4398define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 4399; SI-NOHSA-LABEL: global_load_v32i32: 4400; SI-NOHSA: ; %bb.0: 4401; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4402; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000 4403; SI-NOHSA-NEXT: s_mov_b32 s6, -1 4404; SI-NOHSA-NEXT: s_mov_b32 s10, s6 4405; SI-NOHSA-NEXT: s_mov_b32 s11, s7 4406; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 4407; SI-NOHSA-NEXT: s_mov_b32 s4, s0 4408; SI-NOHSA-NEXT: s_mov_b32 s5, s1 4409; SI-NOHSA-NEXT: s_mov_b32 s8, s2 4410; SI-NOHSA-NEXT: s_mov_b32 s9, s3 4411; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 4412; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 4413; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:112 4414; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:96 4415; SI-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:80 4416; SI-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:64 4417; SI-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32 4418; SI-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48 4419; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) 4420; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:96 4421; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:112 4422; SI-NOHSA-NEXT: s_waitcnt vmcnt(4) 4423; SI-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:64 4424; SI-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:80 4425; SI-NOHSA-NEXT: s_waitcnt vmcnt(5) 4426; SI-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, 
s[4:7], 0 offset:32 4427; SI-NOHSA-NEXT: s_waitcnt vmcnt(5) 4428; SI-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:48 4429; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 4430; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 4431; SI-NOHSA-NEXT: s_endpgm 4432; 4433; GCNX3-HSA-LABEL: global_load_v32i32: 4434; GCNX3-HSA: ; %bb.0: 4435; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 4436; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0) 4437; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16 4438; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 4439; GCNX3-HSA-NEXT: s_add_u32 s6, s2, 48 4440; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 4441; GCNX3-HSA-NEXT: v_mov_b32_e32 v29, s5 4442; GCNX3-HSA-NEXT: s_addc_u32 s7, s3, 0 4443; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 4444; GCNX3-HSA-NEXT: v_mov_b32_e32 v28, s4 4445; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32 4446; GCNX3-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 4447; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 4448; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s5 4449; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s4 4450; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x50 4451; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 4452; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s5 4453; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s4 4454; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 64 4455; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 4456; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, s5 4457; GCNX3-HSA-NEXT: v_mov_b32_e32 v16, s4 4458; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 0x70 4459; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s6 4460; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0 4461; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s7 4462; GCNX3-HSA-NEXT: s_add_u32 s2, s2, 0x60 4463; GCNX3-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 4464; GCNX3-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] 4465; GCNX3-HSA-NEXT: s_addc_u32 s3, s3, 0 4466; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s5 4467; GCNX3-HSA-NEXT: v_mov_b32_e32 v25, s3 4468; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s4 4469; GCNX3-HSA-NEXT: v_mov_b32_e32 v24, s2 4470; GCNX3-HSA-NEXT: flat_load_dwordx4 
v[12:15], v[12:13] 4471; GCNX3-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] 4472; GCNX3-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] 4473; GCNX3-HSA-NEXT: flat_load_dwordx4 v[24:27], v[24:25] 4474; GCNX3-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29] 4475; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 0x60 4476; GCNX3-HSA-NEXT: v_mov_b32_e32 v33, s1 4477; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 4478; GCNX3-HSA-NEXT: v_mov_b32_e32 v32, s0 4479; GCNX3-HSA-NEXT: s_add_u32 s4, s0, 0x70 4480; GCNX3-HSA-NEXT: s_addc_u32 s5, s1, 0 4481; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) 4482; GCNX3-HSA-NEXT: flat_store_dwordx4 v[32:33], v[0:3] 4483; GCNX3-HSA-NEXT: s_nop 0 4484; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2 4485; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3 4486; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64 4487; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0 4488; GCNX3-HSA-NEXT: s_add_u32 s6, s0, 0x50 4489; GCNX3-HSA-NEXT: s_addc_u32 s7, s1, 0 4490; GCNX3-HSA-NEXT: s_add_u32 s8, s0, 32 4491; GCNX3-HSA-NEXT: s_addc_u32 s9, s1, 0 4492; GCNX3-HSA-NEXT: s_add_u32 s10, s0, 48 4493; GCNX3-HSA-NEXT: s_addc_u32 s11, s1, 0 4494; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s10 4495; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s11 4496; GCNX3-HSA-NEXT: s_add_u32 s0, s0, 16 4497; GCNX3-HSA-NEXT: s_addc_u32 s1, s1, 0 4498; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) 4499; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] 4500; GCNX3-HSA-NEXT: s_nop 0 4501; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, s8 4502; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, s9 4503; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) 4504; GCNX3-HSA-NEXT: flat_store_dwordx4 v[6:7], v[8:11] 4505; GCNX3-HSA-NEXT: v_mov_b32_e32 v6, s6 4506; GCNX3-HSA-NEXT: v_mov_b32_e32 v2, s4 4507; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3 4508; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, s7 4509; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s1 4510; GCNX3-HSA-NEXT: v_mov_b32_e32 v3, s5 4511; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2 4512; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s0 4513; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) 4514; GCNX3-HSA-NEXT: flat_store_dwordx4 
v[6:7], v[12:15] 4515; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) 4516; GCNX3-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] 4517; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) 4518; GCNX3-HSA-NEXT: flat_store_dwordx4 v[2:3], v[20:23] 4519; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) 4520; GCNX3-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] 4521; GCNX3-HSA-NEXT: s_waitcnt vmcnt(7) 4522; GCNX3-HSA-NEXT: flat_store_dwordx4 v[8:9], v[28:31] 4523; GCNX3-HSA-NEXT: s_endpgm 4524; 4525; GCNX3-NOHSA-LABEL: global_load_v32i32: 4526; GCNX3-NOHSA: ; %bb.0: 4527; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4528; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000 4529; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1 4530; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6 4531; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7 4532; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0) 4533; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2 4534; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3 4535; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112 4536; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96 4537; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80 4538; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64 4539; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:32 4540; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48 4541; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 4542; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16 4543; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0 4544; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1 4545; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6) 4546; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:96 4547; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:112 4548; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6) 4549; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:64 4550; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:80 4551; 
GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) 4552; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32 4553; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) 4554; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:48 4555; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) 4556; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 4557; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7) 4558; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:16 4559; GCNX3-NOHSA-NEXT: s_endpgm 4560; 4561; EG-LABEL: global_load_v32i32: 4562; EG: ; %bb.0: 4563; EG-NEXT: ALU 23, @28, KC0[CB0:0-32], KC1[] 4564; EG-NEXT: TEX 7 @12 4565; EG-NEXT: ALU 1, @52, KC0[], KC1[] 4566; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T15.X, 0 4567; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T6.X, 0 4568; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T5.X, 0 4569; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T4.X, 0 4570; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T3.X, 0 4571; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T2.X, 0 4572; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T1.X, 0 4573; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T0.X, 1 4574; EG-NEXT: CF_END 4575; EG-NEXT: Fetch clause starting at 12: 4576; EG-NEXT: VTX_READ_128 T8.XYZW, T7.X, 96, #1 4577; EG-NEXT: VTX_READ_128 T9.XYZW, T7.X, 112, #1 4578; EG-NEXT: VTX_READ_128 T10.XYZW, T7.X, 64, #1 4579; EG-NEXT: VTX_READ_128 T11.XYZW, T7.X, 80, #1 4580; EG-NEXT: VTX_READ_128 T12.XYZW, T7.X, 32, #1 4581; EG-NEXT: VTX_READ_128 T13.XYZW, T7.X, 48, #1 4582; EG-NEXT: VTX_READ_128 T14.XYZW, T7.X, 0, #1 4583; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 16, #1 4584; EG-NEXT: ALU clause starting at 28: 4585; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 4586; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 4587; EG-NEXT: LSHR T0.X, PV.W, literal.x, 4588; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4589; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4590; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 4591; EG-NEXT: 
48(6.726233e-44), 0(0.000000e+00) 4592; EG-NEXT: LSHR T2.X, PV.W, literal.x, 4593; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4594; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 4595; EG-NEXT: LSHR T3.X, PV.W, literal.x, 4596; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4597; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) 4598; EG-NEXT: LSHR T4.X, PV.W, literal.x, 4599; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4600; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44) 4601; EG-NEXT: LSHR T5.X, PV.W, literal.x, 4602; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4603; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43) 4604; EG-NEXT: LSHR T6.X, PV.W, literal.x, 4605; EG-NEXT: MOV * T7.X, KC0[2].Z, 4606; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4607; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 4608; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00) 4609; EG-NEXT: ALU clause starting at 52: 4610; EG-NEXT: LSHR * T15.X, T0.W, literal.x, 4611; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4612; 4613; GCN-HSA-LABEL: global_load_v32i32: 4614; GCN-HSA: ; %bb.0: 4615; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 4616; GCN-HSA-NEXT: v_mov_b32_e32 v32, 0 4617; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) 4618; GCN-HSA-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3] offset:96 4619; GCN-HSA-NEXT: global_load_dwordx4 v[4:7], v32, s[2:3] offset:112 4620; GCN-HSA-NEXT: global_load_dwordx4 v[8:11], v32, s[2:3] offset:64 4621; GCN-HSA-NEXT: global_load_dwordx4 v[12:15], v32, s[2:3] offset:80 4622; GCN-HSA-NEXT: global_load_dwordx4 v[16:19], v32, s[2:3] offset:32 4623; GCN-HSA-NEXT: global_load_dwordx4 v[20:23], v32, s[2:3] offset:48 4624; GCN-HSA-NEXT: global_load_dwordx4 v[24:27], v32, s[2:3] 4625; GCN-HSA-NEXT: global_load_dwordx4 v[28:31], v32, s[2:3] offset:16 4626; GCN-HSA-NEXT: s_waitcnt vmcnt(7) 4627; GCN-HSA-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 4628; GCN-HSA-NEXT: s_waitcnt vmcnt(7) 4629; GCN-HSA-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 4630; GCN-HSA-NEXT: s_waitcnt 
vmcnt(7) 4631; GCN-HSA-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 4632; GCN-HSA-NEXT: s_waitcnt vmcnt(7) 4633; GCN-HSA-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 4634; GCN-HSA-NEXT: s_waitcnt vmcnt(7) 4635; GCN-HSA-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 4636; GCN-HSA-NEXT: s_waitcnt vmcnt(7) 4637; GCN-HSA-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48 4638; GCN-HSA-NEXT: s_waitcnt vmcnt(7) 4639; GCN-HSA-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] 4640; GCN-HSA-NEXT: s_waitcnt vmcnt(7) 4641; GCN-HSA-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:16 4642; GCN-HSA-NEXT: s_endpgm 4643 %ld = load <32 x i32>, ptr addrspace(1) %in 4644 store <32 x i32> %ld, ptr addrspace(1) %out 4645 ret void 4646} 4647 4648attributes #0 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" } 4649
; -----------------------------------------------------------------------------
; Reviewer notes for the final kernel and the attribute group above.
;
; @global_load_v32i32 loads a single <32 x i32> value from the addrspace(1)
; pointer %in and stores it, unmodified, to the addrspace(1) pointer %out.
; For each checked target configuration the expected output contains eight
; four-dword loads (buffer/flat/global *_load_dwordx4, or VTX_READ_128 on the
; r600 run) and eight matching four-dword stores.
;
; The assertion lines in this file are autogenerated by
; utils/update_llc_test_checks.py (see the first line of the file). When codegen
; changes, regenerate them with that script instead of editing them by hand;
; the exact s_waitcnt counts and register numbers are part of what is checked.
;
; Attribute group #0 is attached to the kernel above. It marks the function
; nounwind and sets the string attribute "amdgpu-flat-work-group-size" to
; "1024,1024" (the AMDGPU backend documents this value as "min,max").
; NOTE(review): whether every other kernel in this file also uses #0 cannot be
; confirmed from this portion of the file alone.