1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=SI %s 3; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI %s 4; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s 5; RUN: llc < %s -mtriple=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -check-prefixes=EGCM,EG %s 6; RUN: llc < %s -mtriple=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -check-prefixes=EGCM,CM %s 7 8define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounwind { 9; SI-LABEL: i8_arg: 10; SI: ; %bb.0: 11; SI-NEXT: s_load_dword s2, s[4:5], 0xb 12; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 13; SI-NEXT: s_mov_b32 s3, 0xf000 14; SI-NEXT: s_waitcnt lgkmcnt(0) 15; SI-NEXT: s_and_b32 s4, s2, 0xff 16; SI-NEXT: s_mov_b32 s2, -1 17; SI-NEXT: v_mov_b32_e32 v0, s4 18; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 19; SI-NEXT: s_endpgm 20; 21; VI-LABEL: i8_arg: 22; VI: ; %bb.0: 23; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 24; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 25; VI-NEXT: s_waitcnt lgkmcnt(0) 26; VI-NEXT: s_and_b32 s2, s2, 0xff 27; VI-NEXT: v_mov_b32_e32 v0, s0 28; VI-NEXT: v_mov_b32_e32 v1, s1 29; VI-NEXT: v_mov_b32_e32 v2, s2 30; VI-NEXT: flat_store_dword v[0:1], v2 31; VI-NEXT: s_endpgm 32; 33; GFX9-LABEL: i8_arg: 34; GFX9: ; %bb.0: 35; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 36; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 37; GFX9-NEXT: v_mov_b32_e32 v0, 0 38; GFX9-NEXT: s_waitcnt lgkmcnt(0) 39; GFX9-NEXT: s_and_b32 s2, s2, 0xff 40; GFX9-NEXT: v_mov_b32_e32 v1, s2 41; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 42; GFX9-NEXT: s_endpgm 43; 44; EG-LABEL: i8_arg: 45; EG: ; %bb.0: 46; EG-NEXT: ALU 0, @8, KC0[], KC1[] 47; EG-NEXT: TEX 0 @6 48; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 49; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 50; EG-NEXT: 
CF_END 51; EG-NEXT: PAD 52; EG-NEXT: Fetch clause starting at 6: 53; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 54; EG-NEXT: ALU clause starting at 8: 55; EG-NEXT: MOV * T0.X, 0.0, 56; EG-NEXT: ALU clause starting at 9: 57; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 58; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 59; 60; CM-LABEL: i8_arg: 61; CM: ; %bb.0: 62; CM-NEXT: ALU 0, @8, KC0[], KC1[] 63; CM-NEXT: TEX 0 @6 64; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 65; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 66; CM-NEXT: CF_END 67; CM-NEXT: PAD 68; CM-NEXT: Fetch clause starting at 6: 69; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 70; CM-NEXT: ALU clause starting at 8: 71; CM-NEXT: MOV * T0.X, 0.0, 72; CM-NEXT: ALU clause starting at 9: 73; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 74; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 75 %ext = zext i8 %in to i32 76 store i32 %ext, ptr addrspace(1) %out, align 4 77 ret void 78} 79 80define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroext %in) nounwind { 81; SI-LABEL: i8_zext_arg: 82; SI: ; %bb.0: 83; SI-NEXT: s_load_dword s2, s[4:5], 0xb 84; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 85; SI-NEXT: s_mov_b32 s3, 0xf000 86; SI-NEXT: s_waitcnt lgkmcnt(0) 87; SI-NEXT: s_and_b32 s4, s2, 0xff 88; SI-NEXT: s_mov_b32 s2, -1 89; SI-NEXT: v_mov_b32_e32 v0, s4 90; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 91; SI-NEXT: s_endpgm 92; 93; VI-LABEL: i8_zext_arg: 94; VI: ; %bb.0: 95; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 96; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 97; VI-NEXT: s_waitcnt lgkmcnt(0) 98; VI-NEXT: s_and_b32 s2, s2, 0xff 99; VI-NEXT: v_mov_b32_e32 v0, s0 100; VI-NEXT: v_mov_b32_e32 v1, s1 101; VI-NEXT: v_mov_b32_e32 v2, s2 102; VI-NEXT: flat_store_dword v[0:1], v2 103; VI-NEXT: s_endpgm 104; 105; GFX9-LABEL: i8_zext_arg: 106; GFX9: ; %bb.0: 107; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 108; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 109; GFX9-NEXT: v_mov_b32_e32 v0, 0 110; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) 111; GFX9-NEXT: s_and_b32 s2, s2, 0xff 112; GFX9-NEXT: v_mov_b32_e32 v1, s2 113; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 114; GFX9-NEXT: s_endpgm 115; 116; EG-LABEL: i8_zext_arg: 117; EG: ; %bb.0: 118; EG-NEXT: ALU 0, @8, KC0[], KC1[] 119; EG-NEXT: TEX 0 @6 120; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 121; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 122; EG-NEXT: CF_END 123; EG-NEXT: PAD 124; EG-NEXT: Fetch clause starting at 6: 125; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 126; EG-NEXT: ALU clause starting at 8: 127; EG-NEXT: MOV * T0.X, 0.0, 128; EG-NEXT: ALU clause starting at 9: 129; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, 130; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 131; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) 132; 133; CM-LABEL: i8_zext_arg: 134; CM: ; %bb.0: 135; CM-NEXT: ALU 0, @8, KC0[], KC1[] 136; CM-NEXT: TEX 0 @6 137; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 138; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 139; CM-NEXT: CF_END 140; CM-NEXT: PAD 141; CM-NEXT: Fetch clause starting at 6: 142; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 143; CM-NEXT: ALU clause starting at 8: 144; CM-NEXT: MOV * T0.X, 0.0, 145; CM-NEXT: ALU clause starting at 9: 146; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, 147; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 148; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 149; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 150 %ext = zext i8 %in to i32 151 store i32 %ext, ptr addrspace(1) %out, align 4 152 ret void 153} 154 155define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signext %in) nounwind { 156; SI-LABEL: i8_sext_arg: 157; SI: ; %bb.0: 158; SI-NEXT: s_load_dword s2, s[4:5], 0xb 159; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 160; SI-NEXT: s_mov_b32 s3, 0xf000 161; SI-NEXT: s_waitcnt lgkmcnt(0) 162; SI-NEXT: s_sext_i32_i8 s4, s2 163; SI-NEXT: s_mov_b32 s2, -1 164; SI-NEXT: v_mov_b32_e32 v0, s4 165; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 166; SI-NEXT: s_endpgm 167; 168; 
VI-LABEL: i8_sext_arg: 169; VI: ; %bb.0: 170; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 171; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 172; VI-NEXT: s_waitcnt lgkmcnt(0) 173; VI-NEXT: s_sext_i32_i8 s2, s2 174; VI-NEXT: v_mov_b32_e32 v0, s0 175; VI-NEXT: v_mov_b32_e32 v1, s1 176; VI-NEXT: v_mov_b32_e32 v2, s2 177; VI-NEXT: flat_store_dword v[0:1], v2 178; VI-NEXT: s_endpgm 179; 180; GFX9-LABEL: i8_sext_arg: 181; GFX9: ; %bb.0: 182; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 183; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 184; GFX9-NEXT: v_mov_b32_e32 v0, 0 185; GFX9-NEXT: s_waitcnt lgkmcnt(0) 186; GFX9-NEXT: s_sext_i32_i8 s2, s2 187; GFX9-NEXT: v_mov_b32_e32 v1, s2 188; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 189; GFX9-NEXT: s_endpgm 190; 191; EG-LABEL: i8_sext_arg: 192; EG: ; %bb.0: 193; EG-NEXT: ALU 0, @8, KC0[], KC1[] 194; EG-NEXT: TEX 0 @6 195; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 196; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 197; EG-NEXT: CF_END 198; EG-NEXT: PAD 199; EG-NEXT: Fetch clause starting at 6: 200; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 201; EG-NEXT: ALU clause starting at 8: 202; EG-NEXT: MOV * T0.X, 0.0, 203; EG-NEXT: ALU clause starting at 9: 204; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, 205; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 206; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) 207; 208; CM-LABEL: i8_sext_arg: 209; CM: ; %bb.0: 210; CM-NEXT: ALU 0, @8, KC0[], KC1[] 211; CM-NEXT: TEX 0 @6 212; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 213; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 214; CM-NEXT: CF_END 215; CM-NEXT: PAD 216; CM-NEXT: Fetch clause starting at 6: 217; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 218; CM-NEXT: ALU clause starting at 8: 219; CM-NEXT: MOV * T0.X, 0.0, 220; CM-NEXT: ALU clause starting at 9: 221; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, 222; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 223; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 224; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 225 %ext = 
sext i8 %in to i32 226 store i32 %ext, ptr addrspace(1) %out, align 4 227 ret void 228} 229 230define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nounwind { 231; SI-LABEL: i16_arg: 232; SI: ; %bb.0: 233; SI-NEXT: s_load_dword s2, s[4:5], 0xb 234; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 235; SI-NEXT: s_mov_b32 s3, 0xf000 236; SI-NEXT: s_waitcnt lgkmcnt(0) 237; SI-NEXT: s_and_b32 s4, s2, 0xffff 238; SI-NEXT: s_mov_b32 s2, -1 239; SI-NEXT: v_mov_b32_e32 v0, s4 240; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 241; SI-NEXT: s_endpgm 242; 243; VI-LABEL: i16_arg: 244; VI: ; %bb.0: 245; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 246; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 247; VI-NEXT: s_waitcnt lgkmcnt(0) 248; VI-NEXT: s_and_b32 s2, s2, 0xffff 249; VI-NEXT: v_mov_b32_e32 v0, s0 250; VI-NEXT: v_mov_b32_e32 v1, s1 251; VI-NEXT: v_mov_b32_e32 v2, s2 252; VI-NEXT: flat_store_dword v[0:1], v2 253; VI-NEXT: s_endpgm 254; 255; GFX9-LABEL: i16_arg: 256; GFX9: ; %bb.0: 257; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 258; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 259; GFX9-NEXT: v_mov_b32_e32 v0, 0 260; GFX9-NEXT: s_waitcnt lgkmcnt(0) 261; GFX9-NEXT: s_and_b32 s2, s2, 0xffff 262; GFX9-NEXT: v_mov_b32_e32 v1, s2 263; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 264; GFX9-NEXT: s_endpgm 265; 266; EG-LABEL: i16_arg: 267; EG: ; %bb.0: 268; EG-NEXT: ALU 0, @8, KC0[], KC1[] 269; EG-NEXT: TEX 0 @6 270; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 271; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 272; EG-NEXT: CF_END 273; EG-NEXT: PAD 274; EG-NEXT: Fetch clause starting at 6: 275; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 276; EG-NEXT: ALU clause starting at 8: 277; EG-NEXT: MOV * T0.X, 0.0, 278; EG-NEXT: ALU clause starting at 9: 279; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 280; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 281; 282; CM-LABEL: i16_arg: 283; CM: ; %bb.0: 284; CM-NEXT: ALU 0, @8, KC0[], KC1[] 285; CM-NEXT: TEX 0 @6 286; CM-NEXT: ALU 1, @9, 
KC0[CB0:0-32], KC1[] 287; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 288; CM-NEXT: CF_END 289; CM-NEXT: PAD 290; CM-NEXT: Fetch clause starting at 6: 291; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 292; CM-NEXT: ALU clause starting at 8: 293; CM-NEXT: MOV * T0.X, 0.0, 294; CM-NEXT: ALU clause starting at 9: 295; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 296; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 297 %ext = zext i16 %in to i32 298 store i32 %ext, ptr addrspace(1) %out, align 4 299 ret void 300} 301 302define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zeroext %in) nounwind { 303; SI-LABEL: i16_zext_arg: 304; SI: ; %bb.0: 305; SI-NEXT: s_load_dword s2, s[4:5], 0xb 306; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 307; SI-NEXT: s_mov_b32 s3, 0xf000 308; SI-NEXT: s_waitcnt lgkmcnt(0) 309; SI-NEXT: s_and_b32 s4, s2, 0xffff 310; SI-NEXT: s_mov_b32 s2, -1 311; SI-NEXT: v_mov_b32_e32 v0, s4 312; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 313; SI-NEXT: s_endpgm 314; 315; VI-LABEL: i16_zext_arg: 316; VI: ; %bb.0: 317; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 318; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 319; VI-NEXT: s_waitcnt lgkmcnt(0) 320; VI-NEXT: s_and_b32 s2, s2, 0xffff 321; VI-NEXT: v_mov_b32_e32 v0, s0 322; VI-NEXT: v_mov_b32_e32 v1, s1 323; VI-NEXT: v_mov_b32_e32 v2, s2 324; VI-NEXT: flat_store_dword v[0:1], v2 325; VI-NEXT: s_endpgm 326; 327; GFX9-LABEL: i16_zext_arg: 328; GFX9: ; %bb.0: 329; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 330; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 331; GFX9-NEXT: v_mov_b32_e32 v0, 0 332; GFX9-NEXT: s_waitcnt lgkmcnt(0) 333; GFX9-NEXT: s_and_b32 s2, s2, 0xffff 334; GFX9-NEXT: v_mov_b32_e32 v1, s2 335; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 336; GFX9-NEXT: s_endpgm 337; 338; EG-LABEL: i16_zext_arg: 339; EG: ; %bb.0: 340; EG-NEXT: ALU 0, @8, KC0[], KC1[] 341; EG-NEXT: TEX 0 @6 342; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 343; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 344; 
EG-NEXT: CF_END 345; EG-NEXT: PAD 346; EG-NEXT: Fetch clause starting at 6: 347; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 348; EG-NEXT: ALU clause starting at 8: 349; EG-NEXT: MOV * T0.X, 0.0, 350; EG-NEXT: ALU clause starting at 9: 351; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, 352; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 353; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) 354; 355; CM-LABEL: i16_zext_arg: 356; CM: ; %bb.0: 357; CM-NEXT: ALU 0, @8, KC0[], KC1[] 358; CM-NEXT: TEX 0 @6 359; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 360; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 361; CM-NEXT: CF_END 362; CM-NEXT: PAD 363; CM-NEXT: Fetch clause starting at 6: 364; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 365; CM-NEXT: ALU clause starting at 8: 366; CM-NEXT: MOV * T0.X, 0.0, 367; CM-NEXT: ALU clause starting at 9: 368; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, 369; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 370; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 371; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 372 %ext = zext i16 %in to i32 373 store i32 %ext, ptr addrspace(1) %out, align 4 374 ret void 375} 376 377define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 signext %in) nounwind { 378; SI-LABEL: i16_sext_arg: 379; SI: ; %bb.0: 380; SI-NEXT: s_load_dword s2, s[4:5], 0xb 381; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 382; SI-NEXT: s_mov_b32 s3, 0xf000 383; SI-NEXT: s_waitcnt lgkmcnt(0) 384; SI-NEXT: s_sext_i32_i16 s4, s2 385; SI-NEXT: s_mov_b32 s2, -1 386; SI-NEXT: v_mov_b32_e32 v0, s4 387; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 388; SI-NEXT: s_endpgm 389; 390; VI-LABEL: i16_sext_arg: 391; VI: ; %bb.0: 392; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 393; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 394; VI-NEXT: s_waitcnt lgkmcnt(0) 395; VI-NEXT: s_sext_i32_i16 s2, s2 396; VI-NEXT: v_mov_b32_e32 v0, s0 397; VI-NEXT: v_mov_b32_e32 v1, s1 398; VI-NEXT: v_mov_b32_e32 v2, s2 399; VI-NEXT: flat_store_dword v[0:1], v2 400; VI-NEXT: 
s_endpgm 401; 402; GFX9-LABEL: i16_sext_arg: 403; GFX9: ; %bb.0: 404; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 405; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 406; GFX9-NEXT: v_mov_b32_e32 v0, 0 407; GFX9-NEXT: s_waitcnt lgkmcnt(0) 408; GFX9-NEXT: s_sext_i32_i16 s2, s2 409; GFX9-NEXT: v_mov_b32_e32 v1, s2 410; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 411; GFX9-NEXT: s_endpgm 412; 413; EG-LABEL: i16_sext_arg: 414; EG: ; %bb.0: 415; EG-NEXT: ALU 0, @8, KC0[], KC1[] 416; EG-NEXT: TEX 0 @6 417; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 418; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 419; EG-NEXT: CF_END 420; EG-NEXT: PAD 421; EG-NEXT: Fetch clause starting at 6: 422; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 423; EG-NEXT: ALU clause starting at 8: 424; EG-NEXT: MOV * T0.X, 0.0, 425; EG-NEXT: ALU clause starting at 9: 426; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x, 427; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 428; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) 429; 430; CM-LABEL: i16_sext_arg: 431; CM: ; %bb.0: 432; CM-NEXT: ALU 0, @8, KC0[], KC1[] 433; CM-NEXT: TEX 0 @6 434; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 435; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 436; CM-NEXT: CF_END 437; CM-NEXT: PAD 438; CM-NEXT: Fetch clause starting at 6: 439; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 440; CM-NEXT: ALU clause starting at 8: 441; CM-NEXT: MOV * T0.X, 0.0, 442; CM-NEXT: ALU clause starting at 9: 443; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x, 444; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 445; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 446; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 447 %ext = sext i16 %in to i32 448 store i32 %ext, ptr addrspace(1) %out, align 4 449 ret void 450} 451 452define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nounwind { 453; SI-LABEL: i32_arg: 454; SI: ; %bb.0: ; %entry 455; SI-NEXT: s_load_dword s6, s[4:5], 0xb 456; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 457; SI-NEXT: s_mov_b32 s3, 
0xf000 458; SI-NEXT: s_mov_b32 s2, -1 459; SI-NEXT: s_waitcnt lgkmcnt(0) 460; SI-NEXT: v_mov_b32_e32 v0, s6 461; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 462; SI-NEXT: s_endpgm 463; 464; VI-LABEL: i32_arg: 465; VI: ; %bb.0: ; %entry 466; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 467; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 468; VI-NEXT: s_waitcnt lgkmcnt(0) 469; VI-NEXT: v_mov_b32_e32 v0, s0 470; VI-NEXT: v_mov_b32_e32 v1, s1 471; VI-NEXT: v_mov_b32_e32 v2, s2 472; VI-NEXT: flat_store_dword v[0:1], v2 473; VI-NEXT: s_endpgm 474; 475; GFX9-LABEL: i32_arg: 476; GFX9: ; %bb.0: ; %entry 477; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 478; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 479; GFX9-NEXT: v_mov_b32_e32 v0, 0 480; GFX9-NEXT: s_waitcnt lgkmcnt(0) 481; GFX9-NEXT: v_mov_b32_e32 v1, s2 482; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 483; GFX9-NEXT: s_endpgm 484; 485; EG-LABEL: i32_arg: 486; EG: ; %bb.0: ; %entry 487; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 488; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 489; EG-NEXT: CF_END 490; EG-NEXT: PAD 491; EG-NEXT: ALU clause starting at 4: 492; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 493; EG-NEXT: MOV * T1.X, KC0[2].Z, 494; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 495; 496; CM-LABEL: i32_arg: 497; CM: ; %bb.0: ; %entry 498; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 499; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 500; CM-NEXT: CF_END 501; CM-NEXT: PAD 502; CM-NEXT: ALU clause starting at 4: 503; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 504; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 505; CM-NEXT: MOV * T1.X, KC0[2].Z, 506entry: 507 store i32 %in, ptr addrspace(1) %out, align 4 508 ret void 509} 510 511define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) nounwind { 512; SI-LABEL: f32_arg: 513; SI: ; %bb.0: ; %entry 514; SI-NEXT: s_load_dword s6, s[4:5], 0xb 515; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 516; SI-NEXT: s_mov_b32 s3, 0xf000 517; SI-NEXT: s_mov_b32 s2, -1 
518; SI-NEXT: s_waitcnt lgkmcnt(0) 519; SI-NEXT: v_mov_b32_e32 v0, s6 520; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 521; SI-NEXT: s_endpgm 522; 523; VI-LABEL: f32_arg: 524; VI: ; %bb.0: ; %entry 525; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 526; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 527; VI-NEXT: s_waitcnt lgkmcnt(0) 528; VI-NEXT: v_mov_b32_e32 v0, s0 529; VI-NEXT: v_mov_b32_e32 v1, s1 530; VI-NEXT: v_mov_b32_e32 v2, s2 531; VI-NEXT: flat_store_dword v[0:1], v2 532; VI-NEXT: s_endpgm 533; 534; GFX9-LABEL: f32_arg: 535; GFX9: ; %bb.0: ; %entry 536; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 537; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 538; GFX9-NEXT: v_mov_b32_e32 v0, 0 539; GFX9-NEXT: s_waitcnt lgkmcnt(0) 540; GFX9-NEXT: v_mov_b32_e32 v1, s2 541; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 542; GFX9-NEXT: s_endpgm 543; 544; EG-LABEL: f32_arg: 545; EG: ; %bb.0: ; %entry 546; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 547; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 548; EG-NEXT: CF_END 549; EG-NEXT: PAD 550; EG-NEXT: ALU clause starting at 4: 551; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 552; EG-NEXT: MOV * T1.X, KC0[2].Z, 553; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 554; 555; CM-LABEL: f32_arg: 556; CM: ; %bb.0: ; %entry 557; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 558; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 559; CM-NEXT: CF_END 560; CM-NEXT: PAD 561; CM-NEXT: ALU clause starting at 4: 562; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 563; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 564; CM-NEXT: MOV * T1.X, KC0[2].Z, 565entry: 566 store float %in, ptr addrspace(1) %out, align 4 567 ret void 568} 569 570define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { 571; SI-LABEL: v2i8_arg: 572; SI: ; %bb.0: ; %entry 573; SI-NEXT: s_load_dword s6, s[4:5], 0xb 574; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 575; SI-NEXT: s_mov_b32 s3, 0xf000 576; SI-NEXT: s_mov_b32 s2, -1 577; SI-NEXT: s_waitcnt lgkmcnt(0) 578; SI-NEXT: 
v_mov_b32_e32 v0, s6 579; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 580; SI-NEXT: s_endpgm 581; 582; VI-LABEL: v2i8_arg: 583; VI: ; %bb.0: ; %entry 584; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 585; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 586; VI-NEXT: s_waitcnt lgkmcnt(0) 587; VI-NEXT: v_mov_b32_e32 v0, s0 588; VI-NEXT: v_mov_b32_e32 v1, s1 589; VI-NEXT: v_mov_b32_e32 v2, s2 590; VI-NEXT: flat_store_short v[0:1], v2 591; VI-NEXT: s_endpgm 592; 593; GFX9-LABEL: v2i8_arg: 594; GFX9: ; %bb.0: ; %entry 595; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 596; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 597; GFX9-NEXT: v_mov_b32_e32 v0, 0 598; GFX9-NEXT: s_waitcnt lgkmcnt(0) 599; GFX9-NEXT: v_mov_b32_e32 v1, s2 600; GFX9-NEXT: global_store_short v0, v1, s[0:1] 601; GFX9-NEXT: s_endpgm 602; 603; EG-LABEL: v2i8_arg: 604; EG: ; %bb.0: ; %entry 605; EG-NEXT: ALU 0, @8, KC0[], KC1[] 606; EG-NEXT: TEX 0 @6 607; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] 608; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 609; EG-NEXT: CF_END 610; EG-NEXT: PAD 611; EG-NEXT: Fetch clause starting at 6: 612; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 613; EG-NEXT: ALU clause starting at 8: 614; EG-NEXT: MOV * T0.X, 0.0, 615; EG-NEXT: ALU clause starting at 9: 616; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, 617; EG-NEXT: AND_INT * T1.W, T0.X, literal.y, 618; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 619; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 620; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 621; EG-NEXT: LSHL T0.X, T1.W, PV.W, 622; EG-NEXT: LSHL * T0.W, literal.x, PV.W, 623; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 624; EG-NEXT: MOV T0.Y, 0.0, 625; EG-NEXT: MOV * T0.Z, 0.0, 626; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 627; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 628; 629; CM-LABEL: v2i8_arg: 630; CM: ; %bb.0: ; %entry 631; CM-NEXT: ALU 0, @8, KC0[], KC1[] 632; CM-NEXT: TEX 0 @6 633; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] 634; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X 635; CM-NEXT: CF_END 636; 
CM-NEXT: PAD 637; CM-NEXT: Fetch clause starting at 6: 638; CM-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 639; CM-NEXT: ALU clause starting at 8: 640; CM-NEXT: MOV * T0.X, 0.0, 641; CM-NEXT: ALU clause starting at 9: 642; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x, 643; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 644; CM-NEXT: AND_INT T0.Z, T0.X, literal.x, 645; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 646; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 647; CM-NEXT: LSHL T0.X, PV.Z, PV.W, 648; CM-NEXT: LSHL * T0.W, literal.x, PV.W, 649; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 650; CM-NEXT: MOV T0.Y, 0.0, 651; CM-NEXT: MOV * T0.Z, 0.0, 652; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 653; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 654entry: 655 store <2 x i8> %in, ptr addrspace(1) %out 656 ret void 657} 658 659define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { 660; SI-LABEL: v2i16_arg: 661; SI: ; %bb.0: ; %entry 662; SI-NEXT: s_load_dword s6, s[4:5], 0xb 663; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 664; SI-NEXT: s_mov_b32 s3, 0xf000 665; SI-NEXT: s_mov_b32 s2, -1 666; SI-NEXT: s_waitcnt lgkmcnt(0) 667; SI-NEXT: v_mov_b32_e32 v0, s6 668; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 669; SI-NEXT: s_endpgm 670; 671; VI-LABEL: v2i16_arg: 672; VI: ; %bb.0: ; %entry 673; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 674; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 675; VI-NEXT: s_waitcnt lgkmcnt(0) 676; VI-NEXT: v_mov_b32_e32 v0, s0 677; VI-NEXT: v_mov_b32_e32 v1, s1 678; VI-NEXT: v_mov_b32_e32 v2, s2 679; VI-NEXT: flat_store_dword v[0:1], v2 680; VI-NEXT: s_endpgm 681; 682; GFX9-LABEL: v2i16_arg: 683; GFX9: ; %bb.0: ; %entry 684; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 685; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 686; GFX9-NEXT: v_mov_b32_e32 v0, 0 687; GFX9-NEXT: s_waitcnt lgkmcnt(0) 688; GFX9-NEXT: v_mov_b32_e32 v1, s2 689; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 690; GFX9-NEXT: s_endpgm 691; 692; EG-LABEL: v2i16_arg: 693; EG: ; %bb.0: 
; %entry 694; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 695; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 696; EG-NEXT: CF_END 697; EG-NEXT: PAD 698; EG-NEXT: ALU clause starting at 4: 699; EG-NEXT: MOV T0.X, KC0[2].Z, 700; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 701; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 702; 703; CM-LABEL: v2i16_arg: 704; CM: ; %bb.0: ; %entry 705; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 706; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 707; CM-NEXT: CF_END 708; CM-NEXT: PAD 709; CM-NEXT: ALU clause starting at 4: 710; CM-NEXT: MOV * T0.X, KC0[2].Z, 711; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 712; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 713entry: 714 store <2 x i16> %in, ptr addrspace(1) %out 715 ret void 716} 717 718define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> %in) nounwind { 719; SI-LABEL: v2i32_arg: 720; SI: ; %bb.0: ; %entry 721; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 722; SI-NEXT: s_mov_b32 s7, 0xf000 723; SI-NEXT: s_mov_b32 s6, -1 724; SI-NEXT: s_waitcnt lgkmcnt(0) 725; SI-NEXT: s_mov_b32 s4, s0 726; SI-NEXT: s_mov_b32 s5, s1 727; SI-NEXT: v_mov_b32_e32 v0, s2 728; SI-NEXT: v_mov_b32_e32 v1, s3 729; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 730; SI-NEXT: s_endpgm 731; 732; VI-LABEL: v2i32_arg: 733; VI: ; %bb.0: ; %entry 734; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 735; VI-NEXT: s_waitcnt lgkmcnt(0) 736; VI-NEXT: v_mov_b32_e32 v0, s0 737; VI-NEXT: v_mov_b32_e32 v2, s2 738; VI-NEXT: v_mov_b32_e32 v1, s1 739; VI-NEXT: v_mov_b32_e32 v3, s3 740; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 741; VI-NEXT: s_endpgm 742; 743; GFX9-LABEL: v2i32_arg: 744; GFX9: ; %bb.0: ; %entry 745; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 746; GFX9-NEXT: v_mov_b32_e32 v2, 0 747; GFX9-NEXT: s_waitcnt lgkmcnt(0) 748; GFX9-NEXT: v_mov_b32_e32 v0, s2 749; GFX9-NEXT: v_mov_b32_e32 v1, s3 750; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 751; GFX9-NEXT: s_endpgm 752; 753; EG-LABEL: 
v2i32_arg: 754; EG: ; %bb.0: ; %entry 755; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 756; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 757; EG-NEXT: CF_END 758; EG-NEXT: PAD 759; EG-NEXT: ALU clause starting at 4: 760; EG-NEXT: MOV * T0.Y, KC0[3].X, 761; EG-NEXT: MOV T0.X, KC0[2].W, 762; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 763; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 764; 765; CM-LABEL: v2i32_arg: 766; CM: ; %bb.0: ; %entry 767; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 768; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 769; CM-NEXT: CF_END 770; CM-NEXT: PAD 771; CM-NEXT: ALU clause starting at 4: 772; CM-NEXT: MOV * T0.Y, KC0[3].X, 773; CM-NEXT: MOV * T0.X, KC0[2].W, 774; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 775; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 776entry: 777 store <2 x i32> %in, ptr addrspace(1) %out, align 4 778 ret void 779} 780 781define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float> %in) nounwind { 782; SI-LABEL: v2f32_arg: 783; SI: ; %bb.0: ; %entry 784; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 785; SI-NEXT: s_mov_b32 s7, 0xf000 786; SI-NEXT: s_mov_b32 s6, -1 787; SI-NEXT: s_waitcnt lgkmcnt(0) 788; SI-NEXT: s_mov_b32 s4, s0 789; SI-NEXT: s_mov_b32 s5, s1 790; SI-NEXT: v_mov_b32_e32 v0, s2 791; SI-NEXT: v_mov_b32_e32 v1, s3 792; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 793; SI-NEXT: s_endpgm 794; 795; VI-LABEL: v2f32_arg: 796; VI: ; %bb.0: ; %entry 797; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 798; VI-NEXT: s_waitcnt lgkmcnt(0) 799; VI-NEXT: v_mov_b32_e32 v0, s0 800; VI-NEXT: v_mov_b32_e32 v2, s2 801; VI-NEXT: v_mov_b32_e32 v1, s1 802; VI-NEXT: v_mov_b32_e32 v3, s3 803; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 804; VI-NEXT: s_endpgm 805; 806; GFX9-LABEL: v2f32_arg: 807; GFX9: ; %bb.0: ; %entry 808; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 809; GFX9-NEXT: v_mov_b32_e32 v2, 0 810; GFX9-NEXT: s_waitcnt lgkmcnt(0) 811; GFX9-NEXT: v_mov_b32_e32 v0, s2 812; GFX9-NEXT: v_mov_b32_e32 
v1, s3 813; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 814; GFX9-NEXT: s_endpgm 815; 816; EG-LABEL: v2f32_arg: 817; EG: ; %bb.0: ; %entry 818; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 819; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 820; EG-NEXT: CF_END 821; EG-NEXT: PAD 822; EG-NEXT: ALU clause starting at 4: 823; EG-NEXT: MOV * T0.Y, KC0[3].X, 824; EG-NEXT: MOV T0.X, KC0[2].W, 825; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 826; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 827; 828; CM-LABEL: v2f32_arg: 829; CM: ; %bb.0: ; %entry 830; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 831; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 832; CM-NEXT: CF_END 833; CM-NEXT: PAD 834; CM-NEXT: ALU clause starting at 4: 835; CM-NEXT: MOV * T0.Y, KC0[3].X, 836; CM-NEXT: MOV * T0.X, KC0[2].W, 837; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 838; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 839entry: 840 store <2 x float> %in, ptr addrspace(1) %out, align 4 841 ret void 842} 843 844define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %in) nounwind { 845; SI-LABEL: v3i8_arg: 846; SI: ; %bb.0: ; %entry 847; SI-NEXT: s_load_dword s6, s[4:5], 0xb 848; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 849; SI-NEXT: s_mov_b32 s3, 0xf000 850; SI-NEXT: s_waitcnt lgkmcnt(0) 851; SI-NEXT: s_lshr_b32 s4, s6, 16 852; SI-NEXT: s_mov_b32 s2, -1 853; SI-NEXT: v_mov_b32_e32 v0, s6 854; SI-NEXT: v_mov_b32_e32 v1, s4 855; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 856; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 857; SI-NEXT: s_endpgm 858; 859; VI-LABEL: v3i8_arg: 860; VI: ; %bb.0: ; %entry 861; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 862; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 863; VI-NEXT: s_waitcnt lgkmcnt(0) 864; VI-NEXT: s_lshr_b32 s3, s2, 16 865; VI-NEXT: v_mov_b32_e32 v0, s0 866; VI-NEXT: v_mov_b32_e32 v1, s1 867; VI-NEXT: s_add_u32 s0, s0, 2 868; VI-NEXT: s_addc_u32 s1, s1, 0 869; VI-NEXT: v_mov_b32_e32 v3, s1 870; VI-NEXT: v_mov_b32_e32 v5, s3 
871; VI-NEXT: v_mov_b32_e32 v2, s0 872; VI-NEXT: v_mov_b32_e32 v4, s2 873; VI-NEXT: flat_store_byte v[2:3], v5 874; VI-NEXT: flat_store_short v[0:1], v4 875; VI-NEXT: s_endpgm 876; 877; GFX9-LABEL: v3i8_arg: 878; GFX9: ; %bb.0: ; %entry 879; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 880; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 881; GFX9-NEXT: v_mov_b32_e32 v0, 0 882; GFX9-NEXT: s_waitcnt lgkmcnt(0) 883; GFX9-NEXT: v_mov_b32_e32 v1, s2 884; GFX9-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:2 885; GFX9-NEXT: global_store_short v0, v1, s[0:1] 886; GFX9-NEXT: s_endpgm 887; 888; EG-LABEL: v3i8_arg: 889; EG: ; %bb.0: ; %entry 890; EG-NEXT: ALU 0, @12, KC0[], KC1[] 891; EG-NEXT: TEX 2 @6 892; EG-NEXT: ALU 28, @13, KC0[CB0:0-32], KC1[] 893; EG-NEXT: MEM_RAT MSKOR T4.XW, T7.X 894; EG-NEXT: MEM_RAT MSKOR T5.XW, T6.X 895; EG-NEXT: CF_END 896; EG-NEXT: Fetch clause starting at 6: 897; EG-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3 898; EG-NEXT: VTX_READ_8 T6.X, T4.X, 42, #3 899; EG-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3 900; EG-NEXT: ALU clause starting at 12: 901; EG-NEXT: MOV * T4.X, 0.0, 902; EG-NEXT: ALU clause starting at 13: 903; EG-NEXT: LSHL T0.W, T5.X, literal.x, 904; EG-NEXT: AND_INT * T1.W, T4.X, literal.y, 905; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43) 906; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x, 907; EG-NEXT: OR_INT * T0.W, PV.W, PS, 908; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 909; EG-NEXT: AND_INT T0.W, PS, literal.x, 910; EG-NEXT: LSHL * T1.W, PV.W, literal.y, 911; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 912; EG-NEXT: LSHL T4.X, PV.W, PS, 913; EG-NEXT: LSHL * T4.W, literal.x, PS, 914; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 915; EG-NEXT: MOV T4.Y, 0.0, 916; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 917; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 918; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 919; EG-NEXT: AND_INT * T2.W, T6.X, literal.y, 920; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43) 921; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 922; 
EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 923; EG-NEXT: LSHL T5.X, T2.W, PV.W, 924; EG-NEXT: LSHL * T5.W, literal.x, PV.W, 925; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 926; EG-NEXT: MOV T5.Y, 0.0, 927; EG-NEXT: MOV T4.Z, 0.0, 928; EG-NEXT: MOV * T5.Z, 0.0, 929; EG-NEXT: LSHR T6.X, T0.W, literal.x, 930; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 931; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 932; 933; CM-LABEL: v3i8_arg: 934; CM: ; %bb.0: ; %entry 935; CM-NEXT: ALU 0, @12, KC0[], KC1[] 936; CM-NEXT: TEX 2 @6 937; CM-NEXT: ALU 29, @13, KC0[CB0:0-32], KC1[] 938; CM-NEXT: MEM_RAT MSKOR T4.XW, T7.X 939; CM-NEXT: MEM_RAT MSKOR T5.XW, T6.X 940; CM-NEXT: CF_END 941; CM-NEXT: Fetch clause starting at 6: 942; CM-NEXT: VTX_READ_8 T5.X, T4.X, 41, #3 943; CM-NEXT: VTX_READ_8 T6.X, T4.X, 42, #3 944; CM-NEXT: VTX_READ_8 T4.X, T4.X, 40, #3 945; CM-NEXT: ALU clause starting at 12: 946; CM-NEXT: MOV * T4.X, 0.0, 947; CM-NEXT: ALU clause starting at 13: 948; CM-NEXT: LSHL T0.Z, T5.X, literal.x, 949; CM-NEXT: AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212 950; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43) 951; CM-NEXT: AND_INT T1.Z, KC0[2].Y, literal.x, 952; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 953; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 954; CM-NEXT: AND_INT T0.Z, PV.W, literal.x, 955; CM-NEXT: LSHL * T0.W, PV.Z, literal.y, 956; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 957; CM-NEXT: LSHL T4.X, PV.Z, PV.W, 958; CM-NEXT: LSHL * T4.W, literal.x, PV.W, 959; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 960; CM-NEXT: MOV T4.Y, 0.0, 961; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 962; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 963; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 964; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 965; CM-NEXT: AND_INT T0.Z, T6.X, literal.x, 966; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 967; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45) 968; CM-NEXT: LSHL T5.X, PV.Z, PV.W, 969; CM-NEXT: LSHL * T5.W, literal.x, PV.W, 970; CM-NEXT: 255(3.573311e-43), 
0(0.000000e+00) 971; CM-NEXT: MOV T5.Y, 0.0, 972; CM-NEXT: MOV * T4.Z, 0.0, 973; CM-NEXT: MOV * T5.Z, 0.0, 974; CM-NEXT: LSHR * T6.X, T0.W, literal.x, 975; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 976; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 977; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 978entry: 979 store <3 x i8> %in, ptr addrspace(1) %out, align 4 980 ret void 981} 982 983define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) nounwind { 984; SI-LABEL: v3i16_arg: 985; SI: ; %bb.0: ; %entry 986; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 987; SI-NEXT: s_mov_b32 s7, 0xf000 988; SI-NEXT: s_mov_b32 s6, -1 989; SI-NEXT: s_waitcnt lgkmcnt(0) 990; SI-NEXT: s_mov_b32 s4, s0 991; SI-NEXT: s_mov_b32 s5, s1 992; SI-NEXT: v_mov_b32_e32 v0, s3 993; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 994; SI-NEXT: s_waitcnt expcnt(0) 995; SI-NEXT: v_mov_b32_e32 v0, s2 996; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 997; SI-NEXT: s_endpgm 998; 999; VI-LABEL: v3i16_arg: 1000; VI: ; %bb.0: ; %entry 1001; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1002; VI-NEXT: s_waitcnt lgkmcnt(0) 1003; VI-NEXT: s_add_u32 s4, s0, 4 1004; VI-NEXT: s_addc_u32 s5, s1, 0 1005; VI-NEXT: v_mov_b32_e32 v2, s4 1006; VI-NEXT: v_mov_b32_e32 v4, s3 1007; VI-NEXT: v_mov_b32_e32 v0, s0 1008; VI-NEXT: v_mov_b32_e32 v3, s5 1009; VI-NEXT: v_mov_b32_e32 v1, s1 1010; VI-NEXT: v_mov_b32_e32 v5, s2 1011; VI-NEXT: flat_store_short v[2:3], v4 1012; VI-NEXT: flat_store_dword v[0:1], v5 1013; VI-NEXT: s_endpgm 1014; 1015; GFX9-LABEL: v3i16_arg: 1016; GFX9: ; %bb.0: ; %entry 1017; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1018; GFX9-NEXT: v_mov_b32_e32 v0, 0 1019; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1020; GFX9-NEXT: v_mov_b32_e32 v1, s3 1021; GFX9-NEXT: v_mov_b32_e32 v2, s2 1022; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:4 1023; GFX9-NEXT: global_store_dword v0, v2, s[0:1] 1024; GFX9-NEXT: s_endpgm 1025; 1026; EG-LABEL: v3i16_arg: 1027; EG: ; %bb.0: ; %entry 1028; 
EG-NEXT: ALU 0, @12, KC0[], KC1[] 1029; EG-NEXT: TEX 2 @6 1030; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] 1031; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0 1032; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X 1033; EG-NEXT: CF_END 1034; EG-NEXT: Fetch clause starting at 6: 1035; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3 1036; EG-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3 1037; EG-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3 1038; EG-NEXT: ALU clause starting at 12: 1039; EG-NEXT: MOV * T5.X, 0.0, 1040; EG-NEXT: ALU clause starting at 13: 1041; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1042; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1043; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 1044; EG-NEXT: AND_INT * T2.W, T5.X, literal.y, 1045; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1046; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1047; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1048; EG-NEXT: LSHL T5.X, T2.W, PV.W, 1049; EG-NEXT: LSHL * T5.W, literal.x, PV.W, 1050; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1051; EG-NEXT: MOV T5.Y, 0.0, 1052; EG-NEXT: MOV * T5.Z, 0.0, 1053; EG-NEXT: LSHR T8.X, T0.W, literal.x, 1054; EG-NEXT: LSHL T0.W, T7.X, literal.y, 1055; EG-NEXT: AND_INT * T1.W, T6.X, literal.z, 1056; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) 1057; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1058; EG-NEXT: OR_INT T6.X, PV.W, PS, 1059; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1060; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1061; 1062; CM-LABEL: v3i16_arg: 1063; CM: ; %bb.0: ; %entry 1064; CM-NEXT: ALU 0, @12, KC0[], KC1[] 1065; CM-NEXT: TEX 2 @6 1066; CM-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] 1067; CM-NEXT: MEM_RAT MSKOR T5.XW, T8.X 1068; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X 1069; CM-NEXT: CF_END 1070; CM-NEXT: Fetch clause starting at 6: 1071; CM-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3 1072; CM-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3 1073; CM-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3 1074; CM-NEXT: ALU clause starting at 12: 1075; CM-NEXT: MOV * T5.X, 0.0, 1076; CM-NEXT: ALU 
clause starting at 13: 1077; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1078; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1079; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 1080; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1081; CM-NEXT: AND_INT T0.Z, T5.X, literal.x, 1082; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 1083; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1084; CM-NEXT: LSHL T5.X, PV.Z, PV.W, 1085; CM-NEXT: LSHL * T5.W, literal.x, PV.W, 1086; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1087; CM-NEXT: MOV T5.Y, 0.0, 1088; CM-NEXT: MOV * T5.Z, 0.0, 1089; CM-NEXT: LSHL T0.Z, T7.X, literal.x, 1090; CM-NEXT: AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212 1091; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 1092; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W, 1093; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1094; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1095; CM-NEXT: LSHR * T8.X, T0.W, literal.x, 1096; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1097entry: 1098 store <3 x i16> %in, ptr addrspace(1) %out, align 4 1099 ret void 1100} 1101 1102define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> %in) nounwind { 1103; SI-LABEL: v3i32_arg: 1104; SI: ; %bb.0: ; %entry 1105; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd 1106; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 1107; SI-NEXT: s_mov_b32 s7, 0xf000 1108; SI-NEXT: s_mov_b32 s6, -1 1109; SI-NEXT: s_waitcnt lgkmcnt(0) 1110; SI-NEXT: v_mov_b32_e32 v0, s2 1111; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 1112; SI-NEXT: s_waitcnt expcnt(0) 1113; SI-NEXT: v_mov_b32_e32 v0, s0 1114; SI-NEXT: v_mov_b32_e32 v1, s1 1115; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1116; SI-NEXT: s_endpgm 1117; 1118; VI-LABEL: v3i32_arg: 1119; VI: ; %bb.0: ; %entry 1120; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 1121; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 1122; VI-NEXT: s_waitcnt lgkmcnt(0) 1123; VI-NEXT: v_mov_b32_e32 v0, s0 1124; VI-NEXT: v_mov_b32_e32 v3, s4 1125; VI-NEXT: v_mov_b32_e32 v1, 
s1 1126; VI-NEXT: v_mov_b32_e32 v2, s2 1127; VI-NEXT: v_mov_b32_e32 v4, s5 1128; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 1129; VI-NEXT: s_endpgm 1130; 1131; GFX9-LABEL: v3i32_arg: 1132; GFX9: ; %bb.0: ; %entry 1133; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 1134; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 1135; GFX9-NEXT: v_mov_b32_e32 v3, 0 1136; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1137; GFX9-NEXT: v_mov_b32_e32 v0, s0 1138; GFX9-NEXT: v_mov_b32_e32 v1, s1 1139; GFX9-NEXT: v_mov_b32_e32 v2, s2 1140; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] 1141; GFX9-NEXT: s_endpgm 1142; 1143; EG-LABEL: v3i32_arg: 1144; EG: ; %bb.0: ; %entry 1145; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1146; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 1147; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1148; EG-NEXT: CF_END 1149; EG-NEXT: ALU clause starting at 4: 1150; EG-NEXT: MOV * T0.Y, KC0[3].Z, 1151; EG-NEXT: MOV T0.X, KC0[3].Y, 1152; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1153; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1154; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1155; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1156; EG-NEXT: LSHR T2.X, PV.W, literal.x, 1157; EG-NEXT: MOV * T3.X, KC0[3].W, 1158; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1159; 1160; CM-LABEL: v3i32_arg: 1161; CM: ; %bb.0: ; %entry 1162; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1163; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T3.X 1164; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 1165; CM-NEXT: CF_END 1166; CM-NEXT: ALU clause starting at 4: 1167; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1168; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1169; CM-NEXT: LSHR * T0.X, PV.W, literal.x, 1170; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1171; CM-NEXT: MOV T1.X, KC0[3].W, 1172; CM-NEXT: MOV * T2.Y, KC0[3].Z, 1173; CM-NEXT: MOV * T2.X, KC0[3].Y, 1174; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 1175; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1176entry: 1177 store <3 x i32> %in, ptr 
addrspace(1) %out, align 4 1178 ret void 1179} 1180 1181define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float> %in) nounwind { 1182; SI-LABEL: v3f32_arg: 1183; SI: ; %bb.0: ; %entry 1184; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd 1185; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 1186; SI-NEXT: s_mov_b32 s7, 0xf000 1187; SI-NEXT: s_mov_b32 s6, -1 1188; SI-NEXT: s_waitcnt lgkmcnt(0) 1189; SI-NEXT: v_mov_b32_e32 v0, s2 1190; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 1191; SI-NEXT: s_waitcnt expcnt(0) 1192; SI-NEXT: v_mov_b32_e32 v0, s0 1193; SI-NEXT: v_mov_b32_e32 v1, s1 1194; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1195; SI-NEXT: s_endpgm 1196; 1197; VI-LABEL: v3f32_arg: 1198; VI: ; %bb.0: ; %entry 1199; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 1200; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 1201; VI-NEXT: s_waitcnt lgkmcnt(0) 1202; VI-NEXT: v_mov_b32_e32 v0, s0 1203; VI-NEXT: v_mov_b32_e32 v3, s4 1204; VI-NEXT: v_mov_b32_e32 v1, s1 1205; VI-NEXT: v_mov_b32_e32 v2, s2 1206; VI-NEXT: v_mov_b32_e32 v4, s5 1207; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 1208; VI-NEXT: s_endpgm 1209; 1210; GFX9-LABEL: v3f32_arg: 1211; GFX9: ; %bb.0: ; %entry 1212; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 1213; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 1214; GFX9-NEXT: v_mov_b32_e32 v3, 0 1215; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1216; GFX9-NEXT: v_mov_b32_e32 v0, s0 1217; GFX9-NEXT: v_mov_b32_e32 v1, s1 1218; GFX9-NEXT: v_mov_b32_e32 v2, s2 1219; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] 1220; GFX9-NEXT: s_endpgm 1221; 1222; EG-LABEL: v3f32_arg: 1223; EG: ; %bb.0: ; %entry 1224; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1225; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 1226; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1227; EG-NEXT: CF_END 1228; EG-NEXT: ALU clause starting at 4: 1229; EG-NEXT: MOV * T0.Y, KC0[3].Z, 1230; EG-NEXT: MOV T0.X, KC0[3].Y, 1231; EG-NEXT: LSHR * T1.X, KC0[2].Y, 
literal.x, 1232; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1233; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1234; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1235; EG-NEXT: LSHR T2.X, PV.W, literal.x, 1236; EG-NEXT: MOV * T3.X, KC0[3].W, 1237; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1238; 1239; CM-LABEL: v3f32_arg: 1240; CM: ; %bb.0: ; %entry 1241; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1242; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T3.X 1243; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 1244; CM-NEXT: CF_END 1245; CM-NEXT: ALU clause starting at 4: 1246; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1247; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1248; CM-NEXT: LSHR * T0.X, PV.W, literal.x, 1249; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1250; CM-NEXT: MOV T1.X, KC0[3].W, 1251; CM-NEXT: MOV * T2.Y, KC0[3].Z, 1252; CM-NEXT: MOV * T2.X, KC0[3].Y, 1253; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 1254; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1255entry: 1256 store <3 x float> %in, ptr addrspace(1) %out, align 4 1257 ret void 1258} 1259 1260define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { 1261; SI-LABEL: v4i8_arg: 1262; SI: ; %bb.0: ; %entry 1263; SI-NEXT: s_load_dword s6, s[4:5], 0xb 1264; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1265; SI-NEXT: s_mov_b32 s3, 0xf000 1266; SI-NEXT: s_mov_b32 s2, -1 1267; SI-NEXT: s_waitcnt lgkmcnt(0) 1268; SI-NEXT: v_mov_b32_e32 v0, s6 1269; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1270; SI-NEXT: s_endpgm 1271; 1272; VI-LABEL: v4i8_arg: 1273; VI: ; %bb.0: ; %entry 1274; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1275; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 1276; VI-NEXT: s_waitcnt lgkmcnt(0) 1277; VI-NEXT: v_mov_b32_e32 v0, s0 1278; VI-NEXT: v_mov_b32_e32 v1, s1 1279; VI-NEXT: v_mov_b32_e32 v2, s2 1280; VI-NEXT: flat_store_dword v[0:1], v2 1281; VI-NEXT: s_endpgm 1282; 1283; GFX9-LABEL: v4i8_arg: 1284; GFX9: ; %bb.0: ; %entry 1285; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 1286; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[8:9], 0x0 1287; GFX9-NEXT: v_mov_b32_e32 v0, 0 1288; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1289; GFX9-NEXT: v_mov_b32_e32 v1, s2 1290; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1291; GFX9-NEXT: s_endpgm 1292; 1293; EG-LABEL: v4i8_arg: 1294; EG: ; %bb.0: ; %entry 1295; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 1296; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1297; EG-NEXT: CF_END 1298; EG-NEXT: PAD 1299; EG-NEXT: ALU clause starting at 4: 1300; EG-NEXT: MOV T0.X, KC0[2].Z, 1301; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1302; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1303; 1304; CM-LABEL: v4i8_arg: 1305; CM: ; %bb.0: ; %entry 1306; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 1307; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 1308; CM-NEXT: CF_END 1309; CM-NEXT: PAD 1310; CM-NEXT: ALU clause starting at 4: 1311; CM-NEXT: MOV * T0.X, KC0[2].Z, 1312; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1313; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1314entry: 1315 store <4 x i8> %in, ptr addrspace(1) %out 1316 ret void 1317} 1318 1319define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { 1320; SI-LABEL: v4i16_arg: 1321; SI: ; %bb.0: ; %entry 1322; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1323; SI-NEXT: s_mov_b32 s7, 0xf000 1324; SI-NEXT: s_mov_b32 s6, -1 1325; SI-NEXT: s_waitcnt lgkmcnt(0) 1326; SI-NEXT: s_mov_b32 s4, s0 1327; SI-NEXT: s_mov_b32 s5, s1 1328; SI-NEXT: v_mov_b32_e32 v0, s2 1329; SI-NEXT: v_mov_b32_e32 v1, s3 1330; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1331; SI-NEXT: s_endpgm 1332; 1333; VI-LABEL: v4i16_arg: 1334; VI: ; %bb.0: ; %entry 1335; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1336; VI-NEXT: s_waitcnt lgkmcnt(0) 1337; VI-NEXT: v_mov_b32_e32 v0, s0 1338; VI-NEXT: v_mov_b32_e32 v2, s2 1339; VI-NEXT: v_mov_b32_e32 v1, s1 1340; VI-NEXT: v_mov_b32_e32 v3, s3 1341; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1342; VI-NEXT: s_endpgm 1343; 1344; GFX9-LABEL: v4i16_arg: 1345; GFX9: ; %bb.0: ; 
%entry 1346; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1347; GFX9-NEXT: v_mov_b32_e32 v2, 0 1348; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1349; GFX9-NEXT: v_mov_b32_e32 v0, s2 1350; GFX9-NEXT: v_mov_b32_e32 v1, s3 1351; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1352; GFX9-NEXT: s_endpgm 1353; 1354; EG-LABEL: v4i16_arg: 1355; EG: ; %bb.0: ; %entry 1356; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 1357; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1358; EG-NEXT: CF_END 1359; EG-NEXT: PAD 1360; EG-NEXT: ALU clause starting at 4: 1361; EG-NEXT: MOV * T0.Y, KC0[3].X, 1362; EG-NEXT: MOV T0.X, KC0[2].W, 1363; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1364; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1365; 1366; CM-LABEL: v4i16_arg: 1367; CM: ; %bb.0: ; %entry 1368; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 1369; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 1370; CM-NEXT: CF_END 1371; CM-NEXT: PAD 1372; CM-NEXT: ALU clause starting at 4: 1373; CM-NEXT: MOV * T0.Y, KC0[3].X, 1374; CM-NEXT: MOV * T0.X, KC0[2].W, 1375; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1376; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1377entry: 1378 store <4 x i16> %in, ptr addrspace(1) %out 1379 ret void 1380} 1381 1382define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> %in) nounwind { 1383; SI-LABEL: v4i32_arg: 1384; SI: ; %bb.0: ; %entry 1385; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd 1386; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 1387; SI-NEXT: s_mov_b32 s7, 0xf000 1388; SI-NEXT: s_mov_b32 s6, -1 1389; SI-NEXT: s_waitcnt lgkmcnt(0) 1390; SI-NEXT: v_mov_b32_e32 v0, s0 1391; SI-NEXT: v_mov_b32_e32 v1, s1 1392; SI-NEXT: v_mov_b32_e32 v2, s2 1393; SI-NEXT: v_mov_b32_e32 v3, s3 1394; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1395; SI-NEXT: s_endpgm 1396; 1397; VI-LABEL: v4i32_arg: 1398; VI: ; %bb.0: ; %entry 1399; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 1400; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 1401; VI-NEXT: s_waitcnt lgkmcnt(0) 1402; 
VI-NEXT: v_mov_b32_e32 v4, s6 1403; VI-NEXT: v_mov_b32_e32 v0, s0 1404; VI-NEXT: v_mov_b32_e32 v5, s7 1405; VI-NEXT: v_mov_b32_e32 v1, s1 1406; VI-NEXT: v_mov_b32_e32 v2, s2 1407; VI-NEXT: v_mov_b32_e32 v3, s3 1408; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1409; VI-NEXT: s_endpgm 1410; 1411; GFX9-LABEL: v4i32_arg: 1412; GFX9: ; %bb.0: ; %entry 1413; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 1414; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 1415; GFX9-NEXT: v_mov_b32_e32 v4, 0 1416; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1417; GFX9-NEXT: v_mov_b32_e32 v0, s0 1418; GFX9-NEXT: v_mov_b32_e32 v1, s1 1419; GFX9-NEXT: v_mov_b32_e32 v2, s2 1420; GFX9-NEXT: v_mov_b32_e32 v3, s3 1421; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] 1422; GFX9-NEXT: s_endpgm 1423; 1424; EG-LABEL: v4i32_arg: 1425; EG: ; %bb.0: ; %entry 1426; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 1427; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 1428; EG-NEXT: CF_END 1429; EG-NEXT: PAD 1430; EG-NEXT: ALU clause starting at 4: 1431; EG-NEXT: MOV * T0.W, KC0[4].X, 1432; EG-NEXT: MOV * T0.Z, KC0[3].W, 1433; EG-NEXT: MOV * T0.Y, KC0[3].Z, 1434; EG-NEXT: MOV T0.X, KC0[3].Y, 1435; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1436; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1437; 1438; CM-LABEL: v4i32_arg: 1439; CM: ; %bb.0: ; %entry 1440; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 1441; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 1442; CM-NEXT: CF_END 1443; CM-NEXT: PAD 1444; CM-NEXT: ALU clause starting at 4: 1445; CM-NEXT: MOV * T0.W, KC0[4].X, 1446; CM-NEXT: MOV * T0.Z, KC0[3].W, 1447; CM-NEXT: MOV * T0.Y, KC0[3].Z, 1448; CM-NEXT: MOV * T0.X, KC0[3].Y, 1449; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1450; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1451entry: 1452 store <4 x i32> %in, ptr addrspace(1) %out, align 4 1453 ret void 1454} 1455 1456define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float> %in) nounwind { 1457; SI-LABEL: v4f32_arg: 1458; SI: ; %bb.0: ; %entry 
1459; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd 1460; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 1461; SI-NEXT: s_mov_b32 s7, 0xf000 1462; SI-NEXT: s_mov_b32 s6, -1 1463; SI-NEXT: s_waitcnt lgkmcnt(0) 1464; SI-NEXT: v_mov_b32_e32 v0, s0 1465; SI-NEXT: v_mov_b32_e32 v1, s1 1466; SI-NEXT: v_mov_b32_e32 v2, s2 1467; SI-NEXT: v_mov_b32_e32 v3, s3 1468; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1469; SI-NEXT: s_endpgm 1470; 1471; VI-LABEL: v4f32_arg: 1472; VI: ; %bb.0: ; %entry 1473; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 1474; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 1475; VI-NEXT: s_waitcnt lgkmcnt(0) 1476; VI-NEXT: v_mov_b32_e32 v4, s6 1477; VI-NEXT: v_mov_b32_e32 v0, s0 1478; VI-NEXT: v_mov_b32_e32 v5, s7 1479; VI-NEXT: v_mov_b32_e32 v1, s1 1480; VI-NEXT: v_mov_b32_e32 v2, s2 1481; VI-NEXT: v_mov_b32_e32 v3, s3 1482; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1483; VI-NEXT: s_endpgm 1484; 1485; GFX9-LABEL: v4f32_arg: 1486; GFX9: ; %bb.0: ; %entry 1487; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 1488; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 1489; GFX9-NEXT: v_mov_b32_e32 v4, 0 1490; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1491; GFX9-NEXT: v_mov_b32_e32 v0, s0 1492; GFX9-NEXT: v_mov_b32_e32 v1, s1 1493; GFX9-NEXT: v_mov_b32_e32 v2, s2 1494; GFX9-NEXT: v_mov_b32_e32 v3, s3 1495; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] 1496; GFX9-NEXT: s_endpgm 1497; 1498; EG-LABEL: v4f32_arg: 1499; EG: ; %bb.0: ; %entry 1500; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 1501; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 1502; EG-NEXT: CF_END 1503; EG-NEXT: PAD 1504; EG-NEXT: ALU clause starting at 4: 1505; EG-NEXT: MOV * T0.W, KC0[4].X, 1506; EG-NEXT: MOV * T0.Z, KC0[3].W, 1507; EG-NEXT: MOV * T0.Y, KC0[3].Z, 1508; EG-NEXT: MOV T0.X, KC0[3].Y, 1509; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1510; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1511; 1512; CM-LABEL: v4f32_arg: 1513; CM: ; %bb.0: ; %entry 1514; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 
1515; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 1516; CM-NEXT: CF_END 1517; CM-NEXT: PAD 1518; CM-NEXT: ALU clause starting at 4: 1519; CM-NEXT: MOV * T0.W, KC0[4].X, 1520; CM-NEXT: MOV * T0.Z, KC0[3].W, 1521; CM-NEXT: MOV * T0.Y, KC0[3].Z, 1522; CM-NEXT: MOV * T0.X, KC0[3].Y, 1523; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1524; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1525entry: 1526 store <4 x float> %in, ptr addrspace(1) %out, align 4 1527 ret void 1528} 1529 1530define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) nounwind { 1531; SI-LABEL: v5i8_arg: 1532; SI: ; %bb.0: ; %entry 1533; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1534; SI-NEXT: s_mov_b32 s7, 0xf000 1535; SI-NEXT: s_mov_b32 s6, -1 1536; SI-NEXT: s_waitcnt lgkmcnt(0) 1537; SI-NEXT: s_mov_b32 s4, s0 1538; SI-NEXT: s_mov_b32 s5, s1 1539; SI-NEXT: v_mov_b32_e32 v0, s3 1540; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:4 1541; SI-NEXT: s_waitcnt expcnt(0) 1542; SI-NEXT: v_mov_b32_e32 v0, s2 1543; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1544; SI-NEXT: s_endpgm 1545; 1546; VI-LABEL: v5i8_arg: 1547; VI: ; %bb.0: ; %entry 1548; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1549; VI-NEXT: s_waitcnt lgkmcnt(0) 1550; VI-NEXT: s_add_u32 s4, s0, 4 1551; VI-NEXT: s_addc_u32 s5, s1, 0 1552; VI-NEXT: v_mov_b32_e32 v2, s4 1553; VI-NEXT: v_mov_b32_e32 v4, s3 1554; VI-NEXT: v_mov_b32_e32 v0, s0 1555; VI-NEXT: v_mov_b32_e32 v3, s5 1556; VI-NEXT: v_mov_b32_e32 v1, s1 1557; VI-NEXT: v_mov_b32_e32 v5, s2 1558; VI-NEXT: flat_store_byte v[2:3], v4 1559; VI-NEXT: flat_store_dword v[0:1], v5 1560; VI-NEXT: s_endpgm 1561; 1562; GFX9-LABEL: v5i8_arg: 1563; GFX9: ; %bb.0: ; %entry 1564; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1565; GFX9-NEXT: v_mov_b32_e32 v0, 0 1566; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1567; GFX9-NEXT: v_mov_b32_e32 v1, s3 1568; GFX9-NEXT: v_mov_b32_e32 v2, s2 1569; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:4 1570; GFX9-NEXT: global_store_dword v0, 
v2, s[0:1] 1571; GFX9-NEXT: s_endpgm 1572; 1573; EG-LABEL: v5i8_arg: 1574; EG: ; %bb.0: ; %entry 1575; EG-NEXT: ALU 0, @16, KC0[], KC1[] 1576; EG-NEXT: TEX 4 @6 1577; EG-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[] 1578; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X 1579; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1 1580; EG-NEXT: CF_END 1581; EG-NEXT: Fetch clause starting at 6: 1582; EG-NEXT: VTX_READ_8 T6.X, T5.X, 44, #3 1583; EG-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3 1584; EG-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3 1585; EG-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3 1586; EG-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3 1587; EG-NEXT: ALU clause starting at 16: 1588; EG-NEXT: MOV * T5.X, 0.0, 1589; EG-NEXT: ALU clause starting at 17: 1590; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1591; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1592; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 1593; EG-NEXT: AND_INT * T2.W, T5.X, literal.y, 1594; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43) 1595; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1596; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1597; EG-NEXT: LSHL T5.X, T2.W, PV.W, 1598; EG-NEXT: LSHL * T5.W, literal.x, PV.W, 1599; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1600; EG-NEXT: MOV T5.Y, 0.0, 1601; EG-NEXT: MOV T5.Z, 0.0, 1602; EG-NEXT: AND_INT T1.W, T9.X, literal.x, 1603; EG-NEXT: AND_INT * T0.Z, T8.X, literal.x, 1604; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1605; EG-NEXT: LSHL T1.W, PV.W, literal.x, 1606; EG-NEXT: LSHL * T2.W, T7.X, literal.y, 1607; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44) 1608; EG-NEXT: OR_INT T1.W, PS, PV.W, 1609; EG-NEXT: LSHL * T2.W, T0.Z, literal.x, 1610; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1611; EG-NEXT: OR_INT T1.W, PV.W, PS, 1612; EG-NEXT: AND_INT * T2.W, T6.X, literal.x, 1613; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1614; EG-NEXT: OR_INT T6.X, PV.W, PS, 1615; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1616; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1617; EG-NEXT: LSHR * T8.X, T0.W, literal.x, 1618; EG-NEXT: 
2(2.802597e-45), 0(0.000000e+00) 1619; 1620; CM-LABEL: v5i8_arg: 1621; CM: ; %bb.0: ; %entry 1622; CM-NEXT: ALU 0, @16, KC0[], KC1[] 1623; CM-NEXT: TEX 4 @6 1624; CM-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[] 1625; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T8.X 1626; CM-NEXT: MEM_RAT MSKOR T5.XW, T7.X 1627; CM-NEXT: CF_END 1628; CM-NEXT: Fetch clause starting at 6: 1629; CM-NEXT: VTX_READ_8 T6.X, T5.X, 44, #3 1630; CM-NEXT: VTX_READ_8 T7.X, T5.X, 47, #3 1631; CM-NEXT: VTX_READ_8 T8.X, T5.X, 45, #3 1632; CM-NEXT: VTX_READ_8 T9.X, T5.X, 46, #3 1633; CM-NEXT: VTX_READ_8 T5.X, T5.X, 48, #3 1634; CM-NEXT: ALU clause starting at 16: 1635; CM-NEXT: MOV * T5.X, 0.0, 1636; CM-NEXT: ALU clause starting at 17: 1637; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1638; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1639; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 1640; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1641; CM-NEXT: AND_INT T0.Z, T5.X, literal.x, 1642; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 1643; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45) 1644; CM-NEXT: LSHL T5.X, PV.Z, PV.W, 1645; CM-NEXT: LSHL * T5.W, literal.x, PV.W, 1646; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1647; CM-NEXT: MOV T5.Y, 0.0, 1648; CM-NEXT: MOV T5.Z, 0.0, 1649; CM-NEXT: AND_INT * T1.W, T9.X, literal.x, 1650; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1651; CM-NEXT: AND_INT T0.Y, T8.X, literal.x, 1652; CM-NEXT: LSHL T0.Z, PV.W, literal.y, 1653; CM-NEXT: LSHL * T1.W, T7.X, literal.z, BS:VEC_120/SCL_212 1654; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44) 1655; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) 1656; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z, 1657; CM-NEXT: LSHL * T1.W, PV.Y, literal.x, 1658; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1659; CM-NEXT: LSHR T7.X, T0.W, literal.x, 1660; CM-NEXT: OR_INT T0.Z, PV.Z, PV.W, 1661; CM-NEXT: AND_INT * T0.W, T6.X, literal.y, 1662; CM-NEXT: 2(2.802597e-45), 255(3.573311e-43) 1663; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W, 1664; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, 
1665; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1666entry: 1667 store <5 x i8> %in, ptr addrspace(1) %out, align 4 1668 ret void 1669} 1670 1671define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16> %in) nounwind { 1672; SI-LABEL: v5i16_arg: 1673; SI: ; %bb.0: ; %entry 1674; SI-NEXT: s_load_dword s6, s[4:5], 0xf 1675; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1676; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1677; SI-NEXT: s_mov_b32 s3, 0xf000 1678; SI-NEXT: s_mov_b32 s2, -1 1679; SI-NEXT: s_waitcnt lgkmcnt(0) 1680; SI-NEXT: v_mov_b32_e32 v0, s6 1681; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8 1682; SI-NEXT: s_waitcnt expcnt(0) 1683; SI-NEXT: v_mov_b32_e32 v0, s4 1684; SI-NEXT: v_mov_b32_e32 v1, s5 1685; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1686; SI-NEXT: s_endpgm 1687; 1688; VI-LABEL: v5i16_arg: 1689; VI: ; %bb.0: ; %entry 1690; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1691; VI-NEXT: s_load_dword s6, s[4:5], 0x3c 1692; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 1693; VI-NEXT: s_waitcnt lgkmcnt(0) 1694; VI-NEXT: s_add_u32 s4, s0, 8 1695; VI-NEXT: s_addc_u32 s5, s1, 0 1696; VI-NEXT: v_mov_b32_e32 v2, s4 1697; VI-NEXT: v_mov_b32_e32 v4, s6 1698; VI-NEXT: v_mov_b32_e32 v3, s5 1699; VI-NEXT: v_mov_b32_e32 v0, s0 1700; VI-NEXT: flat_store_short v[2:3], v4 1701; VI-NEXT: v_mov_b32_e32 v2, s2 1702; VI-NEXT: v_mov_b32_e32 v1, s1 1703; VI-NEXT: v_mov_b32_e32 v3, s3 1704; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1705; VI-NEXT: s_endpgm 1706; 1707; GFX9-LABEL: v5i16_arg: 1708; GFX9: ; %bb.0: ; %entry 1709; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 1710; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 1711; GFX9-NEXT: v_mov_b32_e32 v2, 0 1712; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1713; GFX9-NEXT: v_mov_b32_e32 v3, s2 1714; GFX9-NEXT: v_mov_b32_e32 v0, s0 1715; GFX9-NEXT: v_mov_b32_e32 v1, s1 1716; GFX9-NEXT: global_store_short v2, v3, s[4:5] offset:8 1717; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 1718; 
GFX9-NEXT: s_endpgm 1719; 1720; EG-LABEL: v5i16_arg: 1721; EG: ; %bb.0: ; %entry 1722; EG-NEXT: ALU 0, @20, KC0[], KC1[] 1723; EG-NEXT: TEX 4 @10 1724; EG-NEXT: ALU 65, @21, KC0[CB0:0-32], KC1[] 1725; EG-NEXT: MEM_RAT MSKOR T5.XW, T9.X 1726; EG-NEXT: MEM_RAT MSKOR T4.XW, T7.X 1727; EG-NEXT: MEM_RAT MSKOR T3.XW, T2.X 1728; EG-NEXT: MEM_RAT MSKOR T6.XW, T1.X 1729; EG-NEXT: MEM_RAT MSKOR T8.XW, T0.X 1730; EG-NEXT: CF_END 1731; EG-NEXT: PAD 1732; EG-NEXT: Fetch clause starting at 10: 1733; EG-NEXT: VTX_READ_16 T1.X, T0.X, 58, #3 1734; EG-NEXT: VTX_READ_16 T2.X, T0.X, 56, #3 1735; EG-NEXT: VTX_READ_16 T3.X, T0.X, 54, #3 1736; EG-NEXT: VTX_READ_16 T4.X, T0.X, 52, #3 1737; EG-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3 1738; EG-NEXT: ALU clause starting at 20: 1739; EG-NEXT: MOV * T0.X, 0.0, 1740; EG-NEXT: ALU clause starting at 21: 1741; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1742; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1743; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 1744; EG-NEXT: AND_INT * T2.W, T0.X, literal.y, 1745; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1746; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1747; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1748; EG-NEXT: LSHL T5.X, T2.W, PV.W, 1749; EG-NEXT: LSHL * T5.W, literal.x, PV.W, 1750; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1751; EG-NEXT: MOV T5.Y, 0.0, 1752; EG-NEXT: AND_INT T1.W, KC0[2].Y, literal.x, 1753; EG-NEXT: AND_INT * T2.W, T4.X, literal.y, 1754; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1755; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1756; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1757; EG-NEXT: LSHL T4.X, T2.W, PV.W, 1758; EG-NEXT: LSHL * T4.W, literal.x, PV.W, 1759; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1760; EG-NEXT: MOV T4.Y, 0.0, 1761; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 1762; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1763; EG-NEXT: AND_INT T2.W, PV.W, literal.x, 1764; EG-NEXT: AND_INT * T3.W, T3.X, literal.y, 1765; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1766; 
EG-NEXT: LSHL * T2.W, PV.W, literal.x, 1767; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1768; EG-NEXT: LSHL T3.X, T3.W, PV.W, 1769; EG-NEXT: LSHL * T3.W, literal.x, PV.W, 1770; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1771; EG-NEXT: MOV T3.Y, 0.0, 1772; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 1773; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1774; EG-NEXT: AND_INT T6.W, PV.W, literal.x, 1775; EG-NEXT: AND_INT * T7.W, T2.X, literal.y, 1776; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1777; EG-NEXT: LSHL * T6.W, PV.W, literal.x, 1778; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1779; EG-NEXT: LSHL T6.X, T7.W, PV.W, 1780; EG-NEXT: LSHL * T6.W, literal.x, PV.W, 1781; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1782; EG-NEXT: MOV T6.Y, 0.0, 1783; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.x, 1784; EG-NEXT: 6(8.407791e-45), 0(0.000000e+00) 1785; EG-NEXT: AND_INT T8.W, PV.W, literal.x, 1786; EG-NEXT: AND_INT * T9.W, T1.X, literal.y, 1787; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) 1788; EG-NEXT: LSHL * T8.W, PV.W, literal.x, 1789; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1790; EG-NEXT: LSHL T8.X, T9.W, PV.W, 1791; EG-NEXT: LSHL * T8.W, literal.x, PV.W, 1792; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1793; EG-NEXT: MOV T8.Y, 0.0, 1794; EG-NEXT: MOV T5.Z, 0.0, 1795; EG-NEXT: MOV * T4.Z, 0.0, 1796; EG-NEXT: MOV T3.Z, 0.0, 1797; EG-NEXT: MOV * T6.Z, 0.0, 1798; EG-NEXT: MOV * T8.Z, 0.0, 1799; EG-NEXT: LSHR T0.X, T7.W, literal.x, 1800; EG-NEXT: LSHR * T1.X, T2.W, literal.x, 1801; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1802; EG-NEXT: LSHR T2.X, T1.W, literal.x, 1803; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1804; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1805; EG-NEXT: LSHR * T9.X, T0.W, literal.x, 1806; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1807; 1808; CM-LABEL: v5i16_arg: 1809; CM: ; %bb.0: ; %entry 1810; CM-NEXT: ALU 0, @20, KC0[], KC1[] 1811; CM-NEXT: TEX 4 @10 1812; CM-NEXT: ALU 67, @21, KC0[CB0:0-32], KC1[] 1813; CM-NEXT: MEM_RAT MSKOR T5.XW, 
T9.X 1814; CM-NEXT: MEM_RAT MSKOR T4.XW, T7.X 1815; CM-NEXT: MEM_RAT MSKOR T3.XW, T2.X 1816; CM-NEXT: MEM_RAT MSKOR T6.XW, T1.X 1817; CM-NEXT: MEM_RAT MSKOR T8.XW, T0.X 1818; CM-NEXT: CF_END 1819; CM-NEXT: PAD 1820; CM-NEXT: Fetch clause starting at 10: 1821; CM-NEXT: VTX_READ_16 T1.X, T0.X, 58, #3 1822; CM-NEXT: VTX_READ_16 T2.X, T0.X, 56, #3 1823; CM-NEXT: VTX_READ_16 T3.X, T0.X, 54, #3 1824; CM-NEXT: VTX_READ_16 T4.X, T0.X, 52, #3 1825; CM-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3 1826; CM-NEXT: ALU clause starting at 20: 1827; CM-NEXT: MOV * T0.X, 0.0, 1828; CM-NEXT: ALU clause starting at 21: 1829; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1830; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1831; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 1832; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1833; CM-NEXT: AND_INT T0.Z, T0.X, literal.x, 1834; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 1835; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1836; CM-NEXT: LSHL T5.X, PV.Z, PV.W, 1837; CM-NEXT: LSHL * T5.W, literal.x, PV.W, 1838; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1839; CM-NEXT: MOV T5.Y, 0.0, 1840; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1841; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1842; CM-NEXT: AND_INT T0.Z, T4.X, literal.x, 1843; CM-NEXT: LSHL * T1.W, PV.W, literal.y, 1844; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1845; CM-NEXT: LSHL T4.X, PV.Z, PV.W, 1846; CM-NEXT: LSHL * T4.W, literal.x, PV.W, 1847; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1848; CM-NEXT: MOV T4.Y, 0.0, 1849; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 1850; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1851; CM-NEXT: AND_INT * T2.W, PV.W, literal.x, 1852; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1853; CM-NEXT: AND_INT T0.Z, T3.X, literal.x, 1854; CM-NEXT: LSHL * T2.W, PV.W, literal.y, 1855; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1856; CM-NEXT: LSHL T3.X, PV.Z, PV.W, 1857; CM-NEXT: LSHL * T3.W, literal.x, PV.W, 1858; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1859; 
CM-NEXT: MOV T3.Y, 0.0, 1860; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 1861; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1862; CM-NEXT: AND_INT * T6.W, PV.W, literal.x, 1863; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1864; CM-NEXT: AND_INT T0.Z, T2.X, literal.x, 1865; CM-NEXT: LSHL * T6.W, PV.W, literal.y, 1866; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1867; CM-NEXT: LSHL T6.X, PV.Z, PV.W, 1868; CM-NEXT: LSHL * T6.W, literal.x, PV.W, 1869; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1870; CM-NEXT: MOV T6.Y, 0.0, 1871; CM-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.x, 1872; CM-NEXT: 6(8.407791e-45), 0(0.000000e+00) 1873; CM-NEXT: AND_INT * T8.W, PV.W, literal.x, 1874; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1875; CM-NEXT: AND_INT T0.Z, T1.X, literal.x, 1876; CM-NEXT: LSHL * T8.W, PV.W, literal.y, 1877; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1878; CM-NEXT: LSHL T8.X, PV.Z, PV.W, 1879; CM-NEXT: LSHL * T8.W, literal.x, PV.W, 1880; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1881; CM-NEXT: MOV T8.Y, 0.0, 1882; CM-NEXT: MOV * T5.Z, 0.0, 1883; CM-NEXT: MOV * T4.Z, 0.0, 1884; CM-NEXT: MOV * T3.Z, 0.0, 1885; CM-NEXT: MOV * T6.Z, 0.0, 1886; CM-NEXT: MOV * T8.Z, 0.0, 1887; CM-NEXT: LSHR * T0.X, T7.W, literal.x, 1888; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1889; CM-NEXT: LSHR * T1.X, T2.W, literal.x, 1890; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1891; CM-NEXT: LSHR * T2.X, T1.W, literal.x, 1892; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1893; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 1894; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1895; CM-NEXT: LSHR * T9.X, T0.W, literal.x, 1896; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1897entry: 1898 store <5 x i16> %in, ptr addrspace(1) %out, align 4 1899 ret void 1900} 1901 1902define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32> %in) nounwind { 1903; SI-LABEL: v5i32_arg: 1904; SI: ; %bb.0: ; %entry 1905; SI-NEXT: s_load_dword s8, s[4:5], 0x15 1906; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x9 1907; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x11 1908; SI-NEXT: s_mov_b32 s3, 0xf000 1909; SI-NEXT: s_mov_b32 s2, -1 1910; SI-NEXT: s_waitcnt lgkmcnt(0) 1911; SI-NEXT: v_mov_b32_e32 v0, s8 1912; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16 1913; SI-NEXT: s_waitcnt expcnt(0) 1914; SI-NEXT: v_mov_b32_e32 v0, s4 1915; SI-NEXT: v_mov_b32_e32 v1, s5 1916; SI-NEXT: v_mov_b32_e32 v2, s6 1917; SI-NEXT: v_mov_b32_e32 v3, s7 1918; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1919; SI-NEXT: s_endpgm 1920; 1921; VI-LABEL: v5i32_arg: 1922; VI: ; %bb.0: ; %entry 1923; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 1924; VI-NEXT: s_load_dword s8, s[4:5], 0x54 1925; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 1926; VI-NEXT: s_waitcnt lgkmcnt(0) 1927; VI-NEXT: s_add_u32 s4, s6, 16 1928; VI-NEXT: s_addc_u32 s5, s7, 0 1929; VI-NEXT: v_mov_b32_e32 v0, s4 1930; VI-NEXT: v_mov_b32_e32 v2, s8 1931; VI-NEXT: v_mov_b32_e32 v1, s5 1932; VI-NEXT: v_mov_b32_e32 v4, s6 1933; VI-NEXT: flat_store_dword v[0:1], v2 1934; VI-NEXT: v_mov_b32_e32 v0, s0 1935; VI-NEXT: v_mov_b32_e32 v5, s7 1936; VI-NEXT: v_mov_b32_e32 v1, s1 1937; VI-NEXT: v_mov_b32_e32 v2, s2 1938; VI-NEXT: v_mov_b32_e32 v3, s3 1939; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1940; VI-NEXT: s_endpgm 1941; 1942; GFX9-LABEL: v5i32_arg: 1943; GFX9: ; %bb.0: ; %entry 1944; GFX9-NEXT: s_load_dword s6, s[8:9], 0x30 1945; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 1946; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 1947; GFX9-NEXT: v_mov_b32_e32 v4, 0 1948; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1949; GFX9-NEXT: v_mov_b32_e32 v5, s6 1950; GFX9-NEXT: v_mov_b32_e32 v0, s0 1951; GFX9-NEXT: v_mov_b32_e32 v1, s1 1952; GFX9-NEXT: v_mov_b32_e32 v2, s2 1953; GFX9-NEXT: v_mov_b32_e32 v3, s3 1954; GFX9-NEXT: global_store_dword v4, v5, s[4:5] offset:16 1955; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] 1956; GFX9-NEXT: s_endpgm 1957; 1958; EG-LABEL: v5i32_arg: 1959; EG: ; %bb.0: ; %entry 1960; EG-NEXT: ALU 10, @4, 
KC0[CB0:0-32], KC1[] 1961; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 1962; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 1963; EG-NEXT: CF_END 1964; EG-NEXT: ALU clause starting at 4: 1965; EG-NEXT: MOV * T0.W, KC0[5].X, 1966; EG-NEXT: MOV * T0.Z, KC0[4].W, 1967; EG-NEXT: MOV * T0.Y, KC0[4].Z, 1968; EG-NEXT: MOV T0.X, KC0[4].Y, 1969; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1970; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1971; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 1972; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1973; EG-NEXT: LSHR T2.X, PV.W, literal.x, 1974; EG-NEXT: MOV * T3.X, KC0[5].Y, 1975; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1976; 1977; CM-LABEL: v5i32_arg: 1978; CM: ; %bb.0: ; %entry 1979; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 1980; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X 1981; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X 1982; CM-NEXT: CF_END 1983; CM-NEXT: ALU clause starting at 4: 1984; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x, 1985; CM-NEXT: MOV * T0.W, KC0[5].X, 1986; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1987; CM-NEXT: LSHR T1.X, PV.Z, literal.x, 1988; CM-NEXT: MOV * T0.Z, KC0[4].W, 1989; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1990; CM-NEXT: MOV T2.X, KC0[5].Y, 1991; CM-NEXT: MOV * T0.Y, KC0[4].Z, 1992; CM-NEXT: MOV * T0.X, KC0[4].Y, 1993; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 1994; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1995entry: 1996 store <5 x i32> %in, ptr addrspace(1) %out, align 4 1997 ret void 1998} 1999 2000define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float> %in) nounwind { 2001; SI-LABEL: v5f32_arg: 2002; SI: ; %bb.0: ; %entry 2003; SI-NEXT: s_load_dword s8, s[4:5], 0x15 2004; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2005; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x11 2006; SI-NEXT: s_mov_b32 s3, 0xf000 2007; SI-NEXT: s_mov_b32 s2, -1 2008; SI-NEXT: s_waitcnt lgkmcnt(0) 2009; SI-NEXT: v_mov_b32_e32 v0, s8 2010; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 offset:16 2011; SI-NEXT: s_waitcnt expcnt(0) 2012; SI-NEXT: v_mov_b32_e32 v0, s4 2013; SI-NEXT: v_mov_b32_e32 v1, s5 2014; SI-NEXT: v_mov_b32_e32 v2, s6 2015; SI-NEXT: v_mov_b32_e32 v3, s7 2016; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2017; SI-NEXT: s_endpgm 2018; 2019; VI-LABEL: v5f32_arg: 2020; VI: ; %bb.0: ; %entry 2021; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 2022; VI-NEXT: s_load_dword s8, s[4:5], 0x54 2023; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 2024; VI-NEXT: s_waitcnt lgkmcnt(0) 2025; VI-NEXT: s_add_u32 s4, s6, 16 2026; VI-NEXT: s_addc_u32 s5, s7, 0 2027; VI-NEXT: v_mov_b32_e32 v1, s4 2028; VI-NEXT: v_mov_b32_e32 v3, s8 2029; VI-NEXT: v_mov_b32_e32 v2, s5 2030; VI-NEXT: v_mov_b32_e32 v4, s6 2031; VI-NEXT: v_mov_b32_e32 v0, s0 2032; VI-NEXT: flat_store_dword v[1:2], v3 2033; VI-NEXT: v_mov_b32_e32 v1, s1 2034; VI-NEXT: v_mov_b32_e32 v2, s2 2035; VI-NEXT: v_mov_b32_e32 v3, s3 2036; VI-NEXT: v_mov_b32_e32 v5, s7 2037; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2038; VI-NEXT: s_endpgm 2039; 2040; GFX9-LABEL: v5f32_arg: 2041; GFX9: ; %bb.0: ; %entry 2042; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 2043; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2044; GFX9-NEXT: s_load_dword s6, s[8:9], 0x30 2045; GFX9-NEXT: v_mov_b32_e32 v4, 0 2046; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2047; GFX9-NEXT: v_mov_b32_e32 v0, s0 2048; GFX9-NEXT: v_mov_b32_e32 v1, s1 2049; GFX9-NEXT: v_mov_b32_e32 v2, s2 2050; GFX9-NEXT: v_mov_b32_e32 v3, s3 2051; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] 2052; GFX9-NEXT: s_nop 0 2053; GFX9-NEXT: v_mov_b32_e32 v0, s6 2054; GFX9-NEXT: global_store_dword v4, v0, s[4:5] offset:16 2055; GFX9-NEXT: s_endpgm 2056; 2057; EG-LABEL: v5f32_arg: 2058; EG: ; %bb.0: ; %entry 2059; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 2060; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 2061; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 2062; EG-NEXT: CF_END 2063; EG-NEXT: ALU clause starting at 4: 2064; EG-NEXT: 
MOV * T0.W, KC0[5].X, 2065; EG-NEXT: MOV * T0.Z, KC0[4].W, 2066; EG-NEXT: MOV * T0.Y, KC0[4].Z, 2067; EG-NEXT: MOV T0.X, KC0[4].Y, 2068; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2069; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2070; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 2071; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2072; EG-NEXT: LSHR T2.X, PV.W, literal.x, 2073; EG-NEXT: MOV * T3.X, KC0[5].Y, 2074; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2075; 2076; CM-LABEL: v5f32_arg: 2077; CM: ; %bb.0: ; %entry 2078; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 2079; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X 2080; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X 2081; CM-NEXT: CF_END 2082; CM-NEXT: ALU clause starting at 4: 2083; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x, 2084; CM-NEXT: MOV * T0.W, KC0[5].X, 2085; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2086; CM-NEXT: LSHR T1.X, PV.Z, literal.x, 2087; CM-NEXT: MOV * T0.Z, KC0[4].W, 2088; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2089; CM-NEXT: MOV T2.X, KC0[5].Y, 2090; CM-NEXT: MOV * T0.Y, KC0[4].Z, 2091; CM-NEXT: MOV * T0.X, KC0[4].Y, 2092; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 2093; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2094entry: 2095 store <5 x float> %in, ptr addrspace(1) %out, align 4 2096 ret void 2097} 2098 2099define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64> %in) nounwind { 2100; SI-LABEL: v5i64_arg: 2101; SI: ; %bb.0: ; %entry 2102; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x19 2103; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2104; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x21 2105; SI-NEXT: s_mov_b32 s3, 0xf000 2106; SI-NEXT: s_mov_b32 s2, -1 2107; SI-NEXT: s_waitcnt lgkmcnt(0) 2108; SI-NEXT: v_mov_b32_e32 v0, s12 2109; SI-NEXT: v_mov_b32_e32 v1, s13 2110; SI-NEXT: v_mov_b32_e32 v2, s14 2111; SI-NEXT: v_mov_b32_e32 v3, s15 2112; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 2113; SI-NEXT: s_waitcnt expcnt(0) 2114; SI-NEXT: v_mov_b32_e32 
v0, s8 2115; SI-NEXT: v_mov_b32_e32 v1, s9 2116; SI-NEXT: v_mov_b32_e32 v2, s10 2117; SI-NEXT: v_mov_b32_e32 v3, s11 2118; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2119; SI-NEXT: s_waitcnt expcnt(0) 2120; SI-NEXT: v_mov_b32_e32 v0, s4 2121; SI-NEXT: v_mov_b32_e32 v1, s5 2122; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32 2123; SI-NEXT: s_endpgm 2124; 2125; VI-LABEL: v5i64_arg: 2126; VI: ; %bb.0: ; %entry 2127; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 2128; VI-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x84 2129; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64 2130; VI-NEXT: s_waitcnt lgkmcnt(0) 2131; VI-NEXT: s_add_u32 s12, s8, 32 2132; VI-NEXT: v_mov_b32_e32 v1, s10 2133; VI-NEXT: s_addc_u32 s13, s9, 0 2134; VI-NEXT: v_mov_b32_e32 v3, s12 2135; VI-NEXT: v_mov_b32_e32 v2, s11 2136; VI-NEXT: v_mov_b32_e32 v0, s4 2137; VI-NEXT: v_mov_b32_e32 v4, s13 2138; VI-NEXT: s_add_u32 s4, s8, 16 2139; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] 2140; VI-NEXT: v_mov_b32_e32 v1, s5 2141; VI-NEXT: s_addc_u32 s5, s9, 0 2142; VI-NEXT: v_mov_b32_e32 v4, s4 2143; VI-NEXT: v_mov_b32_e32 v2, s6 2144; VI-NEXT: v_mov_b32_e32 v3, s7 2145; VI-NEXT: v_mov_b32_e32 v5, s5 2146; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2147; VI-NEXT: v_mov_b32_e32 v4, s8 2148; VI-NEXT: v_mov_b32_e32 v0, s0 2149; VI-NEXT: v_mov_b32_e32 v1, s1 2150; VI-NEXT: v_mov_b32_e32 v2, s2 2151; VI-NEXT: v_mov_b32_e32 v3, s3 2152; VI-NEXT: v_mov_b32_e32 v5, s9 2153; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2154; VI-NEXT: s_endpgm 2155; 2156; GFX9-LABEL: v5i64_arg: 2157; GFX9: ; %bb.0: ; %entry 2158; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x60 2159; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x40 2160; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 2161; GFX9-NEXT: v_mov_b32_e32 v4, 0 2162; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2163; GFX9-NEXT: v_mov_b32_e32 v1, s10 2164; GFX9-NEXT: v_mov_b32_e32 v2, s11 2165; GFX9-NEXT: v_mov_b32_e32 v0, s4 2166; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], 
s[12:13] offset:32 2167; GFX9-NEXT: v_mov_b32_e32 v1, s5 2168; GFX9-NEXT: v_mov_b32_e32 v2, s6 2169; GFX9-NEXT: v_mov_b32_e32 v3, s7 2170; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 2171; GFX9-NEXT: s_nop 0 2172; GFX9-NEXT: v_mov_b32_e32 v0, s0 2173; GFX9-NEXT: v_mov_b32_e32 v1, s1 2174; GFX9-NEXT: v_mov_b32_e32 v2, s2 2175; GFX9-NEXT: v_mov_b32_e32 v3, s3 2176; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] 2177; GFX9-NEXT: s_endpgm 2178; 2179; EG-LABEL: v5i64_arg: 2180; EG: ; %bb.0: ; %entry 2181; EG-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[] 2182; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0 2183; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 2184; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 2185; EG-NEXT: CF_END 2186; EG-NEXT: PAD 2187; EG-NEXT: ALU clause starting at 6: 2188; EG-NEXT: MOV * T0.W, KC0[7].X, 2189; EG-NEXT: MOV * T0.Z, KC0[6].W, 2190; EG-NEXT: MOV T0.Y, KC0[6].Z, 2191; EG-NEXT: MOV * T1.W, KC0[8].X, 2192; EG-NEXT: MOV T0.X, KC0[6].Y, 2193; EG-NEXT: MOV * T1.Z, KC0[7].W, 2194; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 2195; EG-NEXT: MOV * T1.Y, KC0[7].Z, 2196; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2197; EG-NEXT: MOV T1.X, KC0[7].Y, 2198; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 2199; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2200; EG-NEXT: LSHR T3.X, PV.W, literal.x, 2201; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, 2202; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 2203; EG-NEXT: LSHR T4.X, PV.W, literal.x, 2204; EG-NEXT: MOV T5.Y, KC0[8].Z, 2205; EG-NEXT: MOV * T5.X, KC0[8].Y, 2206; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2207; 2208; CM-LABEL: v5i64_arg: 2209; CM: ; %bb.0: ; %entry 2210; CM-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[] 2211; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X 2212; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T4.X 2213; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X 2214; CM-NEXT: CF_END 2215; CM-NEXT: PAD 2216; CM-NEXT: ALU clause starting at 6: 2217; CM-NEXT: MOV * 
T0.W, KC0[8].X, 2218; CM-NEXT: MOV T1.Y, KC0[8].Z, 2219; CM-NEXT: MOV * T0.Z, KC0[7].W, 2220; CM-NEXT: MOV T1.X, KC0[8].Y, 2221; CM-NEXT: MOV * T0.Y, KC0[7].Z, 2222; CM-NEXT: MOV T0.X, KC0[7].Y, 2223; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, 2224; CM-NEXT: MOV * T2.W, KC0[7].X, 2225; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) 2226; CM-NEXT: LSHR T3.X, PV.Z, literal.x, 2227; CM-NEXT: MOV T2.Z, KC0[6].W, 2228; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, 2229; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) 2230; CM-NEXT: LSHR T4.X, PV.W, literal.x, 2231; CM-NEXT: MOV * T2.Y, KC0[6].Z, 2232; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2233; CM-NEXT: MOV * T2.X, KC0[6].Y, 2234; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 2235; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2236entry: 2237 store <5 x i64> %in, ptr addrspace(1) %out, align 8 2238 ret void 2239} 2240 2241define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x double> %in) nounwind { 2242; SI-LABEL: v5f64_arg: 2243; SI: ; %bb.0: ; %entry 2244; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x19 2245; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2246; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x21 2247; SI-NEXT: s_mov_b32 s3, 0xf000 2248; SI-NEXT: s_mov_b32 s2, -1 2249; SI-NEXT: s_waitcnt lgkmcnt(0) 2250; SI-NEXT: v_mov_b32_e32 v0, s12 2251; SI-NEXT: v_mov_b32_e32 v1, s13 2252; SI-NEXT: v_mov_b32_e32 v2, s14 2253; SI-NEXT: v_mov_b32_e32 v3, s15 2254; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 2255; SI-NEXT: s_waitcnt expcnt(0) 2256; SI-NEXT: v_mov_b32_e32 v0, s8 2257; SI-NEXT: v_mov_b32_e32 v1, s9 2258; SI-NEXT: v_mov_b32_e32 v2, s10 2259; SI-NEXT: v_mov_b32_e32 v3, s11 2260; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2261; SI-NEXT: s_waitcnt expcnt(0) 2262; SI-NEXT: v_mov_b32_e32 v0, s4 2263; SI-NEXT: v_mov_b32_e32 v1, s5 2264; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:32 2265; SI-NEXT: s_endpgm 2266; 2267; VI-LABEL: v5f64_arg: 2268; VI: ; %bb.0: ; %entry 
2269; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 2270; VI-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x84 2271; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x64 2272; VI-NEXT: s_waitcnt lgkmcnt(0) 2273; VI-NEXT: s_add_u32 s12, s8, 32 2274; VI-NEXT: v_mov_b32_e32 v1, s10 2275; VI-NEXT: s_addc_u32 s13, s9, 0 2276; VI-NEXT: v_mov_b32_e32 v3, s12 2277; VI-NEXT: v_mov_b32_e32 v2, s11 2278; VI-NEXT: v_mov_b32_e32 v0, s4 2279; VI-NEXT: v_mov_b32_e32 v4, s13 2280; VI-NEXT: s_add_u32 s4, s8, 16 2281; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] 2282; VI-NEXT: v_mov_b32_e32 v1, s5 2283; VI-NEXT: s_addc_u32 s5, s9, 0 2284; VI-NEXT: v_mov_b32_e32 v4, s4 2285; VI-NEXT: v_mov_b32_e32 v2, s6 2286; VI-NEXT: v_mov_b32_e32 v3, s7 2287; VI-NEXT: v_mov_b32_e32 v5, s5 2288; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2289; VI-NEXT: v_mov_b32_e32 v4, s8 2290; VI-NEXT: v_mov_b32_e32 v0, s0 2291; VI-NEXT: v_mov_b32_e32 v1, s1 2292; VI-NEXT: v_mov_b32_e32 v2, s2 2293; VI-NEXT: v_mov_b32_e32 v3, s3 2294; VI-NEXT: v_mov_b32_e32 v5, s9 2295; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2296; VI-NEXT: s_endpgm 2297; 2298; GFX9-LABEL: v5f64_arg: 2299; GFX9: ; %bb.0: ; %entry 2300; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x60 2301; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x40 2302; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 2303; GFX9-NEXT: v_mov_b32_e32 v4, 0 2304; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2305; GFX9-NEXT: v_mov_b32_e32 v1, s10 2306; GFX9-NEXT: v_mov_b32_e32 v2, s11 2307; GFX9-NEXT: v_mov_b32_e32 v0, s4 2308; GFX9-NEXT: global_store_dwordx2 v4, v[1:2], s[12:13] offset:32 2309; GFX9-NEXT: v_mov_b32_e32 v1, s5 2310; GFX9-NEXT: v_mov_b32_e32 v2, s6 2311; GFX9-NEXT: v_mov_b32_e32 v3, s7 2312; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 2313; GFX9-NEXT: s_nop 0 2314; GFX9-NEXT: v_mov_b32_e32 v0, s0 2315; GFX9-NEXT: v_mov_b32_e32 v1, s1 2316; GFX9-NEXT: v_mov_b32_e32 v2, s2 2317; GFX9-NEXT: v_mov_b32_e32 v3, s3 2318; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] 2319; 
GFX9-NEXT: s_endpgm 2320; 2321; EG-LABEL: v5f64_arg: 2322; EG: ; %bb.0: ; %entry 2323; EG-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[] 2324; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0 2325; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 2326; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 2327; EG-NEXT: CF_END 2328; EG-NEXT: PAD 2329; EG-NEXT: ALU clause starting at 6: 2330; EG-NEXT: MOV * T0.W, KC0[7].X, 2331; EG-NEXT: MOV * T0.Z, KC0[6].W, 2332; EG-NEXT: MOV T0.Y, KC0[6].Z, 2333; EG-NEXT: MOV * T1.W, KC0[8].X, 2334; EG-NEXT: MOV T0.X, KC0[6].Y, 2335; EG-NEXT: MOV * T1.Z, KC0[7].W, 2336; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 2337; EG-NEXT: MOV * T1.Y, KC0[7].Z, 2338; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2339; EG-NEXT: MOV T1.X, KC0[7].Y, 2340; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 2341; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2342; EG-NEXT: LSHR T3.X, PV.W, literal.x, 2343; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, 2344; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 2345; EG-NEXT: LSHR T4.X, PV.W, literal.x, 2346; EG-NEXT: MOV T5.Y, KC0[8].Z, 2347; EG-NEXT: MOV * T5.X, KC0[8].Y, 2348; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2349; 2350; CM-LABEL: v5f64_arg: 2351; CM: ; %bb.0: ; %entry 2352; CM-NEXT: ALU 18, @6, KC0[CB0:0-32], KC1[] 2353; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X 2354; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T4.X 2355; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X 2356; CM-NEXT: CF_END 2357; CM-NEXT: PAD 2358; CM-NEXT: ALU clause starting at 6: 2359; CM-NEXT: MOV * T0.W, KC0[8].X, 2360; CM-NEXT: MOV T1.Y, KC0[8].Z, 2361; CM-NEXT: MOV * T0.Z, KC0[7].W, 2362; CM-NEXT: MOV T1.X, KC0[8].Y, 2363; CM-NEXT: MOV * T0.Y, KC0[7].Z, 2364; CM-NEXT: MOV T0.X, KC0[7].Y, 2365; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, 2366; CM-NEXT: MOV * T2.W, KC0[7].X, 2367; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) 2368; CM-NEXT: LSHR T3.X, PV.Z, literal.x, 2369; CM-NEXT: MOV T2.Z, KC0[6].W, 2370; CM-NEXT: ADD_INT * T1.W, 
KC0[2].Y, literal.y, 2371; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) 2372; CM-NEXT: LSHR T4.X, PV.W, literal.x, 2373; CM-NEXT: MOV * T2.Y, KC0[6].Z, 2374; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2375; CM-NEXT: MOV * T2.X, KC0[6].Y, 2376; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, 2377; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2378entry: 2379 store <5 x double> %in, ptr addrspace(1) %out, align 8 2380 ret void 2381} 2382 2383; FIXME: Lots of unpack and re-pack junk on VI 2384define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { 2385; SI-LABEL: v8i8_arg: 2386; SI: ; %bb.0: ; %entry 2387; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2388; SI-NEXT: s_mov_b32 s7, 0xf000 2389; SI-NEXT: s_mov_b32 s6, -1 2390; SI-NEXT: s_waitcnt lgkmcnt(0) 2391; SI-NEXT: s_mov_b32 s4, s0 2392; SI-NEXT: s_mov_b32 s5, s1 2393; SI-NEXT: v_mov_b32_e32 v0, s2 2394; SI-NEXT: v_mov_b32_e32 v1, s3 2395; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2396; SI-NEXT: s_endpgm 2397; 2398; VI-LABEL: v8i8_arg: 2399; VI: ; %bb.0: ; %entry 2400; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2401; VI-NEXT: s_waitcnt lgkmcnt(0) 2402; VI-NEXT: v_mov_b32_e32 v0, s0 2403; VI-NEXT: v_mov_b32_e32 v2, s2 2404; VI-NEXT: v_mov_b32_e32 v1, s1 2405; VI-NEXT: v_mov_b32_e32 v3, s3 2406; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 2407; VI-NEXT: s_endpgm 2408; 2409; GFX9-LABEL: v8i8_arg: 2410; GFX9: ; %bb.0: ; %entry 2411; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2412; GFX9-NEXT: v_mov_b32_e32 v2, 0 2413; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2414; GFX9-NEXT: v_mov_b32_e32 v0, s2 2415; GFX9-NEXT: v_mov_b32_e32 v1, s3 2416; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 2417; GFX9-NEXT: s_endpgm 2418; 2419; EG-LABEL: v8i8_arg: 2420; EG: ; %bb.0: ; %entry 2421; EG-NEXT: ALU 1, @36, KC0[], KC1[] 2422; EG-NEXT: TEX 0 @20 2423; EG-NEXT: ALU 5, @38, KC0[], KC1[] 2424; EG-NEXT: TEX 0 @22 2425; EG-NEXT: ALU 5, @44, KC0[], KC1[] 2426; EG-NEXT: TEX 0 @24 2427; EG-NEXT: ALU 7, @50, KC0[], KC1[] 2428; 
EG-NEXT: TEX 0 @26 2429; EG-NEXT: ALU 7, @58, KC0[], KC1[] 2430; EG-NEXT: TEX 0 @28 2431; EG-NEXT: ALU 7, @66, KC0[], KC1[] 2432; EG-NEXT: TEX 0 @30 2433; EG-NEXT: ALU 7, @74, KC0[], KC1[] 2434; EG-NEXT: TEX 0 @32 2435; EG-NEXT: ALU 5, @82, KC0[], KC1[] 2436; EG-NEXT: TEX 0 @34 2437; EG-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[] 2438; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1 2439; EG-NEXT: CF_END 2440; EG-NEXT: PAD 2441; EG-NEXT: Fetch clause starting at 20: 2442; EG-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3 2443; EG-NEXT: Fetch clause starting at 22: 2444; EG-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3 2445; EG-NEXT: Fetch clause starting at 24: 2446; EG-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3 2447; EG-NEXT: Fetch clause starting at 26: 2448; EG-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3 2449; EG-NEXT: Fetch clause starting at 28: 2450; EG-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3 2451; EG-NEXT: Fetch clause starting at 30: 2452; EG-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3 2453; EG-NEXT: Fetch clause starting at 32: 2454; EG-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3 2455; EG-NEXT: Fetch clause starting at 34: 2456; EG-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3 2457; EG-NEXT: ALU clause starting at 36: 2458; EG-NEXT: MOV * T0.Y, T2.X, 2459; EG-NEXT: MOV * T5.X, 0.0, 2460; EG-NEXT: ALU clause starting at 38: 2461; EG-NEXT: LSHL T0.W, T6.X, literal.x, 2462; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2463; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) 2464; EG-NEXT: OR_INT * T0.W, PS, PV.W, 2465; EG-NEXT: MOV T2.X, PV.W, 2466; EG-NEXT: MOV * T0.Y, T3.X, 2467; EG-NEXT: ALU clause starting at 44: 2468; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2469; EG-NEXT: LSHL * T1.W, T6.X, literal.y, 2470; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 2471; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2472; EG-NEXT: MOV T3.X, PV.W, 2473; EG-NEXT: MOV * T0.Y, T2.X, 2474; EG-NEXT: ALU clause starting at 50: 2475; EG-NEXT: AND_INT T0.W, T6.X, literal.x, 2476; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2477; EG-NEXT: 
255(3.573311e-43), -16711681(-1.714704e+38) 2478; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2479; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2480; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 2481; EG-NEXT: MOV T2.X, PV.W, 2482; EG-NEXT: MOV * T0.Y, T3.X, 2483; EG-NEXT: ALU clause starting at 58: 2484; EG-NEXT: AND_INT T0.W, T6.X, literal.x, 2485; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2486; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 2487; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2488; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2489; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 2490; EG-NEXT: MOV T3.X, PV.W, 2491; EG-NEXT: MOV * T0.Y, T2.X, 2492; EG-NEXT: ALU clause starting at 66: 2493; EG-NEXT: AND_INT T0.W, T6.X, literal.x, 2494; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2495; EG-NEXT: 255(3.573311e-43), -65281(nan) 2496; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2497; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2498; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 2499; EG-NEXT: MOV T2.X, PV.W, 2500; EG-NEXT: MOV * T0.Y, T3.X, 2501; EG-NEXT: ALU clause starting at 74: 2502; EG-NEXT: AND_INT T0.W, T6.X, literal.x, 2503; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2504; EG-NEXT: 255(3.573311e-43), -65281(nan) 2505; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2506; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2507; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 2508; EG-NEXT: MOV T3.X, PV.W, 2509; EG-NEXT: MOV * T0.Y, T2.X, 2510; EG-NEXT: ALU clause starting at 82: 2511; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2512; EG-NEXT: AND_INT * T1.W, T6.X, literal.y, 2513; EG-NEXT: -256(nan), 255(3.573311e-43) 2514; EG-NEXT: OR_INT * T5.Y, PV.W, PS, 2515; EG-NEXT: MOV T2.X, PV.Y, 2516; EG-NEXT: MOV * T0.Y, T3.X, 2517; EG-NEXT: ALU clause starting at 88: 2518; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2519; EG-NEXT: AND_INT * T1.W, T5.X, literal.y, 2520; EG-NEXT: -256(nan), 255(3.573311e-43) 2521; EG-NEXT: OR_INT T5.X, PV.W, PS, 2522; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, 2523; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 
2524; 2525; CM-LABEL: v8i8_arg: 2526; CM: ; %bb.0: ; %entry 2527; CM-NEXT: ALU 1, @36, KC0[], KC1[] 2528; CM-NEXT: TEX 0 @20 2529; CM-NEXT: ALU 5, @38, KC0[], KC1[] 2530; CM-NEXT: TEX 0 @22 2531; CM-NEXT: ALU 5, @44, KC0[], KC1[] 2532; CM-NEXT: TEX 0 @24 2533; CM-NEXT: ALU 7, @50, KC0[], KC1[] 2534; CM-NEXT: TEX 0 @26 2535; CM-NEXT: ALU 7, @58, KC0[], KC1[] 2536; CM-NEXT: TEX 0 @28 2537; CM-NEXT: ALU 7, @66, KC0[], KC1[] 2538; CM-NEXT: TEX 0 @30 2539; CM-NEXT: ALU 7, @74, KC0[], KC1[] 2540; CM-NEXT: TEX 0 @32 2541; CM-NEXT: ALU 5, @82, KC0[], KC1[] 2542; CM-NEXT: TEX 0 @34 2543; CM-NEXT: ALU 5, @88, KC0[CB0:0-32], KC1[] 2544; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X 2545; CM-NEXT: CF_END 2546; CM-NEXT: PAD 2547; CM-NEXT: Fetch clause starting at 20: 2548; CM-NEXT: VTX_READ_8 T6.X, T5.X, 51, #3 2549; CM-NEXT: Fetch clause starting at 22: 2550; CM-NEXT: VTX_READ_8 T6.X, T5.X, 47, #3 2551; CM-NEXT: Fetch clause starting at 24: 2552; CM-NEXT: VTX_READ_8 T6.X, T5.X, 50, #3 2553; CM-NEXT: Fetch clause starting at 26: 2554; CM-NEXT: VTX_READ_8 T6.X, T5.X, 46, #3 2555; CM-NEXT: Fetch clause starting at 28: 2556; CM-NEXT: VTX_READ_8 T6.X, T5.X, 49, #3 2557; CM-NEXT: Fetch clause starting at 30: 2558; CM-NEXT: VTX_READ_8 T6.X, T5.X, 45, #3 2559; CM-NEXT: Fetch clause starting at 32: 2560; CM-NEXT: VTX_READ_8 T6.X, T5.X, 48, #3 2561; CM-NEXT: Fetch clause starting at 34: 2562; CM-NEXT: VTX_READ_8 T5.X, T5.X, 44, #3 2563; CM-NEXT: ALU clause starting at 36: 2564; CM-NEXT: MOV * T0.Y, T2.X, 2565; CM-NEXT: MOV * T5.X, 0.0, 2566; CM-NEXT: ALU clause starting at 38: 2567; CM-NEXT: LSHL T0.Z, T6.X, literal.x, 2568; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 2569; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) 2570; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 2571; CM-NEXT: MOV T2.X, PV.W, 2572; CM-NEXT: MOV * T0.Y, T3.X, 2573; CM-NEXT: ALU clause starting at 44: 2574; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2575; CM-NEXT: LSHL * T0.W, T6.X, literal.y, 2576; CM-NEXT: 
16777215(2.350989e-38), 24(3.363116e-44) 2577; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2578; CM-NEXT: MOV T3.X, PV.W, 2579; CM-NEXT: MOV * T0.Y, T2.X, 2580; CM-NEXT: ALU clause starting at 50: 2581; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 2582; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2583; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2584; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 2585; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 2586; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2587; CM-NEXT: MOV T2.X, PV.W, 2588; CM-NEXT: MOV * T0.Y, T3.X, 2589; CM-NEXT: ALU clause starting at 58: 2590; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 2591; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2592; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2593; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 2594; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 2595; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2596; CM-NEXT: MOV T3.X, PV.W, 2597; CM-NEXT: MOV * T0.Y, T2.X, 2598; CM-NEXT: ALU clause starting at 66: 2599; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 2600; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2601; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2602; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 2603; CM-NEXT: -65281(nan), 8(1.121039e-44) 2604; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2605; CM-NEXT: MOV T2.X, PV.W, 2606; CM-NEXT: MOV * T0.Y, T3.X, 2607; CM-NEXT: ALU clause starting at 74: 2608; CM-NEXT: AND_INT * T0.W, T6.X, literal.x, 2609; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2610; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2611; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 2612; CM-NEXT: -65281(nan), 8(1.121039e-44) 2613; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2614; CM-NEXT: MOV T3.X, PV.W, 2615; CM-NEXT: MOV * T0.Y, T2.X, 2616; CM-NEXT: ALU clause starting at 82: 2617; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2618; CM-NEXT: AND_INT * T0.W, T6.X, literal.y, 2619; CM-NEXT: -256(nan), 255(3.573311e-43) 2620; CM-NEXT: OR_INT * T5.Y, PV.Z, PV.W, 2621; CM-NEXT: MOV T2.X, PV.Y, 2622; CM-NEXT: MOV * T0.Y, T3.X, 2623; CM-NEXT: 
ALU clause starting at 88: 2624; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2625; CM-NEXT: AND_INT * T0.W, T5.X, literal.y, 2626; CM-NEXT: -256(nan), 255(3.573311e-43) 2627; CM-NEXT: OR_INT * T5.X, PV.Z, PV.W, 2628; CM-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, 2629; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2630entry: 2631 store <8 x i8> %in, ptr addrspace(1) %out 2632 ret void 2633} 2634 2635define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { 2636; SI-LABEL: v8i16_arg: 2637; SI: ; %bb.0: ; %entry 2638; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd 2639; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 2640; SI-NEXT: s_mov_b32 s7, 0xf000 2641; SI-NEXT: s_mov_b32 s6, -1 2642; SI-NEXT: s_waitcnt lgkmcnt(0) 2643; SI-NEXT: v_mov_b32_e32 v0, s0 2644; SI-NEXT: v_mov_b32_e32 v1, s1 2645; SI-NEXT: v_mov_b32_e32 v2, s2 2646; SI-NEXT: v_mov_b32_e32 v3, s3 2647; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2648; SI-NEXT: s_endpgm 2649; 2650; VI-LABEL: v8i16_arg: 2651; VI: ; %bb.0: ; %entry 2652; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 2653; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 2654; VI-NEXT: s_waitcnt lgkmcnt(0) 2655; VI-NEXT: v_mov_b32_e32 v4, s6 2656; VI-NEXT: v_mov_b32_e32 v0, s0 2657; VI-NEXT: v_mov_b32_e32 v5, s7 2658; VI-NEXT: v_mov_b32_e32 v1, s1 2659; VI-NEXT: v_mov_b32_e32 v2, s2 2660; VI-NEXT: v_mov_b32_e32 v3, s3 2661; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2662; VI-NEXT: s_endpgm 2663; 2664; GFX9-LABEL: v8i16_arg: 2665; GFX9: ; %bb.0: ; %entry 2666; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 2667; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 2668; GFX9-NEXT: v_mov_b32_e32 v4, 0 2669; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2670; GFX9-NEXT: v_mov_b32_e32 v0, s0 2671; GFX9-NEXT: v_mov_b32_e32 v1, s1 2672; GFX9-NEXT: v_mov_b32_e32 v2, s2 2673; GFX9-NEXT: v_mov_b32_e32 v3, s3 2674; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] 2675; GFX9-NEXT: s_endpgm 2676; 2677; EG-LABEL: v8i16_arg: 2678; EG: ; %bb.0: ; %entry 2679; EG-NEXT: 
ALU 1, @36, KC0[], KC1[] 2680; EG-NEXT: TEX 0 @20 2681; EG-NEXT: ALU 5, @38, KC0[], KC1[] 2682; EG-NEXT: TEX 0 @22 2683; EG-NEXT: ALU 5, @44, KC0[], KC1[] 2684; EG-NEXT: TEX 0 @24 2685; EG-NEXT: ALU 5, @50, KC0[], KC1[] 2686; EG-NEXT: TEX 0 @26 2687; EG-NEXT: ALU 5, @56, KC0[], KC1[] 2688; EG-NEXT: TEX 0 @28 2689; EG-NEXT: ALU 5, @62, KC0[], KC1[] 2690; EG-NEXT: TEX 0 @30 2691; EG-NEXT: ALU 5, @68, KC0[], KC1[] 2692; EG-NEXT: TEX 0 @32 2693; EG-NEXT: ALU 5, @74, KC0[], KC1[] 2694; EG-NEXT: TEX 0 @34 2695; EG-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[] 2696; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1 2697; EG-NEXT: CF_END 2698; EG-NEXT: PAD 2699; EG-NEXT: Fetch clause starting at 20: 2700; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3 2701; EG-NEXT: Fetch clause starting at 22: 2702; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3 2703; EG-NEXT: Fetch clause starting at 24: 2704; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3 2705; EG-NEXT: Fetch clause starting at 26: 2706; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3 2707; EG-NEXT: Fetch clause starting at 28: 2708; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3 2709; EG-NEXT: Fetch clause starting at 30: 2710; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3 2711; EG-NEXT: Fetch clause starting at 32: 2712; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3 2713; EG-NEXT: Fetch clause starting at 34: 2714; EG-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3 2715; EG-NEXT: ALU clause starting at 36: 2716; EG-NEXT: MOV * T0.Y, T3.X, 2717; EG-NEXT: MOV * T7.X, 0.0, 2718; EG-NEXT: ALU clause starting at 38: 2719; EG-NEXT: LSHL T0.W, T8.X, literal.x, 2720; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2721; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 2722; EG-NEXT: OR_INT * T0.W, PS, PV.W, 2723; EG-NEXT: MOV T3.X, PV.W, 2724; EG-NEXT: MOV * T0.Y, T5.X, 2725; EG-NEXT: ALU clause starting at 44: 2726; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2727; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 2728; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 2729; EG-NEXT: OR_INT * T0.W, PV.W, PS, 
2730; EG-NEXT: MOV T5.X, PV.W, 2731; EG-NEXT: MOV * T0.Y, T3.X, 2732; EG-NEXT: ALU clause starting at 50: 2733; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2734; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 2735; EG-NEXT: -65536(nan), 65535(9.183409e-41) 2736; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2737; EG-NEXT: MOV T3.X, PV.W, 2738; EG-NEXT: MOV * T0.Y, T5.X, 2739; EG-NEXT: ALU clause starting at 56: 2740; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2741; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 2742; EG-NEXT: -65536(nan), 65535(9.183409e-41) 2743; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2744; EG-NEXT: MOV T5.X, PV.W, 2745; EG-NEXT: MOV * T0.Y, T2.X, 2746; EG-NEXT: ALU clause starting at 62: 2747; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2748; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 2749; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 2750; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2751; EG-NEXT: MOV T2.X, PV.W, 2752; EG-NEXT: MOV * T0.Y, T4.X, 2753; EG-NEXT: ALU clause starting at 68: 2754; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2755; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 2756; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 2757; EG-NEXT: OR_INT * T0.W, PV.W, PS, 2758; EG-NEXT: MOV T4.X, PV.W, 2759; EG-NEXT: MOV * T0.Y, T2.X, 2760; EG-NEXT: ALU clause starting at 74: 2761; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 2762; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 2763; EG-NEXT: -65536(nan), 65535(9.183409e-41) 2764; EG-NEXT: OR_INT * T7.Z, PV.W, PS, 2765; EG-NEXT: MOV T2.X, PV.Z, 2766; EG-NEXT: MOV * T0.Y, T4.X, 2767; EG-NEXT: ALU clause starting at 80: 2768; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x, 2769; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, 2770; EG-NEXT: AND_INT * T1.W, T7.X, literal.z, 2771; EG-NEXT: 2(2.802597e-45), -65536(nan) 2772; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2773; EG-NEXT: OR_INT * T7.X, PV.W, PS, 2774; EG-NEXT: MOV T4.X, PV.X, 2775; EG-NEXT: MOV * T7.W, T3.X, 2776; EG-NEXT: MOV * T7.Y, T5.X, 2777; 2778; CM-LABEL: v8i16_arg: 2779; CM: ; %bb.0: ; %entry 2780; CM-NEXT: 
ALU 1, @36, KC0[], KC1[] 2781; CM-NEXT: TEX 0 @20 2782; CM-NEXT: ALU 5, @38, KC0[], KC1[] 2783; CM-NEXT: TEX 0 @22 2784; CM-NEXT: ALU 5, @44, KC0[], KC1[] 2785; CM-NEXT: TEX 0 @24 2786; CM-NEXT: ALU 5, @50, KC0[], KC1[] 2787; CM-NEXT: TEX 0 @26 2788; CM-NEXT: ALU 5, @56, KC0[], KC1[] 2789; CM-NEXT: TEX 0 @28 2790; CM-NEXT: ALU 5, @62, KC0[], KC1[] 2791; CM-NEXT: TEX 0 @30 2792; CM-NEXT: ALU 5, @68, KC0[], KC1[] 2793; CM-NEXT: TEX 0 @32 2794; CM-NEXT: ALU 5, @74, KC0[], KC1[] 2795; CM-NEXT: TEX 0 @34 2796; CM-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[] 2797; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X 2798; CM-NEXT: CF_END 2799; CM-NEXT: PAD 2800; CM-NEXT: Fetch clause starting at 20: 2801; CM-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3 2802; CM-NEXT: Fetch clause starting at 22: 2803; CM-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3 2804; CM-NEXT: Fetch clause starting at 24: 2805; CM-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3 2806; CM-NEXT: Fetch clause starting at 26: 2807; CM-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3 2808; CM-NEXT: Fetch clause starting at 28: 2809; CM-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3 2810; CM-NEXT: Fetch clause starting at 30: 2811; CM-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3 2812; CM-NEXT: Fetch clause starting at 32: 2813; CM-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3 2814; CM-NEXT: Fetch clause starting at 34: 2815; CM-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3 2816; CM-NEXT: ALU clause starting at 36: 2817; CM-NEXT: MOV * T0.Y, T3.X, 2818; CM-NEXT: MOV * T7.X, 0.0, 2819; CM-NEXT: ALU clause starting at 38: 2820; CM-NEXT: LSHL T0.Z, T8.X, literal.x, 2821; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 2822; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 2823; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 2824; CM-NEXT: MOV T3.X, PV.W, 2825; CM-NEXT: MOV * T0.Y, T5.X, 2826; CM-NEXT: ALU clause starting at 44: 2827; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2828; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 2829; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 2830; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 
2831; CM-NEXT: MOV T5.X, PV.W, 2832; CM-NEXT: MOV * T0.Y, T3.X, 2833; CM-NEXT: ALU clause starting at 50: 2834; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2835; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 2836; CM-NEXT: -65536(nan), 65535(9.183409e-41) 2837; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2838; CM-NEXT: MOV T3.X, PV.W, 2839; CM-NEXT: MOV * T0.Y, T5.X, 2840; CM-NEXT: ALU clause starting at 56: 2841; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2842; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 2843; CM-NEXT: -65536(nan), 65535(9.183409e-41) 2844; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2845; CM-NEXT: MOV T5.X, PV.W, 2846; CM-NEXT: MOV * T0.Y, T2.X, 2847; CM-NEXT: ALU clause starting at 62: 2848; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2849; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 2850; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 2851; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2852; CM-NEXT: MOV T2.X, PV.W, 2853; CM-NEXT: MOV * T0.Y, T4.X, 2854; CM-NEXT: ALU clause starting at 68: 2855; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2856; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 2857; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 2858; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 2859; CM-NEXT: MOV T4.X, PV.W, 2860; CM-NEXT: MOV * T0.Y, T2.X, 2861; CM-NEXT: ALU clause starting at 74: 2862; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 2863; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 2864; CM-NEXT: -65536(nan), 65535(9.183409e-41) 2865; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W, 2866; CM-NEXT: MOV T2.X, PV.Z, 2867; CM-NEXT: MOV * T0.Y, T4.X, 2868; CM-NEXT: ALU clause starting at 80: 2869; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x, 2870; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y, 2871; CM-NEXT: AND_INT * T0.W, T7.X, literal.z, 2872; CM-NEXT: 2(2.802597e-45), -65536(nan) 2873; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2874; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W, 2875; CM-NEXT: MOV T4.X, PV.X, 2876; CM-NEXT: MOV * T7.W, T3.X, 2877; CM-NEXT: MOV * T7.Y, T5.X, 2878entry: 2879 store <8 x i16> %in, ptr addrspace(1) %out 2880 
ret void 2881} 2882 2883define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> %in) nounwind { 2884; SI-LABEL: v8i32_arg: 2885; SI: ; %bb.0: ; %entry 2886; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11 2887; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2888; SI-NEXT: s_mov_b32 s3, 0xf000 2889; SI-NEXT: s_mov_b32 s2, -1 2890; SI-NEXT: s_waitcnt lgkmcnt(0) 2891; SI-NEXT: v_mov_b32_e32 v0, s12 2892; SI-NEXT: v_mov_b32_e32 v1, s13 2893; SI-NEXT: v_mov_b32_e32 v2, s14 2894; SI-NEXT: v_mov_b32_e32 v3, s15 2895; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 2896; SI-NEXT: s_waitcnt expcnt(0) 2897; SI-NEXT: v_mov_b32_e32 v0, s8 2898; SI-NEXT: v_mov_b32_e32 v1, s9 2899; SI-NEXT: v_mov_b32_e32 v2, s10 2900; SI-NEXT: v_mov_b32_e32 v3, s11 2901; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2902; SI-NEXT: s_endpgm 2903; 2904; VI-LABEL: v8i32_arg: 2905; VI: ; %bb.0: ; %entry 2906; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 2907; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2908; VI-NEXT: s_waitcnt lgkmcnt(0) 2909; VI-NEXT: v_mov_b32_e32 v0, s12 2910; VI-NEXT: s_add_u32 s2, s0, 16 2911; VI-NEXT: s_addc_u32 s3, s1, 0 2912; VI-NEXT: v_mov_b32_e32 v5, s3 2913; VI-NEXT: v_mov_b32_e32 v1, s13 2914; VI-NEXT: v_mov_b32_e32 v2, s14 2915; VI-NEXT: v_mov_b32_e32 v3, s15 2916; VI-NEXT: v_mov_b32_e32 v4, s2 2917; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2918; VI-NEXT: v_mov_b32_e32 v5, s1 2919; VI-NEXT: v_mov_b32_e32 v0, s8 2920; VI-NEXT: v_mov_b32_e32 v1, s9 2921; VI-NEXT: v_mov_b32_e32 v2, s10 2922; VI-NEXT: v_mov_b32_e32 v3, s11 2923; VI-NEXT: v_mov_b32_e32 v4, s0 2924; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2925; VI-NEXT: s_endpgm 2926; 2927; GFX9-LABEL: v8i32_arg: 2928; GFX9: ; %bb.0: ; %entry 2929; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20 2930; GFX9-NEXT: v_mov_b32_e32 v4, 0 2931; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 2932; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2933; GFX9-NEXT: v_mov_b32_e32 v0, s4 2934; GFX9-NEXT: 
v_mov_b32_e32 v1, s5 2935; GFX9-NEXT: v_mov_b32_e32 v2, s6 2936; GFX9-NEXT: v_mov_b32_e32 v3, s7 2937; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16 2938; GFX9-NEXT: s_nop 0 2939; GFX9-NEXT: v_mov_b32_e32 v0, s0 2940; GFX9-NEXT: v_mov_b32_e32 v1, s1 2941; GFX9-NEXT: v_mov_b32_e32 v2, s2 2942; GFX9-NEXT: v_mov_b32_e32 v3, s3 2943; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] 2944; GFX9-NEXT: s_endpgm 2945; 2946; EG-LABEL: v8i32_arg: 2947; EG: ; %bb.0: ; %entry 2948; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 2949; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 2950; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 2951; EG-NEXT: CF_END 2952; EG-NEXT: ALU clause starting at 4: 2953; EG-NEXT: MOV * T0.W, KC0[5].X, 2954; EG-NEXT: MOV * T0.Z, KC0[4].W, 2955; EG-NEXT: MOV T0.Y, KC0[4].Z, 2956; EG-NEXT: MOV * T1.W, KC0[6].X, 2957; EG-NEXT: MOV T0.X, KC0[4].Y, 2958; EG-NEXT: MOV * T1.Z, KC0[5].W, 2959; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 2960; EG-NEXT: MOV * T1.Y, KC0[5].Z, 2961; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2962; EG-NEXT: MOV T1.X, KC0[5].Y, 2963; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 2964; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2965; EG-NEXT: LSHR * T3.X, PV.W, literal.x, 2966; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2967; 2968; CM-LABEL: v8i32_arg: 2969; CM: ; %bb.0: ; %entry 2970; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 2971; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X 2972; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X 2973; CM-NEXT: CF_END 2974; CM-NEXT: ALU clause starting at 4: 2975; CM-NEXT: MOV * T0.W, KC0[6].X, 2976; CM-NEXT: MOV * T0.Z, KC0[5].W, 2977; CM-NEXT: MOV * T0.Y, KC0[5].Z, 2978; CM-NEXT: MOV T0.X, KC0[5].Y, 2979; CM-NEXT: MOV * T1.W, KC0[5].X, 2980; CM-NEXT: MOV T1.Z, KC0[4].W, 2981; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 2982; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2983; CM-NEXT: LSHR T2.X, PV.W, literal.x, 2984; CM-NEXT: MOV * T1.Y, KC0[4].Z, 2985; CM-NEXT: 
2(2.802597e-45), 0(0.000000e+00) 2986; CM-NEXT: MOV * T1.X, KC0[4].Y, 2987; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 2988; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2989entry: 2990 store <8 x i32> %in, ptr addrspace(1) %out, align 4 2991 ret void 2992} 2993 2994define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float> %in) nounwind { 2995; SI-LABEL: v8f32_arg: 2996; SI: ; %bb.0: ; %entry 2997; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11 2998; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2999; SI-NEXT: s_mov_b32 s3, 0xf000 3000; SI-NEXT: s_mov_b32 s2, -1 3001; SI-NEXT: s_waitcnt lgkmcnt(0) 3002; SI-NEXT: v_mov_b32_e32 v0, s12 3003; SI-NEXT: v_mov_b32_e32 v1, s13 3004; SI-NEXT: v_mov_b32_e32 v2, s14 3005; SI-NEXT: v_mov_b32_e32 v3, s15 3006; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 3007; SI-NEXT: s_waitcnt expcnt(0) 3008; SI-NEXT: v_mov_b32_e32 v0, s8 3009; SI-NEXT: v_mov_b32_e32 v1, s9 3010; SI-NEXT: v_mov_b32_e32 v2, s10 3011; SI-NEXT: v_mov_b32_e32 v3, s11 3012; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3013; SI-NEXT: s_endpgm 3014; 3015; VI-LABEL: v8f32_arg: 3016; VI: ; %bb.0: ; %entry 3017; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 3018; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3019; VI-NEXT: s_waitcnt lgkmcnt(0) 3020; VI-NEXT: v_mov_b32_e32 v0, s12 3021; VI-NEXT: s_add_u32 s2, s0, 16 3022; VI-NEXT: s_addc_u32 s3, s1, 0 3023; VI-NEXT: v_mov_b32_e32 v5, s3 3024; VI-NEXT: v_mov_b32_e32 v1, s13 3025; VI-NEXT: v_mov_b32_e32 v2, s14 3026; VI-NEXT: v_mov_b32_e32 v3, s15 3027; VI-NEXT: v_mov_b32_e32 v4, s2 3028; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3029; VI-NEXT: v_mov_b32_e32 v5, s1 3030; VI-NEXT: v_mov_b32_e32 v0, s8 3031; VI-NEXT: v_mov_b32_e32 v1, s9 3032; VI-NEXT: v_mov_b32_e32 v2, s10 3033; VI-NEXT: v_mov_b32_e32 v3, s11 3034; VI-NEXT: v_mov_b32_e32 v4, s0 3035; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3036; VI-NEXT: s_endpgm 3037; 3038; GFX9-LABEL: v8f32_arg: 3039; GFX9: ; %bb.0: ; 
%entry 3040; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20 3041; GFX9-NEXT: v_mov_b32_e32 v4, 0 3042; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 3043; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3044; GFX9-NEXT: v_mov_b32_e32 v0, s4 3045; GFX9-NEXT: v_mov_b32_e32 v1, s5 3046; GFX9-NEXT: v_mov_b32_e32 v2, s6 3047; GFX9-NEXT: v_mov_b32_e32 v3, s7 3048; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16 3049; GFX9-NEXT: s_nop 0 3050; GFX9-NEXT: v_mov_b32_e32 v0, s0 3051; GFX9-NEXT: v_mov_b32_e32 v1, s1 3052; GFX9-NEXT: v_mov_b32_e32 v2, s2 3053; GFX9-NEXT: v_mov_b32_e32 v3, s3 3054; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] 3055; GFX9-NEXT: s_endpgm 3056; 3057; EG-LABEL: v8f32_arg: 3058; EG: ; %bb.0: ; %entry 3059; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 3060; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0 3061; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 3062; EG-NEXT: CF_END 3063; EG-NEXT: ALU clause starting at 4: 3064; EG-NEXT: MOV * T0.W, KC0[5].X, 3065; EG-NEXT: MOV * T0.Z, KC0[4].W, 3066; EG-NEXT: MOV T0.Y, KC0[4].Z, 3067; EG-NEXT: MOV * T1.W, KC0[6].X, 3068; EG-NEXT: MOV T0.X, KC0[4].Y, 3069; EG-NEXT: MOV * T1.Z, KC0[5].W, 3070; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 3071; EG-NEXT: MOV * T1.Y, KC0[5].Z, 3072; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3073; EG-NEXT: MOV T1.X, KC0[5].Y, 3074; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 3075; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3076; EG-NEXT: LSHR * T3.X, PV.W, literal.x, 3077; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3078; 3079; CM-LABEL: v8f32_arg: 3080; CM: ; %bb.0: ; %entry 3081; CM-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 3082; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X 3083; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X 3084; CM-NEXT: CF_END 3085; CM-NEXT: ALU clause starting at 4: 3086; CM-NEXT: MOV * T0.W, KC0[6].X, 3087; CM-NEXT: MOV * T0.Z, KC0[5].W, 3088; CM-NEXT: MOV * T0.Y, KC0[5].Z, 3089; CM-NEXT: MOV T0.X, KC0[5].Y, 3090; CM-NEXT: MOV * T1.W, KC0[5].X, 
3091; CM-NEXT: MOV T1.Z, KC0[4].W, 3092; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 3093; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3094; CM-NEXT: LSHR T2.X, PV.W, literal.x, 3095; CM-NEXT: MOV * T1.Y, KC0[4].Z, 3096; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3097; CM-NEXT: MOV * T1.X, KC0[4].Y, 3098; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 3099; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3100entry: 3101 store <8 x float> %in, ptr addrspace(1) %out, align 4 3102 ret void 3103} 3104 3105; FIXME: Pack/repack on VI. NOTE(review): the VI checks for v16i8_arg below are a plain s_load_dwordx4 feeding flat_store_dwordx4 with no shift/and/or repacking, so this FIXME may be stale - confirm and remove. 3106define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) { 3107; SI-LABEL: v16i8_arg: 3108; SI: ; %bb.0: ; %entry 3109; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd 3110; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 3111; SI-NEXT: s_mov_b32 s7, 0xf000 3112; SI-NEXT: s_mov_b32 s6, -1 3113; SI-NEXT: s_waitcnt lgkmcnt(0) 3114; SI-NEXT: v_mov_b32_e32 v0, s0 3115; SI-NEXT: v_mov_b32_e32 v1, s1 3116; SI-NEXT: v_mov_b32_e32 v2, s2 3117; SI-NEXT: v_mov_b32_e32 v3, s3 3118; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 3119; SI-NEXT: s_endpgm 3120; 3121; VI-LABEL: v16i8_arg: 3122; VI: ; %bb.0: ; %entry 3123; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 3124; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 3125; VI-NEXT: s_waitcnt lgkmcnt(0) 3126; VI-NEXT: v_mov_b32_e32 v4, s6 3127; VI-NEXT: v_mov_b32_e32 v0, s0 3128; VI-NEXT: v_mov_b32_e32 v5, s7 3129; VI-NEXT: v_mov_b32_e32 v1, s1 3130; VI-NEXT: v_mov_b32_e32 v2, s2 3131; VI-NEXT: v_mov_b32_e32 v3, s3 3132; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3133; VI-NEXT: s_endpgm 3134; 3135; GFX9-LABEL: v16i8_arg: 3136; GFX9: ; %bb.0: ; %entry 3137; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 3138; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 3139; GFX9-NEXT: v_mov_b32_e32 v4, 0 3140; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3141; GFX9-NEXT: v_mov_b32_e32 v0, s0 3142; GFX9-NEXT: v_mov_b32_e32 v1, s1 3143; GFX9-NEXT: v_mov_b32_e32 v2, s2 3144; GFX9-NEXT: v_mov_b32_e32 v3, s3 3145; GFX9-NEXT: 
global_store_dwordx4 v4, v[0:3], s[4:5] 3146; GFX9-NEXT: s_endpgm 3147; 3148; EG-LABEL: v16i8_arg: 3149; EG: ; %bb.0: ; %entry 3150; EG-NEXT: ALU 1, @68, KC0[], KC1[] 3151; EG-NEXT: TEX 0 @36 3152; EG-NEXT: ALU 5, @70, KC0[], KC1[] 3153; EG-NEXT: TEX 0 @38 3154; EG-NEXT: ALU 5, @76, KC0[], KC1[] 3155; EG-NEXT: TEX 0 @40 3156; EG-NEXT: ALU 5, @82, KC0[], KC1[] 3157; EG-NEXT: TEX 0 @42 3158; EG-NEXT: ALU 5, @88, KC0[], KC1[] 3159; EG-NEXT: TEX 0 @44 3160; EG-NEXT: ALU 7, @94, KC0[], KC1[] 3161; EG-NEXT: TEX 0 @46 3162; EG-NEXT: ALU 7, @102, KC0[], KC1[] 3163; EG-NEXT: TEX 0 @48 3164; EG-NEXT: ALU 7, @110, KC0[], KC1[] 3165; EG-NEXT: TEX 0 @50 3166; EG-NEXT: ALU 7, @118, KC0[], KC1[] 3167; EG-NEXT: TEX 0 @52 3168; EG-NEXT: ALU 7, @126, KC0[], KC1[] 3169; EG-NEXT: TEX 0 @54 3170; EG-NEXT: ALU 7, @134, KC0[], KC1[] 3171; EG-NEXT: TEX 0 @56 3172; EG-NEXT: ALU 7, @142, KC0[], KC1[] 3173; EG-NEXT: TEX 0 @58 3174; EG-NEXT: ALU 7, @150, KC0[], KC1[] 3175; EG-NEXT: TEX 0 @60 3176; EG-NEXT: ALU 5, @158, KC0[], KC1[] 3177; EG-NEXT: TEX 0 @62 3178; EG-NEXT: ALU 5, @164, KC0[], KC1[] 3179; EG-NEXT: TEX 0 @64 3180; EG-NEXT: ALU 5, @170, KC0[], KC1[] 3181; EG-NEXT: TEX 0 @66 3182; EG-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[] 3183; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1 3184; EG-NEXT: CF_END 3185; EG-NEXT: PAD 3186; EG-NEXT: Fetch clause starting at 36: 3187; EG-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3 3188; EG-NEXT: Fetch clause starting at 38: 3189; EG-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3 3190; EG-NEXT: Fetch clause starting at 40: 3191; EG-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3 3192; EG-NEXT: Fetch clause starting at 42: 3193; EG-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3 3194; EG-NEXT: Fetch clause starting at 44: 3195; EG-NEXT: VTX_READ_8 T8.X, T7.X, 66, #3 3196; EG-NEXT: Fetch clause starting at 46: 3197; EG-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3 3198; EG-NEXT: Fetch clause starting at 48: 3199; EG-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3 3200; EG-NEXT: Fetch clause starting at 50: 3201; 
EG-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3 3202; EG-NEXT: Fetch clause starting at 52: 3203; EG-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3 3204; EG-NEXT: Fetch clause starting at 54: 3205; EG-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3 3206; EG-NEXT: Fetch clause starting at 56: 3207; EG-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3 3208; EG-NEXT: Fetch clause starting at 58: 3209; EG-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3 3210; EG-NEXT: Fetch clause starting at 60: 3211; EG-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3 3212; EG-NEXT: Fetch clause starting at 62: 3213; EG-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3 3214; EG-NEXT: Fetch clause starting at 64: 3215; EG-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3 3216; EG-NEXT: Fetch clause starting at 66: 3217; EG-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3 3218; EG-NEXT: ALU clause starting at 68: 3219; EG-NEXT: MOV * T0.Y, T2.X, 3220; EG-NEXT: MOV * T7.X, 0.0, 3221; EG-NEXT: ALU clause starting at 70: 3222; EG-NEXT: LSHL T0.W, T8.X, literal.x, 3223; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3224; EG-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) 3225; EG-NEXT: OR_INT * T0.W, PS, PV.W, 3226; EG-NEXT: MOV T2.X, PV.W, 3227; EG-NEXT: MOV * T0.Y, T3.X, 3228; EG-NEXT: ALU clause starting at 76: 3229; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3230; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 3231; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3232; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3233; EG-NEXT: MOV T3.X, PV.W, 3234; EG-NEXT: MOV * T0.Y, T4.X, 3235; EG-NEXT: ALU clause starting at 82: 3236; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3237; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 3238; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3239; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3240; EG-NEXT: MOV T4.X, PV.W, 3241; EG-NEXT: MOV * T0.Y, T5.X, 3242; EG-NEXT: ALU clause starting at 88: 3243; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3244; EG-NEXT: LSHL * T1.W, T8.X, literal.y, 3245; EG-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3246; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3247; EG-NEXT: MOV T5.X, PV.W, 
3248; EG-NEXT: MOV * T0.Y, T2.X, 3249; EG-NEXT: ALU clause starting at 94: 3250; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3251; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3252; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 3253; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3254; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3255; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3256; EG-NEXT: MOV T2.X, PV.W, 3257; EG-NEXT: MOV * T0.Y, T3.X, 3258; EG-NEXT: ALU clause starting at 102: 3259; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3260; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3261; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 3262; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3263; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3264; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3265; EG-NEXT: MOV T3.X, PV.W, 3266; EG-NEXT: MOV * T0.Y, T4.X, 3267; EG-NEXT: ALU clause starting at 110: 3268; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3269; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3270; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 3271; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3272; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3273; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3274; EG-NEXT: MOV T4.X, PV.W, 3275; EG-NEXT: MOV * T0.Y, T5.X, 3276; EG-NEXT: ALU clause starting at 118: 3277; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3278; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3279; EG-NEXT: 255(3.573311e-43), -16711681(-1.714704e+38) 3280; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3281; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3282; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3283; EG-NEXT: MOV T5.X, PV.W, 3284; EG-NEXT: MOV * T0.Y, T2.X, 3285; EG-NEXT: ALU clause starting at 126: 3286; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3287; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3288; EG-NEXT: 255(3.573311e-43), -65281(nan) 3289; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3290; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 3291; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3292; EG-NEXT: MOV T2.X, PV.W, 3293; EG-NEXT: MOV * T0.Y, T3.X, 
3294; EG-NEXT: ALU clause starting at 134: 3295; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3296; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3297; EG-NEXT: 255(3.573311e-43), -65281(nan) 3298; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3299; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 3300; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3301; EG-NEXT: MOV T3.X, PV.W, 3302; EG-NEXT: MOV * T0.Y, T4.X, 3303; EG-NEXT: ALU clause starting at 142: 3304; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3305; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3306; EG-NEXT: 255(3.573311e-43), -65281(nan) 3307; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3308; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 3309; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3310; EG-NEXT: MOV T4.X, PV.W, 3311; EG-NEXT: MOV * T0.Y, T5.X, 3312; EG-NEXT: ALU clause starting at 150: 3313; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3314; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3315; EG-NEXT: 255(3.573311e-43), -65281(nan) 3316; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3317; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 3318; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3319; EG-NEXT: MOV T5.X, PV.W, 3320; EG-NEXT: MOV * T0.Y, T2.X, 3321; EG-NEXT: ALU clause starting at 158: 3322; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3323; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 3324; EG-NEXT: -256(nan), 255(3.573311e-43) 3325; EG-NEXT: OR_INT * T7.W, PV.W, PS, 3326; EG-NEXT: MOV T2.X, PV.W, 3327; EG-NEXT: MOV * T0.Y, T3.X, 3328; EG-NEXT: ALU clause starting at 164: 3329; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3330; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 3331; EG-NEXT: -256(nan), 255(3.573311e-43) 3332; EG-NEXT: OR_INT * T7.Z, PV.W, PS, 3333; EG-NEXT: MOV T3.X, PV.Z, 3334; EG-NEXT: MOV * T0.Y, T4.X, 3335; EG-NEXT: ALU clause starting at 170: 3336; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3337; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, 3338; EG-NEXT: -256(nan), 255(3.573311e-43) 3339; EG-NEXT: OR_INT * T7.Y, PV.W, PS, 3340; EG-NEXT: MOV T4.X, PV.Y, 3341; EG-NEXT: MOV * T0.Y, 
T5.X, 3342; EG-NEXT: ALU clause starting at 176: 3343; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3344; EG-NEXT: AND_INT * T1.W, T7.X, literal.y, 3345; EG-NEXT: -256(nan), 255(3.573311e-43) 3346; EG-NEXT: OR_INT T7.X, PV.W, PS, 3347; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, 3348; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3349; 3350; CM-LABEL: v16i8_arg: 3351; CM: ; %bb.0: ; %entry 3352; CM-NEXT: ALU 1, @68, KC0[], KC1[] 3353; CM-NEXT: TEX 0 @36 3354; CM-NEXT: ALU 5, @70, KC0[], KC1[] 3355; CM-NEXT: TEX 0 @38 3356; CM-NEXT: ALU 5, @76, KC0[], KC1[] 3357; CM-NEXT: TEX 0 @40 3358; CM-NEXT: ALU 5, @82, KC0[], KC1[] 3359; CM-NEXT: TEX 0 @42 3360; CM-NEXT: ALU 5, @88, KC0[], KC1[] 3361; CM-NEXT: TEX 0 @44 3362; CM-NEXT: ALU 7, @94, KC0[], KC1[] 3363; CM-NEXT: TEX 0 @46 3364; CM-NEXT: ALU 7, @102, KC0[], KC1[] 3365; CM-NEXT: TEX 0 @48 3366; CM-NEXT: ALU 7, @110, KC0[], KC1[] 3367; CM-NEXT: TEX 0 @50 3368; CM-NEXT: ALU 7, @118, KC0[], KC1[] 3369; CM-NEXT: TEX 0 @52 3370; CM-NEXT: ALU 7, @126, KC0[], KC1[] 3371; CM-NEXT: TEX 0 @54 3372; CM-NEXT: ALU 7, @134, KC0[], KC1[] 3373; CM-NEXT: TEX 0 @56 3374; CM-NEXT: ALU 7, @142, KC0[], KC1[] 3375; CM-NEXT: TEX 0 @58 3376; CM-NEXT: ALU 7, @150, KC0[], KC1[] 3377; CM-NEXT: TEX 0 @60 3378; CM-NEXT: ALU 5, @158, KC0[], KC1[] 3379; CM-NEXT: TEX 0 @62 3380; CM-NEXT: ALU 5, @164, KC0[], KC1[] 3381; CM-NEXT: TEX 0 @64 3382; CM-NEXT: ALU 5, @170, KC0[], KC1[] 3383; CM-NEXT: TEX 0 @66 3384; CM-NEXT: ALU 5, @176, KC0[CB0:0-32], KC1[] 3385; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X 3386; CM-NEXT: CF_END 3387; CM-NEXT: PAD 3388; CM-NEXT: Fetch clause starting at 36: 3389; CM-NEXT: VTX_READ_8 T8.X, T7.X, 67, #3 3390; CM-NEXT: Fetch clause starting at 38: 3391; CM-NEXT: VTX_READ_8 T8.X, T7.X, 63, #3 3392; CM-NEXT: Fetch clause starting at 40: 3393; CM-NEXT: VTX_READ_8 T8.X, T7.X, 59, #3 3394; CM-NEXT: Fetch clause starting at 42: 3395; CM-NEXT: VTX_READ_8 T8.X, T7.X, 55, #3 3396; CM-NEXT: Fetch clause starting at 44: 3397; CM-NEXT: 
VTX_READ_8 T8.X, T7.X, 66, #3 3398; CM-NEXT: Fetch clause starting at 46: 3399; CM-NEXT: VTX_READ_8 T8.X, T7.X, 62, #3 3400; CM-NEXT: Fetch clause starting at 48: 3401; CM-NEXT: VTX_READ_8 T8.X, T7.X, 58, #3 3402; CM-NEXT: Fetch clause starting at 50: 3403; CM-NEXT: VTX_READ_8 T8.X, T7.X, 54, #3 3404; CM-NEXT: Fetch clause starting at 52: 3405; CM-NEXT: VTX_READ_8 T8.X, T7.X, 65, #3 3406; CM-NEXT: Fetch clause starting at 54: 3407; CM-NEXT: VTX_READ_8 T8.X, T7.X, 61, #3 3408; CM-NEXT: Fetch clause starting at 56: 3409; CM-NEXT: VTX_READ_8 T8.X, T7.X, 57, #3 3410; CM-NEXT: Fetch clause starting at 58: 3411; CM-NEXT: VTX_READ_8 T8.X, T7.X, 53, #3 3412; CM-NEXT: Fetch clause starting at 60: 3413; CM-NEXT: VTX_READ_8 T8.X, T7.X, 64, #3 3414; CM-NEXT: Fetch clause starting at 62: 3415; CM-NEXT: VTX_READ_8 T8.X, T7.X, 60, #3 3416; CM-NEXT: Fetch clause starting at 64: 3417; CM-NEXT: VTX_READ_8 T8.X, T7.X, 56, #3 3418; CM-NEXT: Fetch clause starting at 66: 3419; CM-NEXT: VTX_READ_8 T7.X, T7.X, 52, #3 3420; CM-NEXT: ALU clause starting at 68: 3421; CM-NEXT: MOV * T0.Y, T2.X, 3422; CM-NEXT: MOV * T7.X, 0.0, 3423; CM-NEXT: ALU clause starting at 70: 3424; CM-NEXT: LSHL T0.Z, T8.X, literal.x, 3425; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, 3426; CM-NEXT: 24(3.363116e-44), 16777215(2.350989e-38) 3427; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 3428; CM-NEXT: MOV T2.X, PV.W, 3429; CM-NEXT: MOV * T0.Y, T3.X, 3430; CM-NEXT: ALU clause starting at 76: 3431; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3432; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3433; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3434; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3435; CM-NEXT: MOV T3.X, PV.W, 3436; CM-NEXT: MOV * T0.Y, T4.X, 3437; CM-NEXT: ALU clause starting at 82: 3438; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3439; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3440; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3441; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3442; CM-NEXT: MOV T4.X, PV.W, 3443; CM-NEXT: MOV * T0.Y, 
T5.X, 3444; CM-NEXT: ALU clause starting at 88: 3445; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3446; CM-NEXT: LSHL * T0.W, T8.X, literal.y, 3447; CM-NEXT: 16777215(2.350989e-38), 24(3.363116e-44) 3448; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3449; CM-NEXT: MOV T5.X, PV.W, 3450; CM-NEXT: MOV * T0.Y, T2.X, 3451; CM-NEXT: ALU clause starting at 94: 3452; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3453; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3454; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3455; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3456; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 3457; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3458; CM-NEXT: MOV T2.X, PV.W, 3459; CM-NEXT: MOV * T0.Y, T3.X, 3460; CM-NEXT: ALU clause starting at 102: 3461; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3462; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3463; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3464; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3465; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 3466; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3467; CM-NEXT: MOV T3.X, PV.W, 3468; CM-NEXT: MOV * T0.Y, T4.X, 3469; CM-NEXT: ALU clause starting at 110: 3470; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3471; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3472; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3473; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3474; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 3475; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3476; CM-NEXT: MOV T4.X, PV.W, 3477; CM-NEXT: MOV * T0.Y, T5.X, 3478; CM-NEXT: ALU clause starting at 118: 3479; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3480; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3481; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3482; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3483; CM-NEXT: -16711681(-1.714704e+38), 16(2.242078e-44) 3484; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3485; CM-NEXT: MOV T5.X, PV.W, 3486; CM-NEXT: MOV * T0.Y, T2.X, 3487; CM-NEXT: ALU clause starting at 126: 3488; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3489; CM-NEXT: 
255(3.573311e-43), 0(0.000000e+00) 3490; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3491; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3492; CM-NEXT: -65281(nan), 8(1.121039e-44) 3493; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3494; CM-NEXT: MOV T2.X, PV.W, 3495; CM-NEXT: MOV * T0.Y, T3.X, 3496; CM-NEXT: ALU clause starting at 134: 3497; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3498; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3499; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3500; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3501; CM-NEXT: -65281(nan), 8(1.121039e-44) 3502; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3503; CM-NEXT: MOV T3.X, PV.W, 3504; CM-NEXT: MOV * T0.Y, T4.X, 3505; CM-NEXT: ALU clause starting at 142: 3506; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3507; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3508; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3509; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3510; CM-NEXT: -65281(nan), 8(1.121039e-44) 3511; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3512; CM-NEXT: MOV T4.X, PV.W, 3513; CM-NEXT: MOV * T0.Y, T5.X, 3514; CM-NEXT: ALU clause starting at 150: 3515; CM-NEXT: AND_INT * T0.W, T8.X, literal.x, 3516; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 3517; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3518; CM-NEXT: LSHL * T0.W, PV.W, literal.y, 3519; CM-NEXT: -65281(nan), 8(1.121039e-44) 3520; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3521; CM-NEXT: MOV T5.X, PV.W, 3522; CM-NEXT: MOV * T0.Y, T2.X, 3523; CM-NEXT: ALU clause starting at 158: 3524; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3525; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3526; CM-NEXT: -256(nan), 255(3.573311e-43) 3527; CM-NEXT: OR_INT * T7.W, PV.Z, PV.W, 3528; CM-NEXT: MOV T2.X, PV.W, 3529; CM-NEXT: MOV * T0.Y, T3.X, 3530; CM-NEXT: ALU clause starting at 164: 3531; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3532; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3533; CM-NEXT: -256(nan), 255(3.573311e-43) 3534; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W, 3535; CM-NEXT: MOV T3.X, PV.Z, 3536; CM-NEXT: MOV * T0.Y, T4.X, 
3537; CM-NEXT: ALU clause starting at 170: 3538; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3539; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, 3540; CM-NEXT: -256(nan), 255(3.573311e-43) 3541; CM-NEXT: OR_INT * T7.Y, PV.Z, PV.W, 3542; CM-NEXT: MOV T4.X, PV.Y, 3543; CM-NEXT: MOV * T0.Y, T5.X, 3544; CM-NEXT: ALU clause starting at 176: 3545; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3546; CM-NEXT: AND_INT * T0.W, T7.X, literal.y, 3547; CM-NEXT: -256(nan), 255(3.573311e-43) 3548; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W, 3549; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, 3550; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3551entry: 3552 store <16 x i8> %in, ptr addrspace(1) %out 3553 ret void 3554} 3555 3556define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { 3557; SI-LABEL: v16i16_arg: 3558; SI: ; %bb.0: ; %entry 3559; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11 3560; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 3561; SI-NEXT: s_mov_b32 s3, 0xf000 3562; SI-NEXT: s_mov_b32 s2, -1 3563; SI-NEXT: s_waitcnt lgkmcnt(0) 3564; SI-NEXT: v_mov_b32_e32 v0, s12 3565; SI-NEXT: v_mov_b32_e32 v1, s13 3566; SI-NEXT: v_mov_b32_e32 v2, s14 3567; SI-NEXT: v_mov_b32_e32 v3, s15 3568; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 3569; SI-NEXT: s_waitcnt expcnt(0) 3570; SI-NEXT: v_mov_b32_e32 v0, s8 3571; SI-NEXT: v_mov_b32_e32 v1, s9 3572; SI-NEXT: v_mov_b32_e32 v2, s10 3573; SI-NEXT: v_mov_b32_e32 v3, s11 3574; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3575; SI-NEXT: s_endpgm 3576; 3577; VI-LABEL: v16i16_arg: 3578; VI: ; %bb.0: ; %entry 3579; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 3580; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3581; VI-NEXT: s_waitcnt lgkmcnt(0) 3582; VI-NEXT: v_mov_b32_e32 v0, s12 3583; VI-NEXT: s_add_u32 s2, s0, 16 3584; VI-NEXT: s_addc_u32 s3, s1, 0 3585; VI-NEXT: v_mov_b32_e32 v5, s3 3586; VI-NEXT: v_mov_b32_e32 v1, s13 3587; VI-NEXT: v_mov_b32_e32 v2, s14 3588; VI-NEXT: v_mov_b32_e32 v3, s15 3589; VI-NEXT: 
v_mov_b32_e32 v4, s2 3590; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3591; VI-NEXT: v_mov_b32_e32 v5, s1 3592; VI-NEXT: v_mov_b32_e32 v0, s8 3593; VI-NEXT: v_mov_b32_e32 v1, s9 3594; VI-NEXT: v_mov_b32_e32 v2, s10 3595; VI-NEXT: v_mov_b32_e32 v3, s11 3596; VI-NEXT: v_mov_b32_e32 v4, s0 3597; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3598; VI-NEXT: s_endpgm 3599; 3600; GFX9-LABEL: v16i16_arg: 3601; GFX9: ; %bb.0: ; %entry 3602; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20 3603; GFX9-NEXT: v_mov_b32_e32 v4, 0 3604; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 3605; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3606; GFX9-NEXT: v_mov_b32_e32 v0, s4 3607; GFX9-NEXT: v_mov_b32_e32 v1, s5 3608; GFX9-NEXT: v_mov_b32_e32 v2, s6 3609; GFX9-NEXT: v_mov_b32_e32 v3, s7 3610; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16 3611; GFX9-NEXT: s_nop 0 3612; GFX9-NEXT: v_mov_b32_e32 v0, s0 3613; GFX9-NEXT: v_mov_b32_e32 v1, s1 3614; GFX9-NEXT: v_mov_b32_e32 v2, s2 3615; GFX9-NEXT: v_mov_b32_e32 v3, s3 3616; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] 3617; GFX9-NEXT: s_endpgm 3618; 3619; EG-LABEL: v16i16_arg: 3620; EG: ; %bb.0: ; %entry 3621; EG-NEXT: ALU 1, @68, KC0[], KC1[] 3622; EG-NEXT: TEX 0 @36 3623; EG-NEXT: ALU 5, @70, KC0[], KC1[] 3624; EG-NEXT: TEX 0 @38 3625; EG-NEXT: ALU 5, @76, KC0[], KC1[] 3626; EG-NEXT: TEX 0 @40 3627; EG-NEXT: ALU 5, @82, KC0[], KC1[] 3628; EG-NEXT: TEX 0 @42 3629; EG-NEXT: ALU 5, @88, KC0[], KC1[] 3630; EG-NEXT: TEX 0 @44 3631; EG-NEXT: ALU 5, @94, KC0[], KC1[] 3632; EG-NEXT: TEX 0 @46 3633; EG-NEXT: ALU 5, @100, KC0[], KC1[] 3634; EG-NEXT: TEX 0 @48 3635; EG-NEXT: ALU 5, @106, KC0[], KC1[] 3636; EG-NEXT: TEX 0 @50 3637; EG-NEXT: ALU 5, @112, KC0[], KC1[] 3638; EG-NEXT: TEX 0 @52 3639; EG-NEXT: ALU 5, @118, KC0[], KC1[] 3640; EG-NEXT: TEX 0 @54 3641; EG-NEXT: ALU 5, @124, KC0[], KC1[] 3642; EG-NEXT: TEX 0 @56 3643; EG-NEXT: ALU 5, @130, KC0[], KC1[] 3644; EG-NEXT: TEX 0 @58 3645; EG-NEXT: ALU 5, @136, KC0[], KC1[] 3646; EG-NEXT: TEX 0 @60 
3647; EG-NEXT: ALU 5, @142, KC0[], KC1[] 3648; EG-NEXT: TEX 0 @62 3649; EG-NEXT: ALU 5, @148, KC0[], KC1[] 3650; EG-NEXT: TEX 0 @64 3651; EG-NEXT: ALU 5, @154, KC0[], KC1[] 3652; EG-NEXT: TEX 0 @66 3653; EG-NEXT: ALU 13, @160, KC0[CB0:0-32], KC1[] 3654; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0 3655; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1 3656; EG-NEXT: CF_END 3657; EG-NEXT: Fetch clause starting at 36: 3658; EG-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3 3659; EG-NEXT: Fetch clause starting at 38: 3660; EG-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3 3661; EG-NEXT: Fetch clause starting at 40: 3662; EG-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3 3663; EG-NEXT: Fetch clause starting at 42: 3664; EG-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3 3665; EG-NEXT: Fetch clause starting at 44: 3666; EG-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3 3667; EG-NEXT: Fetch clause starting at 46: 3668; EG-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3 3669; EG-NEXT: Fetch clause starting at 48: 3670; EG-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3 3671; EG-NEXT: Fetch clause starting at 50: 3672; EG-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3 3673; EG-NEXT: Fetch clause starting at 52: 3674; EG-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3 3675; EG-NEXT: Fetch clause starting at 54: 3676; EG-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3 3677; EG-NEXT: Fetch clause starting at 56: 3678; EG-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3 3679; EG-NEXT: Fetch clause starting at 58: 3680; EG-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3 3681; EG-NEXT: Fetch clause starting at 60: 3682; EG-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3 3683; EG-NEXT: Fetch clause starting at 62: 3684; EG-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3 3685; EG-NEXT: Fetch clause starting at 64: 3686; EG-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3 3687; EG-NEXT: Fetch clause starting at 66: 3688; EG-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3 3689; EG-NEXT: ALU clause starting at 68: 3690; EG-NEXT: MOV * T0.Y, T3.X, 3691; EG-NEXT: MOV * T11.X, 0.0, 3692; EG-NEXT: ALU clause 
starting at 70: 3693; EG-NEXT: LSHL T0.W, T12.X, literal.x, 3694; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 3695; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 3696; EG-NEXT: OR_INT * T0.W, PS, PV.W, 3697; EG-NEXT: MOV T3.X, PV.W, 3698; EG-NEXT: MOV * T0.Y, T5.X, 3699; EG-NEXT: ALU clause starting at 76: 3700; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3701; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3702; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3703; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3704; EG-NEXT: MOV T5.X, PV.W, 3705; EG-NEXT: MOV * T0.Y, T7.X, 3706; EG-NEXT: ALU clause starting at 82: 3707; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3708; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3709; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3710; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3711; EG-NEXT: MOV T7.X, PV.W, 3712; EG-NEXT: MOV * T0.Y, T9.X, 3713; EG-NEXT: ALU clause starting at 88: 3714; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3715; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3716; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3717; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3718; EG-NEXT: MOV T9.X, PV.W, 3719; EG-NEXT: MOV * T0.Y, T3.X, 3720; EG-NEXT: ALU clause starting at 94: 3721; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3722; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3723; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3724; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3725; EG-NEXT: MOV T3.X, PV.W, 3726; EG-NEXT: MOV * T0.Y, T5.X, 3727; EG-NEXT: ALU clause starting at 100: 3728; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3729; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3730; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3731; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3732; EG-NEXT: MOV T5.X, PV.W, 3733; EG-NEXT: MOV * T0.Y, T7.X, 3734; EG-NEXT: ALU clause starting at 106: 3735; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3736; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3737; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3738; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3739; EG-NEXT: MOV T7.X, PV.W, 3740; EG-NEXT: MOV * 
T0.Y, T9.X, 3741; EG-NEXT: ALU clause starting at 112: 3742; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3743; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3744; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3745; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3746; EG-NEXT: MOV T9.X, PV.W, 3747; EG-NEXT: MOV * T0.Y, T2.X, 3748; EG-NEXT: ALU clause starting at 118: 3749; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3750; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3751; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3752; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3753; EG-NEXT: MOV T2.X, PV.W, 3754; EG-NEXT: MOV * T0.Y, T4.X, 3755; EG-NEXT: ALU clause starting at 124: 3756; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3757; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3758; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3759; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3760; EG-NEXT: MOV T4.X, PV.W, 3761; EG-NEXT: MOV * T0.Y, T6.X, 3762; EG-NEXT: ALU clause starting at 130: 3763; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3764; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3765; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3766; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3767; EG-NEXT: MOV T6.X, PV.W, 3768; EG-NEXT: MOV * T0.Y, T8.X, 3769; EG-NEXT: ALU clause starting at 136: 3770; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3771; EG-NEXT: LSHL * T1.W, T12.X, literal.y, 3772; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3773; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3774; EG-NEXT: MOV T8.X, PV.W, 3775; EG-NEXT: MOV * T0.Y, T2.X, 3776; EG-NEXT: ALU clause starting at 142: 3777; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3778; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3779; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3780; EG-NEXT: OR_INT * T12.Z, PV.W, PS, 3781; EG-NEXT: MOV T2.X, PV.Z, 3782; EG-NEXT: MOV * T0.Y, T4.X, 3783; EG-NEXT: ALU clause starting at 148: 3784; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3785; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, 3786; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3787; EG-NEXT: OR_INT * T12.X, PV.W, PS, 3788; 
EG-NEXT: MOV T4.X, PV.X, 3789; EG-NEXT: MOV * T0.Y, T6.X, 3790; EG-NEXT: ALU clause starting at 154: 3791; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, 3792; EG-NEXT: AND_INT * T1.W, T13.X, literal.y, 3793; EG-NEXT: -65536(nan), 65535(9.183409e-41) 3794; EG-NEXT: OR_INT * T11.Z, PV.W, PS, 3795; EG-NEXT: MOV T6.X, PV.Z, 3796; EG-NEXT: MOV * T0.Y, T8.X, 3797; EG-NEXT: ALU clause starting at 160: 3798; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x, 3799; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 3800; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) 3801; EG-NEXT: LSHR T14.X, PV.W, literal.x, 3802; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, 3803; EG-NEXT: AND_INT * T1.W, T11.X, literal.z, 3804; EG-NEXT: 2(2.802597e-45), -65536(nan) 3805; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3806; EG-NEXT: OR_INT * T11.X, PV.W, PS, 3807; EG-NEXT: MOV T8.X, PV.X, 3808; EG-NEXT: MOV * T12.W, T3.X, 3809; EG-NEXT: MOV T12.Y, T5.X, 3810; EG-NEXT: MOV T11.W, T7.X, BS:VEC_120/SCL_212 3811; EG-NEXT: MOV * T11.Y, T9.X, 3812; 3813; CM-LABEL: v16i16_arg: 3814; CM: ; %bb.0: ; %entry 3815; CM-NEXT: ALU 1, @68, KC0[], KC1[] 3816; CM-NEXT: TEX 0 @36 3817; CM-NEXT: ALU 5, @70, KC0[], KC1[] 3818; CM-NEXT: TEX 0 @38 3819; CM-NEXT: ALU 5, @76, KC0[], KC1[] 3820; CM-NEXT: TEX 0 @40 3821; CM-NEXT: ALU 5, @82, KC0[], KC1[] 3822; CM-NEXT: TEX 0 @42 3823; CM-NEXT: ALU 5, @88, KC0[], KC1[] 3824; CM-NEXT: TEX 0 @44 3825; CM-NEXT: ALU 5, @94, KC0[], KC1[] 3826; CM-NEXT: TEX 0 @46 3827; CM-NEXT: ALU 5, @100, KC0[], KC1[] 3828; CM-NEXT: TEX 0 @48 3829; CM-NEXT: ALU 5, @106, KC0[], KC1[] 3830; CM-NEXT: TEX 0 @50 3831; CM-NEXT: ALU 5, @112, KC0[], KC1[] 3832; CM-NEXT: TEX 0 @52 3833; CM-NEXT: ALU 5, @118, KC0[], KC1[] 3834; CM-NEXT: TEX 0 @54 3835; CM-NEXT: ALU 5, @124, KC0[], KC1[] 3836; CM-NEXT: TEX 0 @56 3837; CM-NEXT: ALU 5, @130, KC0[], KC1[] 3838; CM-NEXT: TEX 0 @58 3839; CM-NEXT: ALU 5, @136, KC0[], KC1[] 3840; CM-NEXT: TEX 0 @60 3841; CM-NEXT: ALU 5, @142, KC0[], KC1[] 3842; CM-NEXT: TEX 0 @62 3843; CM-NEXT: ALU 
5, @148, KC0[], KC1[] 3844; CM-NEXT: TEX 0 @64 3845; CM-NEXT: ALU 5, @154, KC0[], KC1[] 3846; CM-NEXT: TEX 0 @66 3847; CM-NEXT: ALU 14, @160, KC0[CB0:0-32], KC1[] 3848; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T14.X 3849; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T13.X 3850; CM-NEXT: CF_END 3851; CM-NEXT: Fetch clause starting at 36: 3852; CM-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3 3853; CM-NEXT: Fetch clause starting at 38: 3854; CM-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3 3855; CM-NEXT: Fetch clause starting at 40: 3856; CM-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3 3857; CM-NEXT: Fetch clause starting at 42: 3858; CM-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3 3859; CM-NEXT: Fetch clause starting at 44: 3860; CM-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3 3861; CM-NEXT: Fetch clause starting at 46: 3862; CM-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3 3863; CM-NEXT: Fetch clause starting at 48: 3864; CM-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3 3865; CM-NEXT: Fetch clause starting at 50: 3866; CM-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3 3867; CM-NEXT: Fetch clause starting at 52: 3868; CM-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3 3869; CM-NEXT: Fetch clause starting at 54: 3870; CM-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3 3871; CM-NEXT: Fetch clause starting at 56: 3872; CM-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3 3873; CM-NEXT: Fetch clause starting at 58: 3874; CM-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3 3875; CM-NEXT: Fetch clause starting at 60: 3876; CM-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3 3877; CM-NEXT: Fetch clause starting at 62: 3878; CM-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3 3879; CM-NEXT: Fetch clause starting at 64: 3880; CM-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3 3881; CM-NEXT: Fetch clause starting at 66: 3882; CM-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3 3883; CM-NEXT: ALU clause starting at 68: 3884; CM-NEXT: MOV * T0.Y, T3.X, 3885; CM-NEXT: MOV * T11.X, 0.0, 3886; CM-NEXT: ALU clause starting at 70: 3887; CM-NEXT: LSHL T0.Z, T12.X, literal.x, 3888; CM-NEXT: AND_INT * T0.W, T0.Y, 
literal.y, 3889; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 3890; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, 3891; CM-NEXT: MOV T3.X, PV.W, 3892; CM-NEXT: MOV * T0.Y, T5.X, 3893; CM-NEXT: ALU clause starting at 76: 3894; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3895; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 3896; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3897; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3898; CM-NEXT: MOV T5.X, PV.W, 3899; CM-NEXT: MOV * T0.Y, T7.X, 3900; CM-NEXT: ALU clause starting at 82: 3901; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3902; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 3903; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3904; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3905; CM-NEXT: MOV T7.X, PV.W, 3906; CM-NEXT: MOV * T0.Y, T9.X, 3907; CM-NEXT: ALU clause starting at 88: 3908; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3909; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 3910; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3911; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3912; CM-NEXT: MOV T9.X, PV.W, 3913; CM-NEXT: MOV * T0.Y, T3.X, 3914; CM-NEXT: ALU clause starting at 94: 3915; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3916; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 3917; CM-NEXT: -65536(nan), 65535(9.183409e-41) 3918; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3919; CM-NEXT: MOV T3.X, PV.W, 3920; CM-NEXT: MOV * T0.Y, T5.X, 3921; CM-NEXT: ALU clause starting at 100: 3922; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3923; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 3924; CM-NEXT: -65536(nan), 65535(9.183409e-41) 3925; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3926; CM-NEXT: MOV T5.X, PV.W, 3927; CM-NEXT: MOV * T0.Y, T7.X, 3928; CM-NEXT: ALU clause starting at 106: 3929; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3930; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 3931; CM-NEXT: -65536(nan), 65535(9.183409e-41) 3932; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3933; CM-NEXT: MOV T7.X, PV.W, 3934; CM-NEXT: MOV * T0.Y, T9.X, 3935; CM-NEXT: ALU clause starting at 112: 3936; CM-NEXT: AND_INT T0.Z, 
T0.Y, literal.x, 3937; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 3938; CM-NEXT: -65536(nan), 65535(9.183409e-41) 3939; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3940; CM-NEXT: MOV T9.X, PV.W, 3941; CM-NEXT: MOV * T0.Y, T2.X, 3942; CM-NEXT: ALU clause starting at 118: 3943; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3944; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 3945; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3946; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3947; CM-NEXT: MOV T2.X, PV.W, 3948; CM-NEXT: MOV * T0.Y, T4.X, 3949; CM-NEXT: ALU clause starting at 124: 3950; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3951; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 3952; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3953; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3954; CM-NEXT: MOV T4.X, PV.W, 3955; CM-NEXT: MOV * T0.Y, T6.X, 3956; CM-NEXT: ALU clause starting at 130: 3957; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3958; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 3959; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3960; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3961; CM-NEXT: MOV T6.X, PV.W, 3962; CM-NEXT: MOV * T0.Y, T8.X, 3963; CM-NEXT: ALU clause starting at 136: 3964; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3965; CM-NEXT: LSHL * T0.W, T12.X, literal.y, 3966; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 3967; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, 3968; CM-NEXT: MOV T8.X, PV.W, 3969; CM-NEXT: MOV * T0.Y, T2.X, 3970; CM-NEXT: ALU clause starting at 142: 3971; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3972; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 3973; CM-NEXT: -65536(nan), 65535(9.183409e-41) 3974; CM-NEXT: OR_INT * T12.Z, PV.Z, PV.W, 3975; CM-NEXT: MOV T2.X, PV.Z, 3976; CM-NEXT: MOV * T0.Y, T4.X, 3977; CM-NEXT: ALU clause starting at 148: 3978; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3979; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, 3980; CM-NEXT: -65536(nan), 65535(9.183409e-41) 3981; CM-NEXT: OR_INT * T12.X, PV.Z, PV.W, 3982; CM-NEXT: MOV T4.X, PV.X, 3983; CM-NEXT: MOV * T0.Y, T6.X, 3984; CM-NEXT: 
ALU clause starting at 154: 3985; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, 3986; CM-NEXT: AND_INT * T0.W, T13.X, literal.y, 3987; CM-NEXT: -65536(nan), 65535(9.183409e-41) 3988; CM-NEXT: OR_INT * T11.Z, PV.Z, PV.W, 3989; CM-NEXT: MOV T6.X, PV.Z, 3990; CM-NEXT: MOV * T0.Y, T8.X, 3991; CM-NEXT: ALU clause starting at 160: 3992; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 3993; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3994; CM-NEXT: LSHR * T13.X, PV.W, literal.x, 3995; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3996; CM-NEXT: LSHR T14.X, KC0[2].Y, literal.x, 3997; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y, 3998; CM-NEXT: AND_INT * T0.W, T11.X, literal.z, 3999; CM-NEXT: 2(2.802597e-45), -65536(nan) 4000; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 4001; CM-NEXT: OR_INT * T11.X, PV.Z, PV.W, 4002; CM-NEXT: MOV T8.X, PV.X, 4003; CM-NEXT: MOV * T12.W, T3.X, 4004; CM-NEXT: MOV T12.Y, T5.X, 4005; CM-NEXT: MOV * T11.W, T7.X, BS:VEC_120/SCL_212 4006; CM-NEXT: MOV * T11.Y, T9.X, 4007entry: 4008 store <16 x i16> %in, ptr addrspace(1) %out 4009 ret void 4010} 4011 4012define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32> %in) nounwind { 4013; SI-LABEL: v16i32_arg: 4014; SI: ; %bb.0: ; %entry 4015; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 4016; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 4017; SI-NEXT: s_mov_b32 s3, 0xf000 4018; SI-NEXT: s_mov_b32 s2, -1 4019; SI-NEXT: s_waitcnt lgkmcnt(0) 4020; SI-NEXT: v_mov_b32_e32 v0, s20 4021; SI-NEXT: v_mov_b32_e32 v1, s21 4022; SI-NEXT: v_mov_b32_e32 v2, s22 4023; SI-NEXT: v_mov_b32_e32 v3, s23 4024; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 4025; SI-NEXT: s_waitcnt expcnt(0) 4026; SI-NEXT: v_mov_b32_e32 v0, s16 4027; SI-NEXT: v_mov_b32_e32 v1, s17 4028; SI-NEXT: v_mov_b32_e32 v2, s18 4029; SI-NEXT: v_mov_b32_e32 v3, s19 4030; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 4031; SI-NEXT: s_waitcnt expcnt(0) 4032; SI-NEXT: v_mov_b32_e32 v0, s12 4033; SI-NEXT: 
v_mov_b32_e32 v1, s13 4034; SI-NEXT: v_mov_b32_e32 v2, s14 4035; SI-NEXT: v_mov_b32_e32 v3, s15 4036; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 4037; SI-NEXT: s_waitcnt expcnt(0) 4038; SI-NEXT: v_mov_b32_e32 v0, s8 4039; SI-NEXT: v_mov_b32_e32 v1, s9 4040; SI-NEXT: v_mov_b32_e32 v2, s10 4041; SI-NEXT: v_mov_b32_e32 v3, s11 4042; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 4043; SI-NEXT: s_endpgm 4044; 4045; VI-LABEL: v16i32_arg: 4046; VI: ; %bb.0: ; %entry 4047; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 4048; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4049; VI-NEXT: s_waitcnt lgkmcnt(0) 4050; VI-NEXT: v_mov_b32_e32 v0, s20 4051; VI-NEXT: s_add_u32 s2, s0, 48 4052; VI-NEXT: s_addc_u32 s3, s1, 0 4053; VI-NEXT: v_mov_b32_e32 v5, s3 4054; VI-NEXT: v_mov_b32_e32 v4, s2 4055; VI-NEXT: s_add_u32 s2, s0, 32 4056; VI-NEXT: v_mov_b32_e32 v1, s21 4057; VI-NEXT: v_mov_b32_e32 v2, s22 4058; VI-NEXT: v_mov_b32_e32 v3, s23 4059; VI-NEXT: s_addc_u32 s3, s1, 0 4060; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4061; VI-NEXT: v_mov_b32_e32 v5, s3 4062; VI-NEXT: v_mov_b32_e32 v4, s2 4063; VI-NEXT: s_add_u32 s2, s0, 16 4064; VI-NEXT: v_mov_b32_e32 v0, s16 4065; VI-NEXT: v_mov_b32_e32 v1, s17 4066; VI-NEXT: v_mov_b32_e32 v2, s18 4067; VI-NEXT: v_mov_b32_e32 v3, s19 4068; VI-NEXT: s_addc_u32 s3, s1, 0 4069; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4070; VI-NEXT: v_mov_b32_e32 v5, s3 4071; VI-NEXT: v_mov_b32_e32 v0, s12 4072; VI-NEXT: v_mov_b32_e32 v1, s13 4073; VI-NEXT: v_mov_b32_e32 v2, s14 4074; VI-NEXT: v_mov_b32_e32 v3, s15 4075; VI-NEXT: v_mov_b32_e32 v4, s2 4076; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4077; VI-NEXT: v_mov_b32_e32 v5, s1 4078; VI-NEXT: v_mov_b32_e32 v0, s8 4079; VI-NEXT: v_mov_b32_e32 v1, s9 4080; VI-NEXT: v_mov_b32_e32 v2, s10 4081; VI-NEXT: v_mov_b32_e32 v3, s11 4082; VI-NEXT: v_mov_b32_e32 v4, s0 4083; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4084; VI-NEXT: s_endpgm 4085; 4086; GFX9-LABEL: v16i32_arg: 4087; GFX9: ; 
%bb.0: ; %entry 4088; GFX9-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40 4089; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 4090; GFX9-NEXT: v_mov_b32_e32 v4, 0 4091; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4092; GFX9-NEXT: v_mov_b32_e32 v0, s24 4093; GFX9-NEXT: v_mov_b32_e32 v1, s25 4094; GFX9-NEXT: v_mov_b32_e32 v2, s26 4095; GFX9-NEXT: v_mov_b32_e32 v3, s27 4096; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 4097; GFX9-NEXT: s_nop 0 4098; GFX9-NEXT: v_mov_b32_e32 v0, s20 4099; GFX9-NEXT: v_mov_b32_e32 v1, s21 4100; GFX9-NEXT: v_mov_b32_e32 v2, s22 4101; GFX9-NEXT: v_mov_b32_e32 v3, s23 4102; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 4103; GFX9-NEXT: s_nop 0 4104; GFX9-NEXT: v_mov_b32_e32 v0, s16 4105; GFX9-NEXT: v_mov_b32_e32 v1, s17 4106; GFX9-NEXT: v_mov_b32_e32 v2, s18 4107; GFX9-NEXT: v_mov_b32_e32 v3, s19 4108; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 4109; GFX9-NEXT: s_nop 0 4110; GFX9-NEXT: v_mov_b32_e32 v0, s12 4111; GFX9-NEXT: v_mov_b32_e32 v1, s13 4112; GFX9-NEXT: v_mov_b32_e32 v2, s14 4113; GFX9-NEXT: v_mov_b32_e32 v3, s15 4114; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 4115; GFX9-NEXT: s_endpgm 4116; 4117; EG-LABEL: v16i32_arg: 4118; EG: ; %bb.0: ; %entry 4119; EG-NEXT: ALU 29, @6, KC0[CB0:0-32], KC1[] 4120; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0 4121; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0 4122; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0 4123; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 4124; EG-NEXT: CF_END 4125; EG-NEXT: ALU clause starting at 6: 4126; EG-NEXT: MOV * T0.W, KC0[7].X, 4127; EG-NEXT: MOV * T0.Z, KC0[6].W, 4128; EG-NEXT: MOV T0.Y, KC0[6].Z, 4129; EG-NEXT: MOV * T1.W, KC0[8].X, 4130; EG-NEXT: MOV T0.X, KC0[6].Y, 4131; EG-NEXT: MOV * T1.Z, KC0[7].W, 4132; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 4133; EG-NEXT: MOV * T1.Y, KC0[7].Z, 4134; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4135; EG-NEXT: MOV * T3.W, KC0[9].X, 4136; EG-NEXT: 
MOV T1.X, KC0[7].Y, 4137; EG-NEXT: MOV * T3.Z, KC0[8].W, 4138; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4139; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 4140; EG-NEXT: LSHR T4.X, PV.W, literal.x, 4141; EG-NEXT: MOV T3.Y, KC0[8].Z, 4142; EG-NEXT: MOV * T5.W, KC0[10].X, 4143; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4144; EG-NEXT: MOV T3.X, KC0[8].Y, 4145; EG-NEXT: MOV * T5.Z, KC0[9].W, 4146; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4147; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 4148; EG-NEXT: LSHR T6.X, PV.W, literal.x, 4149; EG-NEXT: MOV T5.Y, KC0[9].Z, 4150; EG-NEXT: MOV * T5.X, KC0[9].Y, 4151; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4152; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4153; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4154; EG-NEXT: LSHR * T7.X, PV.W, literal.x, 4155; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4156; 4157; CM-LABEL: v16i32_arg: 4158; CM: ; %bb.0: ; %entry 4159; CM-NEXT: ALU 28, @6, KC0[CB0:0-32], KC1[] 4160; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T7.X 4161; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T6.X 4162; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X 4163; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X 4164; CM-NEXT: CF_END 4165; CM-NEXT: ALU clause starting at 6: 4166; CM-NEXT: MOV * T0.W, KC0[10].X, 4167; CM-NEXT: MOV * T0.Z, KC0[9].W, 4168; CM-NEXT: MOV * T0.Y, KC0[9].Z, 4169; CM-NEXT: MOV T0.X, KC0[9].Y, 4170; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, 4171; CM-NEXT: MOV * T2.W, KC0[9].X, 4172; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4173; CM-NEXT: MOV T2.Z, KC0[8].W, 4174; CM-NEXT: MOV * T1.W, KC0[8].X, 4175; CM-NEXT: LSHR T3.X, T1.Z, literal.x, 4176; CM-NEXT: MOV T2.Y, KC0[8].Z, 4177; CM-NEXT: MOV * T1.Z, KC0[7].W, 4178; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4179; CM-NEXT: MOV T2.X, KC0[8].Y, 4180; CM-NEXT: MOV * T1.Y, KC0[7].Z, 4181; CM-NEXT: MOV T1.X, KC0[7].Y, 4182; CM-NEXT: ADD_INT T3.Z, KC0[2].Y, literal.x, 4183; CM-NEXT: MOV * T4.W, KC0[7].X, 4184; CM-NEXT: 32(4.484155e-44), 
0(0.000000e+00) 4185; CM-NEXT: LSHR T5.X, PV.Z, literal.x, 4186; CM-NEXT: MOV T4.Z, KC0[6].W, 4187; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, 4188; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) 4189; CM-NEXT: LSHR T6.X, PV.W, literal.x, 4190; CM-NEXT: MOV * T4.Y, KC0[6].Z, 4191; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4192; CM-NEXT: MOV * T4.X, KC0[6].Y, 4193; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 4194; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4195entry: 4196 store <16 x i32> %in, ptr addrspace(1) %out, align 4 4197 ret void 4198} 4199 4200define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x float> %in) nounwind { 4201; SI-LABEL: v16f32_arg: 4202; SI: ; %bb.0: ; %entry 4203; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 4204; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 4205; SI-NEXT: s_mov_b32 s3, 0xf000 4206; SI-NEXT: s_mov_b32 s2, -1 4207; SI-NEXT: s_waitcnt lgkmcnt(0) 4208; SI-NEXT: v_mov_b32_e32 v0, s20 4209; SI-NEXT: v_mov_b32_e32 v1, s21 4210; SI-NEXT: v_mov_b32_e32 v2, s22 4211; SI-NEXT: v_mov_b32_e32 v3, s23 4212; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 4213; SI-NEXT: s_waitcnt expcnt(0) 4214; SI-NEXT: v_mov_b32_e32 v0, s16 4215; SI-NEXT: v_mov_b32_e32 v1, s17 4216; SI-NEXT: v_mov_b32_e32 v2, s18 4217; SI-NEXT: v_mov_b32_e32 v3, s19 4218; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 4219; SI-NEXT: s_waitcnt expcnt(0) 4220; SI-NEXT: v_mov_b32_e32 v0, s12 4221; SI-NEXT: v_mov_b32_e32 v1, s13 4222; SI-NEXT: v_mov_b32_e32 v2, s14 4223; SI-NEXT: v_mov_b32_e32 v3, s15 4224; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 4225; SI-NEXT: s_waitcnt expcnt(0) 4226; SI-NEXT: v_mov_b32_e32 v0, s8 4227; SI-NEXT: v_mov_b32_e32 v1, s9 4228; SI-NEXT: v_mov_b32_e32 v2, s10 4229; SI-NEXT: v_mov_b32_e32 v3, s11 4230; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 4231; SI-NEXT: s_endpgm 4232; 4233; VI-LABEL: v16f32_arg: 4234; VI: ; %bb.0: ; %entry 4235; VI-NEXT: s_load_dwordx16 
s[8:23], s[4:5], 0x64 4236; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4237; VI-NEXT: s_waitcnt lgkmcnt(0) 4238; VI-NEXT: v_mov_b32_e32 v0, s20 4239; VI-NEXT: s_add_u32 s2, s0, 48 4240; VI-NEXT: s_addc_u32 s3, s1, 0 4241; VI-NEXT: v_mov_b32_e32 v5, s3 4242; VI-NEXT: v_mov_b32_e32 v4, s2 4243; VI-NEXT: s_add_u32 s2, s0, 32 4244; VI-NEXT: v_mov_b32_e32 v1, s21 4245; VI-NEXT: v_mov_b32_e32 v2, s22 4246; VI-NEXT: v_mov_b32_e32 v3, s23 4247; VI-NEXT: s_addc_u32 s3, s1, 0 4248; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4249; VI-NEXT: v_mov_b32_e32 v5, s3 4250; VI-NEXT: v_mov_b32_e32 v4, s2 4251; VI-NEXT: s_add_u32 s2, s0, 16 4252; VI-NEXT: v_mov_b32_e32 v0, s16 4253; VI-NEXT: v_mov_b32_e32 v1, s17 4254; VI-NEXT: v_mov_b32_e32 v2, s18 4255; VI-NEXT: v_mov_b32_e32 v3, s19 4256; VI-NEXT: s_addc_u32 s3, s1, 0 4257; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4258; VI-NEXT: v_mov_b32_e32 v5, s3 4259; VI-NEXT: v_mov_b32_e32 v0, s12 4260; VI-NEXT: v_mov_b32_e32 v1, s13 4261; VI-NEXT: v_mov_b32_e32 v2, s14 4262; VI-NEXT: v_mov_b32_e32 v3, s15 4263; VI-NEXT: v_mov_b32_e32 v4, s2 4264; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4265; VI-NEXT: v_mov_b32_e32 v5, s1 4266; VI-NEXT: v_mov_b32_e32 v0, s8 4267; VI-NEXT: v_mov_b32_e32 v1, s9 4268; VI-NEXT: v_mov_b32_e32 v2, s10 4269; VI-NEXT: v_mov_b32_e32 v3, s11 4270; VI-NEXT: v_mov_b32_e32 v4, s0 4271; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 4272; VI-NEXT: s_endpgm 4273; 4274; GFX9-LABEL: v16f32_arg: 4275; GFX9: ; %bb.0: ; %entry 4276; GFX9-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40 4277; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 4278; GFX9-NEXT: v_mov_b32_e32 v4, 0 4279; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4280; GFX9-NEXT: v_mov_b32_e32 v0, s24 4281; GFX9-NEXT: v_mov_b32_e32 v1, s25 4282; GFX9-NEXT: v_mov_b32_e32 v2, s26 4283; GFX9-NEXT: v_mov_b32_e32 v3, s27 4284; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 4285; GFX9-NEXT: s_nop 0 4286; GFX9-NEXT: v_mov_b32_e32 v0, s20 4287; GFX9-NEXT: v_mov_b32_e32 v1, s21 
4288; GFX9-NEXT: v_mov_b32_e32 v2, s22 4289; GFX9-NEXT: v_mov_b32_e32 v3, s23 4290; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 4291; GFX9-NEXT: s_nop 0 4292; GFX9-NEXT: v_mov_b32_e32 v0, s16 4293; GFX9-NEXT: v_mov_b32_e32 v1, s17 4294; GFX9-NEXT: v_mov_b32_e32 v2, s18 4295; GFX9-NEXT: v_mov_b32_e32 v3, s19 4296; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 4297; GFX9-NEXT: s_nop 0 4298; GFX9-NEXT: v_mov_b32_e32 v0, s12 4299; GFX9-NEXT: v_mov_b32_e32 v1, s13 4300; GFX9-NEXT: v_mov_b32_e32 v2, s14 4301; GFX9-NEXT: v_mov_b32_e32 v3, s15 4302; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 4303; GFX9-NEXT: s_endpgm 4304; 4305; EG-LABEL: v16f32_arg: 4306; EG: ; %bb.0: ; %entry 4307; EG-NEXT: ALU 29, @6, KC0[CB0:0-32], KC1[] 4308; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0 4309; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0 4310; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0 4311; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 4312; EG-NEXT: CF_END 4313; EG-NEXT: ALU clause starting at 6: 4314; EG-NEXT: MOV * T0.W, KC0[7].X, 4315; EG-NEXT: MOV * T0.Z, KC0[6].W, 4316; EG-NEXT: MOV T0.Y, KC0[6].Z, 4317; EG-NEXT: MOV * T1.W, KC0[8].X, 4318; EG-NEXT: MOV T0.X, KC0[6].Y, 4319; EG-NEXT: MOV * T1.Z, KC0[7].W, 4320; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 4321; EG-NEXT: MOV * T1.Y, KC0[7].Z, 4322; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4323; EG-NEXT: MOV * T3.W, KC0[9].X, 4324; EG-NEXT: MOV T1.X, KC0[7].Y, 4325; EG-NEXT: MOV * T3.Z, KC0[8].W, 4326; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4327; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 4328; EG-NEXT: LSHR T4.X, PV.W, literal.x, 4329; EG-NEXT: MOV T3.Y, KC0[8].Z, 4330; EG-NEXT: MOV * T5.W, KC0[10].X, 4331; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4332; EG-NEXT: MOV T3.X, KC0[8].Y, 4333; EG-NEXT: MOV * T5.Z, KC0[9].W, 4334; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4335; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 4336; EG-NEXT: LSHR T6.X, PV.W, 
literal.x, 4337; EG-NEXT: MOV T5.Y, KC0[9].Z, 4338; EG-NEXT: MOV * T5.X, KC0[9].Y, 4339; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4340; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4341; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4342; EG-NEXT: LSHR * T7.X, PV.W, literal.x, 4343; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4344; 4345; CM-LABEL: v16f32_arg: 4346; CM: ; %bb.0: ; %entry 4347; CM-NEXT: ALU 28, @6, KC0[CB0:0-32], KC1[] 4348; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T7.X 4349; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T6.X 4350; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T5.X 4351; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X 4352; CM-NEXT: CF_END 4353; CM-NEXT: ALU clause starting at 6: 4354; CM-NEXT: MOV * T0.W, KC0[10].X, 4355; CM-NEXT: MOV * T0.Z, KC0[9].W, 4356; CM-NEXT: MOV * T0.Y, KC0[9].Z, 4357; CM-NEXT: MOV T0.X, KC0[9].Y, 4358; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x, 4359; CM-NEXT: MOV * T2.W, KC0[9].X, 4360; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00) 4361; CM-NEXT: MOV T2.Z, KC0[8].W, 4362; CM-NEXT: MOV * T1.W, KC0[8].X, 4363; CM-NEXT: LSHR T3.X, T1.Z, literal.x, 4364; CM-NEXT: MOV T2.Y, KC0[8].Z, 4365; CM-NEXT: MOV * T1.Z, KC0[7].W, 4366; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4367; CM-NEXT: MOV T2.X, KC0[8].Y, 4368; CM-NEXT: MOV * T1.Y, KC0[7].Z, 4369; CM-NEXT: MOV T1.X, KC0[7].Y, 4370; CM-NEXT: ADD_INT T3.Z, KC0[2].Y, literal.x, 4371; CM-NEXT: MOV * T4.W, KC0[7].X, 4372; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) 4373; CM-NEXT: LSHR T5.X, PV.Z, literal.x, 4374; CM-NEXT: MOV T4.Z, KC0[6].W, 4375; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y, 4376; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44) 4377; CM-NEXT: LSHR T6.X, PV.W, literal.x, 4378; CM-NEXT: MOV * T4.Y, KC0[6].Z, 4379; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4380; CM-NEXT: MOV * T4.X, KC0[6].Y, 4381; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 4382; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4383entry: 4384 store <16 x float> %in, ptr addrspace(1) %out, align 4 4385 ret 
void 4386} 4387 4388define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwind { 4389; SI-LABEL: kernel_arg_i64: 4390; SI: ; %bb.0: 4391; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4392; SI-NEXT: s_mov_b32 s7, 0xf000 4393; SI-NEXT: s_mov_b32 s6, -1 4394; SI-NEXT: s_waitcnt lgkmcnt(0) 4395; SI-NEXT: s_mov_b32 s4, s0 4396; SI-NEXT: s_mov_b32 s5, s1 4397; SI-NEXT: v_mov_b32_e32 v0, s2 4398; SI-NEXT: v_mov_b32_e32 v1, s3 4399; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4400; SI-NEXT: s_endpgm 4401; 4402; VI-LABEL: kernel_arg_i64: 4403; VI: ; %bb.0: 4404; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4405; VI-NEXT: s_waitcnt lgkmcnt(0) 4406; VI-NEXT: v_mov_b32_e32 v0, s0 4407; VI-NEXT: v_mov_b32_e32 v1, s1 4408; VI-NEXT: v_mov_b32_e32 v2, s2 4409; VI-NEXT: v_mov_b32_e32 v3, s3 4410; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 4411; VI-NEXT: s_endpgm 4412; 4413; GFX9-LABEL: kernel_arg_i64: 4414; GFX9: ; %bb.0: 4415; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 4416; GFX9-NEXT: v_mov_b32_e32 v2, 0 4417; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4418; GFX9-NEXT: v_mov_b32_e32 v0, s2 4419; GFX9-NEXT: v_mov_b32_e32 v1, s3 4420; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 4421; GFX9-NEXT: s_endpgm 4422; 4423; EG-LABEL: kernel_arg_i64: 4424; EG: ; %bb.0: 4425; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 4426; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 4427; EG-NEXT: CF_END 4428; EG-NEXT: PAD 4429; EG-NEXT: ALU clause starting at 4: 4430; EG-NEXT: MOV * T0.Y, KC0[3].X, 4431; EG-NEXT: MOV T0.X, KC0[2].W, 4432; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4433; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4434; 4435; CM-LABEL: kernel_arg_i64: 4436; CM: ; %bb.0: 4437; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 4438; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 4439; CM-NEXT: CF_END 4440; CM-NEXT: PAD 4441; CM-NEXT: ALU clause starting at 4: 4442; CM-NEXT: MOV * T0.Y, KC0[3].X, 4443; CM-NEXT: MOV * T0.X, KC0[2].W, 4444; CM-NEXT: LSHR * T1.X, 
KC0[2].Y, literal.x, 4445; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4446 store i64 %a, ptr addrspace(1) %out, align 8 4447 ret void 4448} 4449 4450define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { 4451; SI-LABEL: f64_kernel_arg: 4452; SI: ; %bb.0: ; %entry 4453; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4454; SI-NEXT: s_mov_b32 s7, 0xf000 4455; SI-NEXT: s_mov_b32 s6, -1 4456; SI-NEXT: s_waitcnt lgkmcnt(0) 4457; SI-NEXT: s_mov_b32 s4, s0 4458; SI-NEXT: s_mov_b32 s5, s1 4459; SI-NEXT: v_mov_b32_e32 v0, s2 4460; SI-NEXT: v_mov_b32_e32 v1, s3 4461; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4462; SI-NEXT: s_endpgm 4463; 4464; VI-LABEL: f64_kernel_arg: 4465; VI: ; %bb.0: ; %entry 4466; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4467; VI-NEXT: s_waitcnt lgkmcnt(0) 4468; VI-NEXT: v_mov_b32_e32 v0, s0 4469; VI-NEXT: v_mov_b32_e32 v1, s1 4470; VI-NEXT: v_mov_b32_e32 v2, s2 4471; VI-NEXT: v_mov_b32_e32 v3, s3 4472; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 4473; VI-NEXT: s_endpgm 4474; 4475; GFX9-LABEL: f64_kernel_arg: 4476; GFX9: ; %bb.0: ; %entry 4477; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 4478; GFX9-NEXT: v_mov_b32_e32 v2, 0 4479; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4480; GFX9-NEXT: v_mov_b32_e32 v0, s2 4481; GFX9-NEXT: v_mov_b32_e32 v1, s3 4482; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 4483; GFX9-NEXT: s_endpgm 4484; 4485; EG-LABEL: f64_kernel_arg: 4486; EG: ; %bb.0: ; %entry 4487; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 4488; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 4489; EG-NEXT: CF_END 4490; EG-NEXT: PAD 4491; EG-NEXT: ALU clause starting at 4: 4492; EG-NEXT: MOV * T0.Y, KC0[3].X, 4493; EG-NEXT: MOV T0.X, KC0[2].W, 4494; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4495; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4496; 4497; CM-LABEL: f64_kernel_arg: 4498; CM: ; %bb.0: ; %entry 4499; CM-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 4500; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 4501; CM-NEXT: CF_END 
4502; CM-NEXT: PAD 4503; CM-NEXT: ALU clause starting at 4: 4504; CM-NEXT: MOV * T0.Y, KC0[3].X, 4505; CM-NEXT: MOV * T0.X, KC0[2].W, 4506; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4507; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4508entry: 4509 store double %in, ptr addrspace(1) %out 4510 ret void 4511} 4512 4513; XFUNC-LABEL: {{^}}kernel_arg_v1i64: 4514; XGCN: s_load_dwordx2 4515; XGCN: s_load_dwordx2 4516; XGCN: buffer_store_dwordx2 4517; define amdgpu_kernel void @kernel_arg_v1i64(ptr addrspace(1) %out, <1 x i64> %a) nounwind { 4518; store <1 x i64> %a, ptr addrspace(1) %out, align 8 4519; ret void 4520; } 4521 4522define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nounwind { 4523; SI-LABEL: i65_arg: 4524; SI: ; %bb.0: ; %entry 4525; SI-NEXT: s_load_dword s6, s[4:5], 0xd 4526; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 4527; SI-NEXT: s_mov_b32 s7, 0xf000 4528; SI-NEXT: s_waitcnt lgkmcnt(0) 4529; SI-NEXT: s_and_b32 s8, s6, 1 4530; SI-NEXT: s_mov_b32 s6, -1 4531; SI-NEXT: s_mov_b32 s4, s0 4532; SI-NEXT: s_mov_b32 s5, s1 4533; SI-NEXT: v_mov_b32_e32 v0, s2 4534; SI-NEXT: v_mov_b32_e32 v1, s3 4535; SI-NEXT: v_mov_b32_e32 v2, s8 4536; SI-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:8 4537; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4538; SI-NEXT: s_endpgm 4539; 4540; VI-LABEL: i65_arg: 4541; VI: ; %bb.0: ; %entry 4542; VI-NEXT: s_load_dword s6, s[4:5], 0x34 4543; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4544; VI-NEXT: s_waitcnt lgkmcnt(0) 4545; VI-NEXT: s_and_b32 s4, s6, 1 4546; VI-NEXT: v_mov_b32_e32 v0, s0 4547; VI-NEXT: v_mov_b32_e32 v1, s1 4548; VI-NEXT: s_add_u32 s0, s0, 8 4549; VI-NEXT: s_addc_u32 s1, s1, 0 4550; VI-NEXT: v_mov_b32_e32 v5, s1 4551; VI-NEXT: v_mov_b32_e32 v2, s2 4552; VI-NEXT: v_mov_b32_e32 v6, s4 4553; VI-NEXT: v_mov_b32_e32 v4, s0 4554; VI-NEXT: v_mov_b32_e32 v3, s3 4555; VI-NEXT: flat_store_byte v[4:5], v6 4556; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 4557; VI-NEXT: s_endpgm 4558; 4559; 
GFX9-LABEL: i65_arg: 4560; GFX9: ; %bb.0: ; %entry 4561; GFX9-NEXT: s_load_dword s4, s[8:9], 0x10 4562; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 4563; GFX9-NEXT: v_mov_b32_e32 v2, 0 4564; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4565; GFX9-NEXT: s_and_b32 s4, s4, 1 4566; GFX9-NEXT: v_mov_b32_e32 v0, s2 4567; GFX9-NEXT: v_mov_b32_e32 v3, s4 4568; GFX9-NEXT: v_mov_b32_e32 v1, s3 4569; GFX9-NEXT: global_store_byte v2, v3, s[0:1] offset:8 4570; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 4571; GFX9-NEXT: s_endpgm 4572; 4573; EG-LABEL: i65_arg: 4574; EG: ; %bb.0: ; %entry 4575; EG-NEXT: ALU 20, @6, KC0[CB0:0-32], KC1[] 4576; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0 4577; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 4578; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X 4579; EG-NEXT: CF_END 4580; EG-NEXT: PAD 4581; EG-NEXT: ALU clause starting at 6: 4582; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 4583; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 4584; EG-NEXT: AND_INT * T1.W, PV.W, literal.x, 4585; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4586; EG-NEXT: LSHL T1.W, PV.W, literal.x, 4587; EG-NEXT: AND_INT * T2.W, KC0[3].Y, 1, 4588; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4589; EG-NEXT: LSHL T1.X, PS, PV.W, 4590; EG-NEXT: LSHL * T1.W, literal.x, PV.W, 4591; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 4592; EG-NEXT: MOV T1.Y, 0.0, 4593; EG-NEXT: MOV * T1.Z, 0.0, 4594; EG-NEXT: LSHR T0.X, T0.W, literal.x, 4595; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, 4596; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45) 4597; EG-NEXT: LSHR T2.X, PV.W, literal.x, 4598; EG-NEXT: MOV * T3.X, KC0[3].X, 4599; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4600; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x, 4601; EG-NEXT: MOV * T5.X, KC0[2].W, 4602; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4603; 4604; CM-LABEL: i65_arg: 4605; CM: ; %bb.0: ; %entry 4606; CM-NEXT: ALU 21, @6, KC0[CB0:0-32], KC1[] 4607; CM-NEXT: MEM_RAT MSKOR T1.XW, T5.X 4608; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X 4609; 
CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X 4610; CM-NEXT: CF_END 4611; CM-NEXT: PAD 4612; CM-NEXT: ALU clause starting at 6: 4613; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 4614; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) 4615; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, 4616; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4617; CM-NEXT: LSHL T0.Z, PV.W, literal.x, 4618; CM-NEXT: AND_INT * T1.W, KC0[3].Y, 1, 4619; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4620; CM-NEXT: LSHL T1.X, PV.W, PV.Z, 4621; CM-NEXT: LSHL * T1.W, literal.x, PV.Z, 4622; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 4623; CM-NEXT: MOV T1.Y, 0.0, 4624; CM-NEXT: MOV * T1.Z, 0.0, 4625; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 4626; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4627; CM-NEXT: MOV T2.X, KC0[2].W, 4628; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 4629; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) 4630; CM-NEXT: LSHR * T3.X, PV.W, literal.x, 4631; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4632; CM-NEXT: MOV * T4.X, KC0[3].X, 4633; CM-NEXT: LSHR * T5.X, T0.W, literal.x, 4634; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4635entry: 4636 store i65 %in, ptr addrspace(1) %out, align 4 4637 ret void 4638} 4639 4640define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { 4641; SI-LABEL: i1_arg: 4642; SI: ; %bb.0: 4643; SI-NEXT: s_load_dword s2, s[4:5], 0xb 4644; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 4645; SI-NEXT: s_mov_b32 s3, 0xf000 4646; SI-NEXT: s_waitcnt lgkmcnt(0) 4647; SI-NEXT: s_and_b32 s4, s2, 1 4648; SI-NEXT: s_mov_b32 s2, -1 4649; SI-NEXT: v_mov_b32_e32 v0, s4 4650; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 4651; SI-NEXT: s_endpgm 4652; 4653; VI-LABEL: i1_arg: 4654; VI: ; %bb.0: 4655; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 4656; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4657; VI-NEXT: s_waitcnt lgkmcnt(0) 4658; VI-NEXT: s_and_b32 s2, s2, 1 4659; VI-NEXT: v_mov_b32_e32 v0, s0 4660; VI-NEXT: v_mov_b32_e32 v1, s1 4661; VI-NEXT: v_mov_b32_e32 v2, s2 4662; 
VI-NEXT: flat_store_byte v[0:1], v2 4663; VI-NEXT: s_endpgm 4664; 4665; GFX9-LABEL: i1_arg: 4666; GFX9: ; %bb.0: 4667; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 4668; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 4669; GFX9-NEXT: v_mov_b32_e32 v0, 0 4670; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4671; GFX9-NEXT: s_and_b32 s2, s2, 1 4672; GFX9-NEXT: v_mov_b32_e32 v1, s2 4673; GFX9-NEXT: global_store_byte v0, v1, s[0:1] 4674; GFX9-NEXT: s_endpgm 4675; 4676; EG-LABEL: i1_arg: 4677; EG: ; %bb.0: 4678; EG-NEXT: ALU 0, @8, KC0[], KC1[] 4679; EG-NEXT: TEX 0 @6 4680; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] 4681; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 4682; EG-NEXT: CF_END 4683; EG-NEXT: PAD 4684; EG-NEXT: Fetch clause starting at 6: 4685; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4686; EG-NEXT: ALU clause starting at 8: 4687; EG-NEXT: MOV * T0.X, 0.0, 4688; EG-NEXT: ALU clause starting at 9: 4689; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, 4690; EG-NEXT: AND_INT * T1.W, T0.X, 1, 4691; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4692; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 4693; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4694; EG-NEXT: LSHL T0.X, T1.W, PV.W, 4695; EG-NEXT: LSHL * T0.W, literal.x, PV.W, 4696; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 4697; EG-NEXT: MOV T0.Y, 0.0, 4698; EG-NEXT: MOV * T0.Z, 0.0, 4699; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4700; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4701; 4702; CM-LABEL: i1_arg: 4703; CM: ; %bb.0: 4704; CM-NEXT: ALU 0, @8, KC0[], KC1[] 4705; CM-NEXT: TEX 0 @6 4706; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] 4707; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X 4708; CM-NEXT: CF_END 4709; CM-NEXT: PAD 4710; CM-NEXT: Fetch clause starting at 6: 4711; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4712; CM-NEXT: ALU clause starting at 8: 4713; CM-NEXT: MOV * T0.X, 0.0, 4714; CM-NEXT: ALU clause starting at 9: 4715; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x, 4716; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4717; CM-NEXT: AND_INT T0.Z, T0.X, 1, 4718; 
CM-NEXT: LSHL * T0.W, PV.W, literal.x, 4719; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) 4720; CM-NEXT: LSHL T0.X, PV.Z, PV.W, 4721; CM-NEXT: LSHL * T0.W, literal.x, PV.W, 4722; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) 4723; CM-NEXT: MOV T0.Y, 0.0, 4724; CM-NEXT: MOV * T0.Z, 0.0, 4725; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4726; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4727 store i1 %x, ptr addrspace(1) %out, align 1 4728 ret void 4729} 4730 4731define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwind { 4732; SI-LABEL: i1_arg_zext_i32: 4733; SI: ; %bb.0: 4734; SI-NEXT: s_load_dword s2, s[4:5], 0xb 4735; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 4736; SI-NEXT: s_mov_b32 s3, 0xf000 4737; SI-NEXT: s_waitcnt lgkmcnt(0) 4738; SI-NEXT: s_and_b32 s4, s2, 1 4739; SI-NEXT: s_mov_b32 s2, -1 4740; SI-NEXT: v_mov_b32_e32 v0, s4 4741; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 4742; SI-NEXT: s_endpgm 4743; 4744; VI-LABEL: i1_arg_zext_i32: 4745; VI: ; %bb.0: 4746; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 4747; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4748; VI-NEXT: s_waitcnt lgkmcnt(0) 4749; VI-NEXT: s_and_b32 s2, s2, 1 4750; VI-NEXT: v_mov_b32_e32 v0, s0 4751; VI-NEXT: v_mov_b32_e32 v1, s1 4752; VI-NEXT: v_mov_b32_e32 v2, s2 4753; VI-NEXT: flat_store_dword v[0:1], v2 4754; VI-NEXT: s_endpgm 4755; 4756; GFX9-LABEL: i1_arg_zext_i32: 4757; GFX9: ; %bb.0: 4758; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 4759; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 4760; GFX9-NEXT: v_mov_b32_e32 v0, 0 4761; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4762; GFX9-NEXT: s_and_b32 s2, s2, 1 4763; GFX9-NEXT: v_mov_b32_e32 v1, s2 4764; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 4765; GFX9-NEXT: s_endpgm 4766; 4767; EG-LABEL: i1_arg_zext_i32: 4768; EG: ; %bb.0: 4769; EG-NEXT: ALU 0, @8, KC0[], KC1[] 4770; EG-NEXT: TEX 0 @6 4771; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 4772; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 4773; EG-NEXT: CF_END 4774; EG-NEXT: PAD 4775; 
EG-NEXT: Fetch clause starting at 6: 4776; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4777; EG-NEXT: ALU clause starting at 8: 4778; EG-NEXT: MOV * T0.X, 0.0, 4779; EG-NEXT: ALU clause starting at 9: 4780; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4781; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4782; 4783; CM-LABEL: i1_arg_zext_i32: 4784; CM: ; %bb.0: 4785; CM-NEXT: ALU 0, @8, KC0[], KC1[] 4786; CM-NEXT: TEX 0 @6 4787; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[] 4788; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 4789; CM-NEXT: CF_END 4790; CM-NEXT: PAD 4791; CM-NEXT: Fetch clause starting at 6: 4792; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4793; CM-NEXT: ALU clause starting at 8: 4794; CM-NEXT: MOV * T0.X, 0.0, 4795; CM-NEXT: ALU clause starting at 9: 4796; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4797; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4798 %ext = zext i1 %x to i32 4799 store i32 %ext, ptr addrspace(1) %out, align 4 4800 ret void 4801} 4802 4803define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwind { 4804; SI-LABEL: i1_arg_zext_i64: 4805; SI: ; %bb.0: 4806; SI-NEXT: s_load_dword s6, s[4:5], 0xb 4807; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 4808; SI-NEXT: s_mov_b32 s3, 0xf000 4809; SI-NEXT: s_mov_b32 s2, -1 4810; SI-NEXT: s_waitcnt lgkmcnt(0) 4811; SI-NEXT: s_and_b32 s4, s6, 1 4812; SI-NEXT: v_mov_b32_e32 v1, 0 4813; SI-NEXT: v_mov_b32_e32 v0, s4 4814; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4815; SI-NEXT: s_endpgm 4816; 4817; VI-LABEL: i1_arg_zext_i64: 4818; VI: ; %bb.0: 4819; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 4820; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4821; VI-NEXT: v_mov_b32_e32 v1, 0 4822; VI-NEXT: s_waitcnt lgkmcnt(0) 4823; VI-NEXT: s_and_b32 s2, s2, 1 4824; VI-NEXT: v_mov_b32_e32 v3, s1 4825; VI-NEXT: v_mov_b32_e32 v0, s2 4826; VI-NEXT: v_mov_b32_e32 v2, s0 4827; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 4828; VI-NEXT: s_endpgm 4829; 4830; GFX9-LABEL: i1_arg_zext_i64: 4831; GFX9: ; %bb.0: 4832; 
GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 4833; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 4834; GFX9-NEXT: v_mov_b32_e32 v1, 0 4835; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4836; GFX9-NEXT: s_and_b32 s2, s2, 1 4837; GFX9-NEXT: v_mov_b32_e32 v0, s2 4838; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 4839; GFX9-NEXT: s_endpgm 4840; 4841; EG-LABEL: i1_arg_zext_i64: 4842; EG: ; %bb.0: 4843; EG-NEXT: ALU 0, @8, KC0[], KC1[] 4844; EG-NEXT: TEX 0 @6 4845; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 4846; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 4847; EG-NEXT: CF_END 4848; EG-NEXT: PAD 4849; EG-NEXT: Fetch clause starting at 6: 4850; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4851; EG-NEXT: ALU clause starting at 8: 4852; EG-NEXT: MOV * T0.X, 0.0, 4853; EG-NEXT: ALU clause starting at 9: 4854; EG-NEXT: MOV * T0.Y, 0.0, 4855; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4856; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4857; 4858; CM-LABEL: i1_arg_zext_i64: 4859; CM: ; %bb.0: 4860; CM-NEXT: ALU 0, @8, KC0[], KC1[] 4861; CM-NEXT: TEX 0 @6 4862; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 4863; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 4864; CM-NEXT: CF_END 4865; CM-NEXT: PAD 4866; CM-NEXT: Fetch clause starting at 6: 4867; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4868; CM-NEXT: ALU clause starting at 8: 4869; CM-NEXT: MOV * T0.X, 0.0, 4870; CM-NEXT: ALU clause starting at 9: 4871; CM-NEXT: MOV * T0.Y, 0.0, 4872; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4873; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4874 %ext = zext i1 %x to i64 4875 store i64 %ext, ptr addrspace(1) %out, align 8 4876 ret void 4877} 4878 4879define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwind { 4880; SI-LABEL: i1_arg_sext_i32: 4881; SI: ; %bb.0: 4882; SI-NEXT: s_load_dword s2, s[4:5], 0xb 4883; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 4884; SI-NEXT: s_mov_b32 s3, 0xf000 4885; SI-NEXT: s_waitcnt lgkmcnt(0) 4886; SI-NEXT: s_bfe_i32 s4, s2, 0x10000 4887; SI-NEXT: s_mov_b32 s2, 
-1 4888; SI-NEXT: v_mov_b32_e32 v0, s4 4889; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 4890; SI-NEXT: s_endpgm 4891; 4892; VI-LABEL: i1_arg_sext_i32: 4893; VI: ; %bb.0: 4894; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 4895; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 4896; VI-NEXT: s_waitcnt lgkmcnt(0) 4897; VI-NEXT: s_bfe_i32 s2, s2, 0x10000 4898; VI-NEXT: v_mov_b32_e32 v0, s0 4899; VI-NEXT: v_mov_b32_e32 v1, s1 4900; VI-NEXT: v_mov_b32_e32 v2, s2 4901; VI-NEXT: flat_store_dword v[0:1], v2 4902; VI-NEXT: s_endpgm 4903; 4904; GFX9-LABEL: i1_arg_sext_i32: 4905; GFX9: ; %bb.0: 4906; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 4907; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 4908; GFX9-NEXT: v_mov_b32_e32 v0, 0 4909; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4910; GFX9-NEXT: s_bfe_i32 s2, s2, 0x10000 4911; GFX9-NEXT: v_mov_b32_e32 v1, s2 4912; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 4913; GFX9-NEXT: s_endpgm 4914; 4915; EG-LABEL: i1_arg_sext_i32: 4916; EG: ; %bb.0: 4917; EG-NEXT: ALU 0, @8, KC0[], KC1[] 4918; EG-NEXT: TEX 0 @6 4919; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 4920; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 4921; EG-NEXT: CF_END 4922; EG-NEXT: PAD 4923; EG-NEXT: Fetch clause starting at 6: 4924; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4925; EG-NEXT: ALU clause starting at 8: 4926; EG-NEXT: MOV * T0.X, 0.0, 4927; EG-NEXT: ALU clause starting at 9: 4928; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1, 4929; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4930; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4931; 4932; CM-LABEL: i1_arg_sext_i32: 4933; CM: ; %bb.0: 4934; CM-NEXT: ALU 0, @8, KC0[], KC1[] 4935; CM-NEXT: TEX 0 @6 4936; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 4937; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 4938; CM-NEXT: CF_END 4939; CM-NEXT: PAD 4940; CM-NEXT: Fetch clause starting at 6: 4941; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 4942; CM-NEXT: ALU clause starting at 8: 4943; CM-NEXT: MOV * T0.X, 0.0, 4944; CM-NEXT: ALU clause starting at 9: 4945; 
CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, 1, 4946; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 4947; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 4948 %ext = sext i1 %x to i32 4949 store i32 %ext, ptr addrspace(1) %out, align 4 4950 ret void 4951} 4952 4953define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwind { 4954; SI-LABEL: i1_arg_sext_i64: 4955; SI: ; %bb.0: 4956; SI-NEXT: s_load_dword s2, s[4:5], 0xb 4957; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 4958; SI-NEXT: s_mov_b32 s3, 0xf000 4959; SI-NEXT: s_waitcnt lgkmcnt(0) 4960; SI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 4961; SI-NEXT: s_mov_b32 s2, -1 4962; SI-NEXT: v_mov_b32_e32 v0, s4 4963; SI-NEXT: v_mov_b32_e32 v1, s5 4964; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4965; SI-NEXT: s_endpgm 4966; 4967; VI-LABEL: i1_arg_sext_i64: 4968; VI: ; %bb.0: 4969; VI-NEXT: s_load_dword s0, s[4:5], 0x2c 4970; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 4971; VI-NEXT: s_waitcnt lgkmcnt(0) 4972; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 4973; VI-NEXT: v_mov_b32_e32 v0, s2 4974; VI-NEXT: v_mov_b32_e32 v3, s1 4975; VI-NEXT: v_mov_b32_e32 v1, s3 4976; VI-NEXT: v_mov_b32_e32 v2, s0 4977; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 4978; VI-NEXT: s_endpgm 4979; 4980; GFX9-LABEL: i1_arg_sext_i64: 4981; GFX9: ; %bb.0: 4982; GFX9-NEXT: s_load_dword s0, s[8:9], 0x8 4983; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 4984; GFX9-NEXT: v_mov_b32_e32 v2, 0 4985; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4986; GFX9-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 4987; GFX9-NEXT: v_mov_b32_e32 v0, s0 4988; GFX9-NEXT: v_mov_b32_e32 v1, s1 4989; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 4990; GFX9-NEXT: s_endpgm 4991; 4992; EG-LABEL: i1_arg_sext_i64: 4993; EG: ; %bb.0: 4994; EG-NEXT: ALU 0, @8, KC0[], KC1[] 4995; EG-NEXT: TEX 0 @6 4996; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 4997; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 4998; EG-NEXT: CF_END 4999; EG-NEXT: PAD 5000; EG-NEXT: Fetch clause starting at 6: 
5001; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5002; EG-NEXT: ALU clause starting at 8: 5003; EG-NEXT: MOV * T0.X, 0.0, 5004; EG-NEXT: ALU clause starting at 9: 5005; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, 1, 5006; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 5007; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5008; EG-NEXT: MOV * T0.Y, PV.X, 5009; 5010; CM-LABEL: i1_arg_sext_i64: 5011; CM: ; %bb.0: 5012; CM-NEXT: ALU 0, @8, KC0[], KC1[] 5013; CM-NEXT: TEX 0 @6 5014; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 5015; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X 5016; CM-NEXT: CF_END 5017; CM-NEXT: PAD 5018; CM-NEXT: Fetch clause starting at 6: 5019; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 5020; CM-NEXT: ALU clause starting at 8: 5021; CM-NEXT: MOV * T0.X, 0.0, 5022; CM-NEXT: ALU clause starting at 9: 5023; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, 1, 5024; CM-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 5025; CM-NEXT: MOV * T0.Y, PV.X, 5026; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 5027 %ext = sext i1 %x to i64 5028 store i64 %ext, ptr addrspace(1) %out, align 8 5029 ret void 5030} 5031 5032define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { 5033; SI-LABEL: empty_struct_arg: 5034; SI: ; %bb.0: 5035; SI-NEXT: s_endpgm 5036; 5037; VI-LABEL: empty_struct_arg: 5038; VI: ; %bb.0: 5039; VI-NEXT: s_endpgm 5040; 5041; GFX9-LABEL: empty_struct_arg: 5042; GFX9: ; %bb.0: 5043; GFX9-NEXT: s_endpgm 5044; 5045; EGCM-LABEL: empty_struct_arg: 5046; EGCM: ; %bb.0: 5047; EGCM-NEXT: CF_END 5048; EGCM-NEXT: PAD 5049 ret void 5050} 5051 5052; The correct load offsets for these: 5053; load 4 from 0, 5054; load 8 from 8 5055; load 4 from 24 5056; load 8 from 32 5057 5058; With the SelectionDAG argument lowering, the alignments for the 5059; struct members are not properly considered, making these wrong. 
5060 5061; FIXME: Total argument size is computed wrong 5062define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) { 5063; SI-LABEL: struct_argument_alignment: 5064; SI: ; %bb.0: 5065; SI-NEXT: s_load_dword s8, s[4:5], 0x9 5066; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xb 5067; SI-NEXT: s_load_dword s9, s[4:5], 0xf 5068; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 5069; SI-NEXT: s_mov_b32 s0, 0 5070; SI-NEXT: s_mov_b32 s3, 0xf000 5071; SI-NEXT: s_mov_b32 s2, -1 5072; SI-NEXT: s_mov_b32 s1, s0 5073; SI-NEXT: s_waitcnt lgkmcnt(0) 5074; SI-NEXT: v_mov_b32_e32 v0, s8 5075; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 5076; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5077; SI-NEXT: v_mov_b32_e32 v0, s6 5078; SI-NEXT: v_mov_b32_e32 v1, s7 5079; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5080; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5081; SI-NEXT: v_mov_b32_e32 v0, s9 5082; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 5083; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 5084; SI-NEXT: v_mov_b32_e32 v0, s4 5085; SI-NEXT: v_mov_b32_e32 v1, s5 5086; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5087; SI-NEXT: s_waitcnt vmcnt(0) 5088; SI-NEXT: s_endpgm 5089; 5090; VI-LABEL: struct_argument_alignment: 5091; VI: ; %bb.0: 5092; VI-NEXT: s_load_dword s6, s[4:5], 0x24 5093; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c 5094; VI-NEXT: s_load_dword s7, s[4:5], 0x3c 5095; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x44 5096; VI-NEXT: v_mov_b32_e32 v0, 0 5097; VI-NEXT: v_mov_b32_e32 v1, 0 5098; VI-NEXT: s_waitcnt lgkmcnt(0) 5099; VI-NEXT: v_mov_b32_e32 v2, s6 5100; VI-NEXT: flat_store_dword v[0:1], v2 5101; VI-NEXT: s_waitcnt vmcnt(0) 5102; VI-NEXT: v_mov_b32_e32 v3, s1 5103; VI-NEXT: v_mov_b32_e32 v2, s0 5104; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 5105; VI-NEXT: s_waitcnt vmcnt(0) 5106; VI-NEXT: v_mov_b32_e32 v2, s7 5107; VI-NEXT: flat_store_dword v[0:1], v2 5108; VI-NEXT: s_waitcnt vmcnt(0) 5109; VI-NEXT: v_mov_b32_e32 v2, s2 5110; 
VI-NEXT: v_mov_b32_e32 v3, s3 5111; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 5112; VI-NEXT: s_waitcnt vmcnt(0) 5113; VI-NEXT: s_endpgm 5114; 5115; GFX9-LABEL: struct_argument_alignment: 5116; GFX9: ; %bb.0: 5117; GFX9-NEXT: s_load_dword s4, s[8:9], 0x0 5118; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 5119; GFX9-NEXT: s_load_dword s5, s[8:9], 0x18 5120; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x20 5121; GFX9-NEXT: v_mov_b32_e32 v0, 0 5122; GFX9-NEXT: v_mov_b32_e32 v1, 0 5123; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5124; GFX9-NEXT: v_mov_b32_e32 v2, s4 5125; GFX9-NEXT: global_store_dword v[0:1], v2, off 5126; GFX9-NEXT: s_waitcnt vmcnt(0) 5127; GFX9-NEXT: v_mov_b32_e32 v3, s1 5128; GFX9-NEXT: v_mov_b32_e32 v2, s0 5129; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off 5130; GFX9-NEXT: s_waitcnt vmcnt(0) 5131; GFX9-NEXT: v_mov_b32_e32 v2, s5 5132; GFX9-NEXT: global_store_dword v[0:1], v2, off 5133; GFX9-NEXT: s_waitcnt vmcnt(0) 5134; GFX9-NEXT: v_mov_b32_e32 v2, s2 5135; GFX9-NEXT: v_mov_b32_e32 v3, s3 5136; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off 5137; GFX9-NEXT: s_waitcnt vmcnt(0) 5138; GFX9-NEXT: s_endpgm 5139; 5140; EG-LABEL: struct_argument_alignment: 5141; EG: ; %bb.0: 5142; EG-NEXT: ALU 9, @8, KC0[CB0:0-32], KC1[] 5143; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.X, T6.X, 0 5144; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0 5145; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T6.X, 0 5146; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T6.X, 0 5147; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 0 5148; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1 5149; EG-NEXT: CF_END 5150; EG-NEXT: ALU clause starting at 8: 5151; EG-NEXT: MOV T0.X, KC0[4].Y, 5152; EG-NEXT: MOV * T1.X, KC0[4].Z, 5153; EG-NEXT: MOV T2.X, KC0[3].W, 5154; EG-NEXT: MOV * T3.X, KC0[2].W, 5155; EG-NEXT: MOV T4.X, literal.x, 5156; EG-NEXT: MOV * T5.X, KC0[3].X, 5157; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 5158; EG-NEXT: MOV T6.X, literal.x, 5159; EG-NEXT: MOV * T7.X, KC0[2].Y, 
5160; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5161; 5162; CM-LABEL: struct_argument_alignment: 5163; CM: ; %bb.0: 5164; CM-NEXT: ALU 9, @8, KC0[CB0:0-32], KC1[] 5165; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7.X, T6.X 5166; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5.X, T4.X 5167; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T6.X 5168; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T6.X 5169; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X 5170; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T6.X 5171; CM-NEXT: CF_END 5172; CM-NEXT: ALU clause starting at 8: 5173; CM-NEXT: MOV * T0.X, KC0[4].Y, 5174; CM-NEXT: MOV * T1.X, KC0[4].Z, 5175; CM-NEXT: MOV * T2.X, KC0[3].W, 5176; CM-NEXT: MOV * T3.X, KC0[2].W, 5177; CM-NEXT: MOV * T4.X, literal.x, 5178; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00) 5179; CM-NEXT: MOV * T5.X, KC0[3].X, 5180; CM-NEXT: MOV * T6.X, literal.x, 5181; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00) 5182; CM-NEXT: MOV * T7.X, KC0[2].Y, 5183 %val0 = extractvalue {i32, i64} %arg0, 0 5184 %val1 = extractvalue {i32, i64} %arg0, 1 5185 %val2 = extractvalue {i32, i64} %arg1, 0 5186 %val3 = extractvalue {i32, i64} %arg1, 1 5187 store volatile i32 %val0, ptr addrspace(1) null 5188 store volatile i64 %val1, ptr addrspace(1) null 5189 store volatile i32 %val2, ptr addrspace(1) null 5190 store volatile i64 %val3, ptr addrspace(1) null 5191 ret void 5192} 5193 5194; No padding between i8 and next struct, but round up at end to 4 byte 5195; multiple. 
; Kernel arguments: a packed <{i32, i64}> struct, an unnamed i8, and a second
; packed <{i32, i64}> struct. All four struct fields are extracted and written
; with volatile stores to the null pointer in addrspace(1). In the gfx900
; checks the second struct's fields are read at kernarg byte offsets 13 and 17.
; The assertions in this file are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE at the top of the file);
; regenerate them instead of editing them by hand.
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
; SI-LABEL: packed_struct_argument_alignment:
; SI: ; %bb.0:
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_load_dword s2, s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa
; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:49
; SI-NEXT: buffer_load_ubyte v5, off, s[4:7], 0 offset:50
; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:51
; SI-NEXT: buffer_load_ubyte v7, off, s[4:7], 0 offset:52
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_mov_b32 s5, s4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_or_b32_e32 v3, v3, v6
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: packed_struct_argument_alignment:
; VI: ; %bb.0:
; VI-NEXT: s_add_u32 s0, s4, 49
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: s_add_u32 s2, s4, 50
; VI-NEXT: s_addc_u32 s3, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_add_u32 s0, s0, 3
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_add_u32 s0, s4, 51
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: flat_load_ubyte v8, v[0:1]
; VI-NEXT: flat_load_ubyte v9, v[2:3]
; VI-NEXT: flat_load_ubyte v10, v[4:5]
; VI-NEXT: flat_load_ubyte v6, v[6:7]
; VI-NEXT: s_add_u32 s0, s4, 53
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_load_dword s2, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x28
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dword v[2:3], v7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v8
; VI-NEXT: v_or_b32_e32 v4, v4, v9
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10
; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v4, v5, v4
; VI-NEXT: flat_store_dword v[2:3], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: packed_struct_argument_alignment:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_dword v6, v2, s[8:9] offset:13
; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[8:9] offset:17
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x4
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, s2
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: global_store_dword v[2:3], v7, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[4:5], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[2:3], v6, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: packed_struct_argument_alignment:
; EG: ; %bb.0:
; EG-NEXT: ALU 6, @18, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
; EG-NEXT: ALU 2, @25, KC0[], KC1[]
; EG-NEXT: TEX 0 @12
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
; EG-NEXT: TEX 0 @14
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0
; EG-NEXT: TEX 0 @16
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 49, #3
; EG-NEXT: Fetch clause starting at 14:
; EG-NEXT: VTX_READ_32 T2.X, T2.X, 57, #3
; EG-NEXT: Fetch clause starting at 16:
; EG-NEXT: VTX_READ_32 T4.X, T4.X, 53, #3
; EG-NEXT: ALU clause starting at 18:
; EG-NEXT: MOV T0.X, KC0[2].Z,
; EG-NEXT: MOV * T1.X, literal.x,
; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; EG-NEXT: MOV T2.X, KC0[2].W,
; EG-NEXT: MOV * T3.X, literal.x,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; EG-NEXT: MOV * T4.X, KC0[2].Y,
; EG-NEXT: ALU clause starting at 25:
; EG-NEXT: MOV T0.X, 0.0,
; EG-NEXT: MOV * T2.X, 0.0,
; EG-NEXT: MOV * T4.X, 0.0,
;
; CM-LABEL: packed_struct_argument_alignment:
; CM: ; %bb.0:
; CM-NEXT: ALU 6, @18, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
; CM-NEXT: ALU 2, @25, KC0[], KC1[]
; CM-NEXT: TEX 0 @12
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
; CM-NEXT: TEX 0 @14
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
; CM-NEXT: TEX 0 @16
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 12:
; CM-NEXT: VTX_READ_32 T0.X, T0.X, 49, #3
; CM-NEXT: Fetch clause starting at 14:
; CM-NEXT: VTX_READ_32 T2.X, T2.X, 57, #3
; CM-NEXT: Fetch clause starting at 16:
; CM-NEXT: VTX_READ_32 T4.X, T4.X, 53, #3
; CM-NEXT: ALU clause starting at 18:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: MOV * T1.X, literal.x,
; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T2.X, KC0[2].W,
; CM-NEXT: MOV * T3.X, literal.x,
; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; CM-NEXT: MOV * T4.X, KC0[2].Y,
; CM-NEXT: ALU clause starting at 25:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: MOV * T2.X, 0.0,
; CM-NEXT: MOV * T4.X, 0.0,
  %val0 = extractvalue <{i32, i64}> %arg0, 0
  %val1 = extractvalue <{i32, i64}> %arg0, 1
  %val2 = extractvalue <{i32, i64}> %arg1, 0
  %val3 = extractvalue <{i32, i64}> %arg1, 1
  store volatile i32 %val0, ptr addrspace(1) null
  store volatile i64 %val1, ptr addrspace(1) null
  store volatile i32 %val2, ptr addrspace(1) null
  store volatile i64 %val3, ptr addrspace(1) null
  ret void
}

; Kernel arguments: two non-packed {i32, i64} structs, each followed by an
; unnamed i8, and a trailing <4 x i32>. The struct fields and the vector are
; written with volatile stores to the null pointer in addrspace(1). In the
; gfx900 checks the struct fields are read at kernarg offsets 0x0, 0x8, 0x18
; and 0x20, and the vector at 0x30.
define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
; SI-LABEL: struct_argument_alignment_after:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s12, s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xb
; SI-NEXT: s_load_dword s13, s[4:5], 0xf
; SI-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x11
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x15
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s5, s4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s13
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: struct_argument_alignment_after:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s10, s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2c
; VI-NEXT: s_load_dword s11, s[4:5], 0x3c
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x44
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: struct_argument_alignment_after:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX9-NEXT: s_load_dword s11, s[8:9], 0x18
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x20
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x30
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: struct_argument_alignment_after:
; EG: ; %bb.0:
; EG-NEXT: ALU 13, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.X, T7.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T5.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T7.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T7.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T5.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T7.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.W, KC0[6].X,
; EG-NEXT: MOV * T0.Z, KC0[5].W,
; EG-NEXT: MOV * T0.Y, KC0[5].Z,
; EG-NEXT: MOV T0.X, KC0[5].Y,
; EG-NEXT: MOV * T1.X, KC0[4].Y,
; EG-NEXT: MOV T2.X, KC0[4].Z,
; EG-NEXT: MOV * T3.X, KC0[3].W,
; EG-NEXT: MOV T4.X, KC0[2].W,
; EG-NEXT: MOV * T5.X, literal.x,
; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; EG-NEXT: MOV T6.X, KC0[3].X,
; EG-NEXT: MOV * T7.X, literal.x,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; EG-NEXT: MOV * T8.X, KC0[2].Y,
;
; CM-LABEL: struct_argument_alignment_after:
; CM: ; %bb.0:
; CM-NEXT: ALU 13, @10, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8.X, T7.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T5.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4.X, T7.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T7.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T5.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T7.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T7.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: MOV * T0.W, KC0[6].X,
; CM-NEXT: MOV * T0.Z, KC0[5].W,
; CM-NEXT: MOV * T0.Y, KC0[5].Z,
; CM-NEXT: MOV * T0.X, KC0[5].Y,
; CM-NEXT: MOV * T1.X, KC0[4].Y,
; CM-NEXT: MOV * T2.X, KC0[4].Z,
; CM-NEXT: MOV * T3.X, KC0[3].W,
; CM-NEXT: MOV * T4.X, KC0[2].W,
; CM-NEXT: MOV * T5.X, literal.x,
; CM-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T6.X, KC0[3].X,
; CM-NEXT: MOV * T7.X, literal.x,
; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; CM-NEXT: MOV * T8.X, KC0[2].Y,
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg2, 0
  %val3 = extractvalue {i32, i64} %arg2, 1
  store volatile i32 %val0, ptr addrspace(1) null
  store volatile i64 %val1, ptr addrspace(1) null
  store volatile i32 %val2, ptr addrspace(1) null
  store volatile i64 %val3, ptr addrspace(1) null
  store volatile <4 x i32> %arg4, ptr addrspace(1) null
  ret void
}

; Kernel arguments: an i16 followed by a [3 x i32] array. Both are written
; with volatile stores to an undef addrspace(1) pointer. The checks for the
; three amdgcn run lines fetch all four values with one s_load_dwordx4.
define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
; SI-LABEL: array_3xi32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: array_3xi32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_short v[0:1], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: flat_store_dword v[0:1], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: array_3xi32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: global_store_short v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: global_store_dword v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: array_3xi32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 9, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T4.X
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T4.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: AND_INT T0.X, T0.X, literal.x,
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: MOV T1.X, KC0[2].Z,
; EG-NEXT: MOV * T2.X, KC0[2].W,
; EG-NEXT: MOV T3.X, KC0[3].X,
; EG-NEXT: MOV * T4.X, literal.x,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
;
; CM-LABEL: array_3xi32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @10, KC0[], KC1[]
; CM-NEXT: TEX 0 @8
; CM-NEXT: ALU 9, @11, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT MSKOR T0.XW, T4.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3.X, T4.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T4.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 8:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 36, #3
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 11:
; CM-NEXT: AND_INT T0.X, T0.X, literal.x,
; CM-NEXT: MOV * T0.W, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV T0.Y, 0.0,
; CM-NEXT: MOV * T0.Z, 0.0,
; CM-NEXT: MOV * T1.X, KC0[2].Z,
; CM-NEXT: MOV * T2.X, KC0[2].W,
; CM-NEXT: MOV * T3.X, KC0[3].X,
; CM-NEXT: MOV * T4.X, literal.x,
; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
  store volatile i16 %arg0, ptr addrspace(1) undef
  store volatile [3 x i16] %arg1, ptr addrspace(1) undef
  ret void
}

; Kernel arguments: an i8 followed by a [3 x i16] array. Both are written with
; volatile stores to an undef addrspace(1) pointer.
; FIXME: Why not all scalar loads? In the amdgcn checks below only the first
; dword is read with s_load_dword; the three i16 elements are read with
; buffer/flat/global_load_ushort instead.
define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
; SI-LABEL: array_3xi16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:42
; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:40
; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:38
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v3, s0
; SI-NEXT: buffer_store_byte v3, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_short v1, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_short v2, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: array_3xi16:
; VI: ; %bb.0:
; VI-NEXT: s_add_u32 s0, s4, 38
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: s_add_u32 s2, s0, 2
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_add_u32 s0, s4, 42
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_load_ushort v4, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_load_dword s0, s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_byte v[0:1], v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_short v[0:1], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_short v[0:1], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: array_3xi16:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ushort v1, v0, s[8:9] offset:6
; GFX9-NEXT: global_load_ushort v2, v0, s[8:9] offset:4
; GFX9-NEXT: global_load_ushort v3, v0, s[8:9] offset:2
; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_short v[0:1], v3, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: array_3xi16:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @20, KC0[], KC1[]
; EG-NEXT: TEX 1 @12
; EG-NEXT: ALU 11, @21, KC0[], KC1[]
; EG-NEXT: MEM_RAT MSKOR T1.XW, T3.X
; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
; EG-NEXT: TEX 0 @16
; EG-NEXT: ALU 3, @33, KC0[], KC1[]
; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
; EG-NEXT: TEX 0 @18
; EG-NEXT: ALU 3, @37, KC0[], KC1[]
; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_8 T1.X, T0.X, 36, #3
; EG-NEXT: VTX_READ_16 T2.X, T0.X, 42, #3
; EG-NEXT: Fetch clause starting at 16:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
; EG-NEXT: Fetch clause starting at 18:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 38, #3
; EG-NEXT: ALU clause starting at 20:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 21:
; EG-NEXT: AND_INT T1.X, T1.X, literal.x,
; EG-NEXT: MOV * T1.W, literal.x,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV * T1.Y, 0.0,
; EG-NEXT: AND_INT T2.X, T2.X, literal.x,
; EG-NEXT: MOV * T2.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T2.Y, 0.0,
; EG-NEXT: MOV T1.Z, 0.0,
; EG-NEXT: MOV * T2.Z, 0.0,
; EG-NEXT: MOV * T3.X, literal.x,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 33:
; EG-NEXT: AND_INT T2.X, T1.X, literal.x,
; EG-NEXT: MOV T2.Y, 0.0,
; EG-NEXT: MOV * T2.Z, 0.0,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 37:
; EG-NEXT: AND_INT T2.X, T0.X, literal.x,
; EG-NEXT: MOV T2.Y, 0.0,
; EG-NEXT: MOV * T2.Z, 0.0,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
;
; CM-LABEL: array_3xi16:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @20, KC0[], KC1[]
; CM-NEXT: TEX 1 @12
; CM-NEXT: ALU 11, @21, KC0[], KC1[]
; CM-NEXT: MEM_RAT MSKOR T1.XW, T3.X
; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X
; CM-NEXT: TEX 0 @16
; CM-NEXT: ALU 3, @33, KC0[], KC1[]
; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X
; CM-NEXT: TEX 0 @18
; CM-NEXT: ALU 3, @37, KC0[], KC1[]
; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 12:
; CM-NEXT: VTX_READ_8 T1.X, T0.X, 36, #3
; CM-NEXT: VTX_READ_16 T2.X, T0.X, 42, #3
; CM-NEXT: Fetch clause starting at 16:
; CM-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
; CM-NEXT: Fetch clause starting at 18:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 38, #3
; CM-NEXT: ALU clause starting at 20:
; CM-NEXT: MOV * T0.X, 0.0,
; CM-NEXT: ALU clause starting at 21:
; CM-NEXT: AND_INT T1.X, T1.X, literal.x,
; CM-NEXT: MOV * T1.W, literal.x,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: MOV * T1.Y, 0.0,
; CM-NEXT: AND_INT T2.X, T2.X, literal.x,
; CM-NEXT: MOV * T2.W, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV T2.Y, 0.0,
; CM-NEXT: MOV * T1.Z, 0.0,
; CM-NEXT: MOV * T2.Z, 0.0,
; CM-NEXT: MOV * T3.X, literal.x,
; CM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 33:
; CM-NEXT: AND_INT T2.X, T1.X, literal.x,
; CM-NEXT: MOV T2.Y, 0.0,
; CM-NEXT: MOV * T2.Z, 0.0,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 37:
; CM-NEXT: AND_INT T2.X, T0.X, literal.x,
; CM-NEXT: MOV T2.Y, 0.0,
; CM-NEXT: MOV * T2.Z, 0.0,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
  store volatile i8 %arg0, ptr addrspace(1) undef
  store volatile [3 x i16] %arg1, ptr addrspace(1) undef
  ret void
}

; Kernel arguments: an unnamed i8 followed by a [1 x i8] array. The array's
; only element is extracted and written with a volatile store to an undef
; addrspace(1) pointer. The checks load that byte from kernarg offset 1 on
; gfx900 and from offset 37 for the other run lines.
define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
; SI-LABEL: small_array_round_down_offset:
; SI: ; %bb.0:
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:37
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: small_array_round_down_offset:
; VI: ; %bb.0:
; VI-NEXT: s_add_u32 s0, s4, 37
; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_byte v[0:1], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: small_array_round_down_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v0, v0, s[8:9] offset:1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EGCM-LABEL: small_array_round_down_offset:
; EGCM: ; %bb.0:
; EGCM-NEXT: ALU 0, @8, KC0[], KC1[]
; EGCM-NEXT: TEX 0 @6
; EGCM-NEXT: ALU 6, @9, KC0[], KC1[]
; EGCM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EGCM-NEXT: CF_END
; EGCM-NEXT: PAD
; EGCM-NEXT: Fetch clause starting at 6:
; EGCM-NEXT: VTX_READ_8 T0.X, T0.X, 37, #3
; EGCM-NEXT: ALU clause starting at 8:
; EGCM-NEXT: MOV * T0.X, 0.0,
; EGCM-NEXT: ALU clause starting at 9:
; EGCM-NEXT: AND_INT T0.X, T0.X, literal.x,
; EGCM-NEXT: MOV * T0.W, literal.x,
; EGCM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EGCM-NEXT: MOV T0.Y, 0.0,
; EGCM-NEXT: MOV * T0.Z, 0.0,
; EGCM-NEXT: MOV * T1.X, literal.x,
; EGCM-NEXT: 0(0.000000e+00), 0(0.000000e+00)
  %val = extractvalue [1 x i8] %arg, 0
  store volatile i8 %val, ptr addrspace(1) undef
  ret void
}

; Kernel arguments: an output pointer, a byref(i32) argument with align(256)
; in addrspace(4), and a trailing i32. The kernel loads the i32 through the
; byref pointer and writes it, then %after.offset, to %out with volatile
; stores. The gfx900 checks read both values with one s_load_dwordx2 at
; kernarg offset 0x100.
define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocapture %out, ptr addrspace(4) byref(i32) align(256) %in.byref, i32 %after.offset) {
; SI-LABEL: byref_align_constant_i32_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x49
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: byref_align_constant_i32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x124
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: byref_align_constant_i32_arg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x100
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v0, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: byref_align_constant_i32_arg:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[18].Y,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV T1.X, KC0[18].Z,
; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: byref_align_constant_i32_arg:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[18].Y,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: MOV * T1.X, KC0[18].Z,
; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %in = load i32, ptr addrspace(4) %in.byref
  store volatile i32 %in, ptr addrspace(1) %out, align 4
  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
  ret void
}

; Kernel arguments: an output pointer, an unnamed i8, a byref(<16 x i32>)
; argument with align(64) in addrspace(4), and a trailing i32. The vector is
; loaded through the byref pointer and stored to %out, then %after.offset is
; stored, both with volatile stores. The gfx900 checks read the vector at
; kernarg offset 0x40 and %after.offset at 0x80.
define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace(1) nocapture %out, i8, ptr addrspace(4) byref(<16 x i32>) align(64) %in.byref, i32 %after.offset) {
; SI-LABEL: byref_natural_align_constant_v16i32_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dword s4, s[4:5], 0x29
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s20
; SI-NEXT: v_mov_b32_e32 v1, s21
; SI-NEXT: v_mov_b32_e32 v2, s22
; SI-NEXT: v_mov_b32_e32 v3, s23
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s16
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_mov_b32_e32 v2, s18
; SI-NEXT: v_mov_b32_e32 v3, s19
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: byref_natural_align_constant_v16i32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s4, s[4:5], 0xa4
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s20
; VI-NEXT: s_add_u32 s2, s0, 48
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: s_add_u32 s2, s0, 32
; VI-NEXT: v_mov_b32_e32 v1, s21
; VI-NEXT: v_mov_b32_e32 v2, s22
; VI-NEXT: v_mov_b32_e32 v3, s23
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: v_mov_b32_e32 v0, s16
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: byref_natural_align_constant_v16i32_arg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x80
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s24
; GFX9-NEXT: v_mov_b32_e32 v1, s25
; GFX9-NEXT: v_mov_b32_e32 v2, s26
; GFX9-NEXT: v_mov_b32_e32 v3, s27
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s20
; GFX9-NEXT: v_mov_b32_e32 v1, s21
; GFX9-NEXT: v_mov_b32_e32 v2, s22
; GFX9-NEXT: v_mov_b32_e32 v3, s23
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s16
; GFX9-NEXT: v_mov_b32_e32 v1, s17
; GFX9-NEXT: v_mov_b32_e32 v2, s18
; GFX9-NEXT: v_mov_b32_e32 v3, s19
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s14
; GFX9-NEXT: v_mov_b32_e32 v3, s15
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: byref_natural_align_constant_v16i32_arg:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @16
; EG-NEXT: ALU 3, @25, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
; EG-NEXT: ALU 3, @29, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @18
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
; EG-NEXT: ALU 3, @33, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @20
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
; EG-NEXT: ALU 2, @37, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @22
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 16:
; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
; EG-NEXT: Fetch clause starting at 18:
; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1
; EG-NEXT: Fetch clause starting at 20:
; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
; EG-NEXT: Fetch clause starting at 22:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 24:
; EG-NEXT: MOV * T0.X, KC0[6].Y,
; EG-NEXT: ALU clause starting at 25:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 29:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 33:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 37:
; EG-NEXT: MOV T1.X, KC0[10].Y,
; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: byref_natural_align_constant_v16i32_arg:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @24, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @16
; CM-NEXT: ALU 3, @25, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
; CM-NEXT: ALU 3, @29, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @18
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
; CM-NEXT: ALU 3, @33, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @20
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
; CM-NEXT: ALU 2, @37, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @22
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 16:
; CM-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
; CM-NEXT: Fetch clause starting at 18:
; CM-NEXT: VTX_READ_128 T2.XYZW, T0.X, 32, #1
; CM-NEXT: Fetch clause starting at 20:
; CM-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
; CM-NEXT: Fetch clause starting at 22:
; CM-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 24:
; CM-NEXT: MOV * T0.X, KC0[6].Y,
; CM-NEXT: ALU clause starting at 25:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T2.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 29:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 33:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 37:
; CM-NEXT: MOV * T1.X, KC0[10].Y,
; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %in = load <16 x i32>, ptr addrspace(4) %in.byref
  store volatile <16 x i32> %in, ptr addrspace(1) %out, align 4
  store volatile i32 %after.offset, ptr addrspace(1) %out, align 4
  ret void
}