1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 2; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s 3; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s 5 6; Make sure we don't crash or assert on spir_kernel calling convention. 7 8define spir_kernel void @kernel(ptr addrspace(1) %out) { 9; SI-LABEL: kernel: 10; SI: ; %bb.0: ; %entry 11; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 12; SI-NEXT: s_mov_b32 s3, 0xf000 13; SI-NEXT: s_mov_b32 s2, -1 14; SI-NEXT: v_mov_b32_e32 v0, 0 15; SI-NEXT: s_waitcnt lgkmcnt(0) 16; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 17; SI-NEXT: s_endpgm 18; 19; VI-LABEL: kernel: 20; VI: ; %bb.0: ; %entry 21; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 22; VI-NEXT: v_mov_b32_e32 v2, 0 23; VI-NEXT: s_waitcnt lgkmcnt(0) 24; VI-NEXT: v_mov_b32_e32 v0, s0 25; VI-NEXT: v_mov_b32_e32 v1, s1 26; VI-NEXT: flat_store_dword v[0:1], v2 27; VI-NEXT: s_endpgm 28; 29; GFX11-LABEL: kernel: 30; GFX11: ; %bb.0: ; %entry 31; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 32; GFX11-NEXT: v_mov_b32_e32 v0, 0 33; GFX11-NEXT: s_waitcnt lgkmcnt(0) 34; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] 35; GFX11-NEXT: s_endpgm 36entry: 37 store i32 0, ptr addrspace(1) %out 38 ret void 39} 40 41; FIXME: This is treated like a kernel 42; XGCN-LABEL: {{^}}func: 43; XGCN: s_endpgm 44; define spir_func void @func(ptr addrspace(1) %out) { 45; entry: 46; store i32 0, ptr addrspace(1) %out 47; ret void 48; } 49 50define amdgpu_ps half @ps_ret_cc_f16(half %arg0) { 51; SI-LABEL: ps_ret_cc_f16: 52; SI: ; %bb.0: 53; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 54; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 55; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 56; SI-NEXT: ; return to shader part epilog 57; 58; 
VI-LABEL: ps_ret_cc_f16: 59; VI: ; %bb.0: 60; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 61; VI-NEXT: ; return to shader part epilog 62; 63; GFX11-LABEL: ps_ret_cc_f16: 64; GFX11: ; %bb.0: 65; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 66; GFX11-NEXT: ; return to shader part epilog 67 %add = fadd half %arg0, 1.0 68 ret half %add 69} 70 71define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) { 72; SI-LABEL: ps_ret_cc_inreg_f16: 73; SI: ; %bb.0: 74; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 75; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 76; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 77; SI-NEXT: ; return to shader part epilog 78; 79; VI-LABEL: ps_ret_cc_inreg_f16: 80; VI: ; %bb.0: 81; VI-NEXT: v_add_f16_e64 v0, s0, 1.0 82; VI-NEXT: ; return to shader part epilog 83; 84; GFX11-LABEL: ps_ret_cc_inreg_f16: 85; GFX11: ; %bb.0: 86; GFX11-NEXT: v_add_f16_e64 v0, s0, 1.0 87; GFX11-NEXT: ; return to shader part epilog 88 %add = fadd half %arg0, 1.0 89 ret half %add 90} 91 92define fastcc float @fastcc(float %arg0) #0 { 93; GCN-LABEL: fastcc: 94; GCN: ; %bb.0: 95; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 96; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0 97; GCN-NEXT: s_setpc_b64 s[30:31] 98 %add = fadd float %arg0, 4.0 99 ret float %add 100} 101 102define coldcc float @coldcc(float %arg0) #0 { 103; GCN-LABEL: coldcc: 104; GCN: ; %bb.0: 105; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 106; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0 107; GCN-NEXT: s_setpc_b64 s[30:31] 108 %add = fadd float %arg0, 4.0 109 ret float %add 110} 111 112define amdgpu_kernel void @call_coldcc() #0 { 113; SI-LABEL: call_coldcc: 114; SI: ; %bb.0: 115; SI-NEXT: s_mov_b32 s32, 0 116; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 117; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 118; SI-NEXT: s_mov_b32 s22, -1 119; SI-NEXT: s_mov_b32 s23, 0xe8f000 120; SI-NEXT: s_add_u32 s20, s20, s11 121; SI-NEXT: s_addc_u32 s21, s21, 0 122; SI-NEXT: s_mov_b32 s14, s10 123; SI-NEXT: s_mov_b32 s13, s9 124; SI-NEXT: s_mov_b32 s12, s8 125; SI-NEXT: s_mov_b64 
s[10:11], s[6:7] 126; SI-NEXT: s_add_u32 s8, s4, 36 127; SI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 128; SI-NEXT: s_addc_u32 s9, s5, 0 129; SI-NEXT: s_getpc_b64 s[4:5] 130; SI-NEXT: s_add_u32 s4, s4, coldcc@gotpcrel32@lo+4 131; SI-NEXT: s_addc_u32 s5, s5, coldcc@gotpcrel32@hi+12 132; SI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 133; SI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 134; SI-NEXT: v_or_b32_e32 v0, v0, v1 135; SI-NEXT: v_or_b32_e32 v31, v0, v2 136; SI-NEXT: v_mov_b32_e32 v0, 1.0 137; SI-NEXT: s_mov_b64 s[4:5], s[0:1] 138; SI-NEXT: s_mov_b64 s[6:7], s[2:3] 139; SI-NEXT: s_mov_b64 s[0:1], s[20:21] 140; SI-NEXT: s_mov_b64 s[2:3], s[22:23] 141; SI-NEXT: s_waitcnt lgkmcnt(0) 142; SI-NEXT: s_swappc_b64 s[30:31], s[16:17] 143; SI-NEXT: s_mov_b32 s3, 0xf000 144; SI-NEXT: s_mov_b32 s2, -1 145; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 146; SI-NEXT: s_endpgm 147; 148; VI-LABEL: call_coldcc: 149; VI: ; %bb.0: 150; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 151; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 152; VI-NEXT: s_mov_b32 s90, -1 153; VI-NEXT: s_mov_b32 s91, 0xe80000 154; VI-NEXT: s_add_u32 s88, s88, s11 155; VI-NEXT: s_addc_u32 s89, s89, 0 156; VI-NEXT: s_mov_b32 s12, s8 157; VI-NEXT: s_add_u32 s8, s4, 36 158; VI-NEXT: s_mov_b32 s13, s9 159; VI-NEXT: s_addc_u32 s9, s5, 0 160; VI-NEXT: s_getpc_b64 s[4:5] 161; VI-NEXT: s_add_u32 s4, s4, coldcc@gotpcrel32@lo+4 162; VI-NEXT: s_addc_u32 s5, s5, coldcc@gotpcrel32@hi+12 163; VI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 164; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 165; VI-NEXT: s_mov_b32 s14, s10 166; VI-NEXT: s_mov_b64 s[10:11], s[6:7] 167; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 168; VI-NEXT: v_or_b32_e32 v0, v0, v1 169; VI-NEXT: s_mov_b64 s[4:5], s[0:1] 170; VI-NEXT: s_mov_b64 s[6:7], s[2:3] 171; VI-NEXT: s_mov_b64 s[0:1], s[88:89] 172; VI-NEXT: v_or_b32_e32 v31, v0, v2 173; VI-NEXT: s_mov_b64 s[2:3], s[90:91] 174; VI-NEXT: v_mov_b32_e32 v0, 1.0 175; VI-NEXT: s_mov_b32 s32, 0 176; VI-NEXT: s_waitcnt lgkmcnt(0) 177; 
VI-NEXT: s_swappc_b64 s[30:31], s[16:17] 178; VI-NEXT: flat_store_dword v[0:1], v0 179; VI-NEXT: s_endpgm 180; 181; GFX11-LABEL: call_coldcc: 182; GFX11: ; %bb.0: 183; GFX11-NEXT: s_add_u32 s8, s4, 36 184; GFX11-NEXT: s_addc_u32 s9, s5, 0 185; GFX11-NEXT: s_getpc_b64 s[4:5] 186; GFX11-NEXT: s_add_u32 s4, s4, coldcc@gotpcrel32@lo+4 187; GFX11-NEXT: s_addc_u32 s5, s5, coldcc@gotpcrel32@hi+12 188; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0 189; GFX11-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 190; GFX11-NEXT: s_mov_b32 s12, s13 191; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] 192; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] 193; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] 194; GFX11-NEXT: s_mov_b32 s13, s14 195; GFX11-NEXT: s_mov_b32 s14, s15 196; GFX11-NEXT: s_mov_b32 s32, 0 197; GFX11-NEXT: s_waitcnt lgkmcnt(0) 198; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17] 199; GFX11-NEXT: global_store_b32 v[0:1], v0, off 200; GFX11-NEXT: s_endpgm 201 %val = call float @coldcc(float 1.0) 202 store float %val, ptr addrspace(1) undef 203 ret void 204} 205 206define amdgpu_kernel void @call_fastcc() #0 { 207; SI-LABEL: call_fastcc: 208; SI: ; %bb.0: 209; SI-NEXT: s_mov_b32 s32, 0 210; SI-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0 211; SI-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1 212; SI-NEXT: s_mov_b32 s22, -1 213; SI-NEXT: s_mov_b32 s23, 0xe8f000 214; SI-NEXT: s_add_u32 s20, s20, s11 215; SI-NEXT: s_addc_u32 s21, s21, 0 216; SI-NEXT: s_mov_b32 s14, s10 217; SI-NEXT: s_mov_b32 s13, s9 218; SI-NEXT: s_mov_b32 s12, s8 219; SI-NEXT: s_mov_b64 s[10:11], s[6:7] 220; SI-NEXT: s_add_u32 s8, s4, 36 221; SI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 222; SI-NEXT: s_addc_u32 s9, s5, 0 223; SI-NEXT: s_getpc_b64 s[4:5] 224; SI-NEXT: s_add_u32 s4, s4, fastcc@gotpcrel32@lo+4 225; SI-NEXT: s_addc_u32 s5, s5, fastcc@gotpcrel32@hi+12 226; SI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 227; SI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 228; SI-NEXT: v_or_b32_e32 v0, v0, v1 229; SI-NEXT: v_or_b32_e32 v31, v0, v2 230; SI-NEXT: 
v_mov_b32_e32 v0, 1.0 231; SI-NEXT: s_mov_b64 s[4:5], s[0:1] 232; SI-NEXT: s_mov_b64 s[6:7], s[2:3] 233; SI-NEXT: s_mov_b64 s[0:1], s[20:21] 234; SI-NEXT: s_mov_b64 s[2:3], s[22:23] 235; SI-NEXT: s_waitcnt lgkmcnt(0) 236; SI-NEXT: s_swappc_b64 s[30:31], s[16:17] 237; SI-NEXT: s_mov_b32 s3, 0xf000 238; SI-NEXT: s_mov_b32 s2, -1 239; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 240; SI-NEXT: s_endpgm 241; 242; VI-LABEL: call_fastcc: 243; VI: ; %bb.0: 244; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 245; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 246; VI-NEXT: s_mov_b32 s90, -1 247; VI-NEXT: s_mov_b32 s91, 0xe80000 248; VI-NEXT: s_add_u32 s88, s88, s11 249; VI-NEXT: s_addc_u32 s89, s89, 0 250; VI-NEXT: s_mov_b32 s12, s8 251; VI-NEXT: s_add_u32 s8, s4, 36 252; VI-NEXT: s_mov_b32 s13, s9 253; VI-NEXT: s_addc_u32 s9, s5, 0 254; VI-NEXT: s_getpc_b64 s[4:5] 255; VI-NEXT: s_add_u32 s4, s4, fastcc@gotpcrel32@lo+4 256; VI-NEXT: s_addc_u32 s5, s5, fastcc@gotpcrel32@hi+12 257; VI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 258; VI-NEXT: v_lshlrev_b32_e32 v1, 10, v1 259; VI-NEXT: s_mov_b32 s14, s10 260; VI-NEXT: s_mov_b64 s[10:11], s[6:7] 261; VI-NEXT: v_lshlrev_b32_e32 v2, 20, v2 262; VI-NEXT: v_or_b32_e32 v0, v0, v1 263; VI-NEXT: s_mov_b64 s[4:5], s[0:1] 264; VI-NEXT: s_mov_b64 s[6:7], s[2:3] 265; VI-NEXT: s_mov_b64 s[0:1], s[88:89] 266; VI-NEXT: v_or_b32_e32 v31, v0, v2 267; VI-NEXT: s_mov_b64 s[2:3], s[90:91] 268; VI-NEXT: v_mov_b32_e32 v0, 1.0 269; VI-NEXT: s_mov_b32 s32, 0 270; VI-NEXT: s_waitcnt lgkmcnt(0) 271; VI-NEXT: s_swappc_b64 s[30:31], s[16:17] 272; VI-NEXT: flat_store_dword v[0:1], v0 273; VI-NEXT: s_endpgm 274; 275; GFX11-LABEL: call_fastcc: 276; GFX11: ; %bb.0: 277; GFX11-NEXT: s_add_u32 s8, s4, 36 278; GFX11-NEXT: s_addc_u32 s9, s5, 0 279; GFX11-NEXT: s_getpc_b64 s[4:5] 280; GFX11-NEXT: s_add_u32 s4, s4, fastcc@gotpcrel32@lo+4 281; GFX11-NEXT: s_addc_u32 s5, s5, fastcc@gotpcrel32@hi+12 282; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0 283; 
GFX11-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 284; GFX11-NEXT: s_mov_b32 s12, s13 285; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] 286; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] 287; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] 288; GFX11-NEXT: s_mov_b32 s13, s14 289; GFX11-NEXT: s_mov_b32 s14, s15 290; GFX11-NEXT: s_mov_b32 s32, 0 291; GFX11-NEXT: s_waitcnt lgkmcnt(0) 292; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17] 293; GFX11-NEXT: global_store_b32 v[0:1], v0, off 294; GFX11-NEXT: s_endpgm 295 %val = call float @fastcc(float 1.0) 296 store float %val, ptr addrspace(1) undef 297 ret void 298} 299 300; Mesa compute shader: check for 47176 (COMPUTE_PGM_RSRC1) in .AMDGPU.config 301define amdgpu_cs half @cs_mesa(half %arg0) { 302; SI-LABEL: cs_mesa: 303; SI: ; %bb.0: 304; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 305; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 306; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 307; SI-NEXT: ; return to shader part epilog 308; 309; VI-LABEL: cs_mesa: 310; VI: ; %bb.0: 311; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 312; VI-NEXT: ; return to shader part epilog 313; 314; GFX11-LABEL: cs_mesa: 315; GFX11: ; %bb.0: 316; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 317; GFX11-NEXT: ; return to shader part epilog 318 %add = fadd half %arg0, 1.0 319 ret half %add 320} 321 322; Mesa pixel shader: check for 45096 (SPI_SHADER_PGM_RSRC1_PS) in .AMDGPU.config 323define amdgpu_ps half @ps_mesa_f16(half %arg0) { 324; SI-LABEL: ps_mesa_f16: 325; SI: ; %bb.0: 326; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 327; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 328; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 329; SI-NEXT: ; return to shader part epilog 330; 331; VI-LABEL: ps_mesa_f16: 332; VI: ; %bb.0: 333; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 334; VI-NEXT: ; return to shader part epilog 335; 336; GFX11-LABEL: ps_mesa_f16: 337; GFX11: ; %bb.0: 338; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 339; GFX11-NEXT: ; return to shader part epilog 340 %add = fadd half %arg0, 1.0 341 ret half %add 342} 343 344; Mesa vertex shader: check for 45352 
(SPI_SHADER_PGM_RSRC1_VS) in .AMDGPU.config 345define amdgpu_vs half @vs_mesa(half %arg0) { 346; SI-LABEL: vs_mesa: 347; SI: ; %bb.0: 348; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 349; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 350; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 351; SI-NEXT: ; return to shader part epilog 352; 353; VI-LABEL: vs_mesa: 354; VI: ; %bb.0: 355; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 356; VI-NEXT: ; return to shader part epilog 357; 358; GFX11-LABEL: vs_mesa: 359; GFX11: ; %bb.0: 360; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 361; GFX11-NEXT: ; return to shader part epilog 362 %add = fadd half %arg0, 1.0 363 ret half %add 364} 365 366; Mesa geometry shader: check for 45608 (SPI_SHADER_PGM_RSRC1_GS) in .AMDGPU.config 367define amdgpu_gs half @gs_mesa(half %arg0) { 368; SI-LABEL: gs_mesa: 369; SI: ; %bb.0: 370; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 371; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 372; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 373; SI-NEXT: ; return to shader part epilog 374; 375; VI-LABEL: gs_mesa: 376; VI: ; %bb.0: 377; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 378; VI-NEXT: ; return to shader part epilog 379; 380; GFX11-LABEL: gs_mesa: 381; GFX11: ; %bb.0: 382; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 383; GFX11-NEXT: ; return to shader part epilog 384 %add = fadd half %arg0, 1.0 385 ret half %add 386} 387 388; Mesa hull shader: check for 46120 (SPI_SHADER_PGM_RSRC1_HS) in .AMDGPU.config 389define amdgpu_hs half @hs_mesa(half %arg0) { 390; SI-LABEL: hs_mesa: 391; SI: ; %bb.0: 392; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 393; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 394; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 395; SI-NEXT: ; return to shader part epilog 396; 397; VI-LABEL: hs_mesa: 398; VI: ; %bb.0: 399; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 400; VI-NEXT: ; return to shader part epilog 401; 402; GFX11-LABEL: hs_mesa: 403; GFX11: ; %bb.0: 404; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 405; GFX11-NEXT: ; return to shader part epilog 406 %add = fadd half %arg0, 1.0 407 ret half %add 408} 409 410; FIXME: 
Inconsistent ABI between targets 411 412define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) { 413; SI-LABEL: ps_mesa_v2f16: 414; SI: ; %bb.0: 415; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 416; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 417; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 418; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 419; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 420; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 421; SI-NEXT: ; return to shader part epilog 422; 423; VI-LABEL: ps_mesa_v2f16: 424; VI: ; %bb.0: 425; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 426; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 427; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 428; VI-NEXT: v_or_b32_e32 v0, v0, v1 429; VI-NEXT: ; return to shader part epilog 430; 431; GFX11-LABEL: ps_mesa_v2f16: 432; GFX11: ; %bb.0: 433; GFX11-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] 434; GFX11-NEXT: ; return to shader part epilog 435 %add = fadd <2 x half> %arg0, <half 1.0, half 1.0> 436 ret <2 x half> %add 437} 438 439define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) { 440; SI-LABEL: ps_mesa_inreg_v2f16: 441; SI: ; %bb.0: 442; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 443; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 444; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 445; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 446; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 447; SI-NEXT: v_add_f32_e32 v1, 1.0, v2 448; SI-NEXT: ; return to shader part epilog 449; 450; VI-LABEL: ps_mesa_inreg_v2f16: 451; VI: ; %bb.0: 452; VI-NEXT: s_lshr_b32 s1, s0, 16 453; VI-NEXT: v_mov_b32_e32 v0, s1 454; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 455; VI-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 456; VI-NEXT: v_add_f16_e64 v1, s0, 1.0 457; VI-NEXT: v_or_b32_e32 v0, v1, v0 458; VI-NEXT: ; return to shader part epilog 459; 460; GFX11-LABEL: ps_mesa_inreg_v2f16: 461; GFX11: ; %bb.0: 462; GFX11-NEXT: v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0] 463; GFX11-NEXT: ; return to shader part epilog 464 %add 
= fadd <2 x half> %arg0, <half 1.0, half 1.0> 465 ret <2 x half> %add 466} 467 468define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) { 469; SI-LABEL: ps_mesa_v2i16: 470; SI: ; %bb.0: 471; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 472; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 473; SI-NEXT: s_mov_b32 s3, 0xf000 474; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 475; SI-NEXT: v_or_b32_e32 v0, v1, v0 476; SI-NEXT: v_add_i32_e32 v0, vcc, 0x10000, v0 477; SI-NEXT: s_mov_b32 s2, -1 478; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 479; SI-NEXT: s_endpgm 480; 481; VI-LABEL: ps_mesa_v2i16: 482; VI: ; %bb.0: 483; VI-NEXT: v_mov_b32_e32 v2, 1 484; VI-NEXT: v_add_u16_e32 v1, 1, v0 485; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 486; VI-NEXT: v_or_b32_e32 v0, v1, v0 487; VI-NEXT: flat_store_dword v[0:1], v0 488; VI-NEXT: s_endpgm 489; 490; GFX11-LABEL: ps_mesa_v2i16: 491; GFX11: ; %bb.0: 492; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] 493; GFX11-NEXT: global_store_b32 v[0:1], v0, off 494; GFX11-NEXT: s_endpgm 495 %add = add <2 x i16> %arg0, <i16 1, i16 1> 496 store <2 x i16> %add, ptr addrspace(1) undef 497 ret void 498} 499 500define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) { 501; SI-LABEL: ps_mesa_inreg_v2i16: 502; SI: ; %bb.0: 503; SI-NEXT: s_lshl_b32 s1, s1, 16 504; SI-NEXT: s_add_i32 s0, s0, 1 505; SI-NEXT: s_mov_b32 s3, 0xf000 506; SI-NEXT: s_and_b32 s0, s0, 0xffff 507; SI-NEXT: s_or_b32 s0, s1, s0 508; SI-NEXT: s_add_i32 s0, s0, 0x10000 509; SI-NEXT: s_mov_b32 s2, -1 510; SI-NEXT: v_mov_b32_e32 v0, s0 511; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 512; SI-NEXT: s_endpgm 513; 514; VI-LABEL: ps_mesa_inreg_v2i16: 515; VI: ; %bb.0: 516; VI-NEXT: s_and_b32 s1, s0, 0xffff0000 517; VI-NEXT: s_add_i32 s0, s0, 1 518; VI-NEXT: s_and_b32 s0, s0, 0xffff 519; VI-NEXT: s_or_b32 s0, s1, s0 520; VI-NEXT: s_add_i32 s0, s0, 0x10000 521; VI-NEXT: v_mov_b32_e32 v0, s0 522; VI-NEXT: flat_store_dword v[0:1], v0 
523; VI-NEXT: s_endpgm 524; 525; GFX11-LABEL: ps_mesa_inreg_v2i16: 526; GFX11: ; %bb.0: 527; GFX11-NEXT: v_pk_add_u16 v0, s0, 1 op_sel_hi:[1,0] 528; GFX11-NEXT: global_store_b32 v[0:1], v0, off 529; GFX11-NEXT: s_endpgm 530 %add = add <2 x i16> %arg0, <i16 1, i16 1> 531 store <2 x i16> %add, ptr addrspace(1) undef 532 ret void 533} 534 535; FIXME: Different ABI for VI+ 536 537define amdgpu_ps <4 x half> @ps_mesa_v4f16(<4 x half> %arg0) { 538; SI-LABEL: ps_mesa_v4f16: 539; SI: ; %bb.0: 540; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 541; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 542; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 543; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 544; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 545; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 546; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 547; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 548; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 549; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 550; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 551; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 552; SI-NEXT: ; return to shader part epilog 553; 554; VI-LABEL: ps_mesa_v4f16: 555; VI: ; %bb.0: 556; VI-NEXT: v_mov_b32_e32 v3, 0x3c00 557; VI-NEXT: v_add_f16_e32 v2, 1.0, v1 558; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 559; VI-NEXT: v_add_f16_e32 v4, 1.0, v0 560; VI-NEXT: v_add_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 561; VI-NEXT: v_or_b32_e32 v0, v4, v0 562; VI-NEXT: v_or_b32_e32 v1, v2, v1 563; VI-NEXT: ; return to shader part epilog 564; 565; GFX11-LABEL: ps_mesa_v4f16: 566; GFX11: ; %bb.0: 567; GFX11-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] 568; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] 569; GFX11-NEXT: ; return to shader part epilog 570 %add = fadd <4 x half> %arg0, <half 1.0, half 1.0, half 1.0, half 1.0> 571 ret <4 x half> %add 572} 573 574define amdgpu_ps <4 x half> @ps_mesa_inreg_v4f16(<4 x half> inreg %arg0) { 575; SI-LABEL: ps_mesa_inreg_v4f16: 576; SI: ; %bb.0: 577; SI-NEXT: 
v_cvt_f16_f32_e32 v0, s3 578; SI-NEXT: v_cvt_f16_f32_e32 v1, s2 579; SI-NEXT: v_cvt_f16_f32_e32 v2, s1 580; SI-NEXT: v_cvt_f16_f32_e32 v3, s0 581; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 582; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 583; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 584; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 585; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 586; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 587; SI-NEXT: v_add_f32_e32 v2, 1.0, v5 588; SI-NEXT: v_add_f32_e32 v3, 1.0, v4 589; SI-NEXT: ; return to shader part epilog 590; 591; VI-LABEL: ps_mesa_inreg_v4f16: 592; VI: ; %bb.0: 593; VI-NEXT: v_add_f16_e64 v1, s1, 1.0 594; VI-NEXT: s_lshr_b32 s1, s1, 16 595; VI-NEXT: v_mov_b32_e32 v0, s1 596; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 597; VI-NEXT: v_add_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 598; VI-NEXT: v_add_f16_e64 v0, s0, 1.0 599; VI-NEXT: s_lshr_b32 s0, s0, 16 600; VI-NEXT: v_mov_b32_e32 v4, s0 601; VI-NEXT: v_add_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 602; VI-NEXT: v_or_b32_e32 v0, v0, v2 603; VI-NEXT: v_or_b32_e32 v1, v1, v3 604; VI-NEXT: ; return to shader part epilog 605; 606; GFX11-LABEL: ps_mesa_inreg_v4f16: 607; GFX11: ; %bb.0: 608; GFX11-NEXT: v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0] 609; GFX11-NEXT: v_pk_add_f16 v1, s1, 1.0 op_sel_hi:[1,0] 610; GFX11-NEXT: ; return to shader part epilog 611 %add = fadd <4 x half> %arg0, <half 1.0, half 1.0, half 1.0, half 1.0> 612 ret <4 x half> %add 613} 614 615define amdgpu_ps void @ps_mesa_inreg_v3i32(<3 x i32> inreg %arg0) { 616; SI-LABEL: ps_mesa_inreg_v3i32: 617; SI: ; %bb.0: 618; SI-NEXT: s_add_i32 s1, s1, 2 619; SI-NEXT: s_add_i32 s0, s0, 1 620; SI-NEXT: s_add_i32 s4, s2, 3 621; SI-NEXT: s_mov_b32 s3, 0xf000 622; SI-NEXT: s_mov_b32 s2, -1 623; SI-NEXT: v_mov_b32_e32 v0, s4 624; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 625; SI-NEXT: s_waitcnt expcnt(0) 626; SI-NEXT: v_mov_b32_e32 v0, s0 627; SI-NEXT: v_mov_b32_e32 v1, s1 628; SI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 629; SI-NEXT: s_endpgm 630; 631; VI-LABEL: ps_mesa_inreg_v3i32: 632; VI: ; %bb.0: 633; VI-NEXT: s_add_i32 s2, s2, 3 634; VI-NEXT: s_add_i32 s1, s1, 2 635; VI-NEXT: s_add_i32 s0, s0, 1 636; VI-NEXT: v_mov_b32_e32 v0, s0 637; VI-NEXT: v_mov_b32_e32 v1, s1 638; VI-NEXT: v_mov_b32_e32 v2, s2 639; VI-NEXT: flat_store_dwordx3 v[0:1], v[0:2] 640; VI-NEXT: s_endpgm 641; 642; GFX11-LABEL: ps_mesa_inreg_v3i32: 643; GFX11: ; %bb.0: 644; GFX11-NEXT: s_add_i32 s2, s2, 3 645; GFX11-NEXT: s_add_i32 s0, s0, 1 646; GFX11-NEXT: s_add_i32 s1, s1, 2 647; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 648; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 649; GFX11-NEXT: v_mov_b32_e32 v2, s2 650; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off 651; GFX11-NEXT: s_endpgm 652 %add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3> 653 store <3 x i32> %add, ptr addrspace(1) undef 654 ret void 655} 656 657define amdgpu_ps void @ps_mesa_inreg_v3f32(<3 x float> inreg %arg0) { 658; SI-LABEL: ps_mesa_inreg_v3f32: 659; SI: ; %bb.0: 660; SI-NEXT: v_add_f32_e64 v1, s1, 2.0 661; SI-NEXT: v_add_f32_e64 v0, s0, 1.0 662; SI-NEXT: v_add_f32_e64 v2, s2, 4.0 663; SI-NEXT: s_mov_b32 s3, 0xf000 664; SI-NEXT: s_mov_b32 s2, -1 665; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 666; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 667; SI-NEXT: s_endpgm 668; 669; VI-LABEL: ps_mesa_inreg_v3f32: 670; VI: ; %bb.0: 671; VI-NEXT: v_add_f32_e64 v2, s2, 4.0 672; VI-NEXT: v_add_f32_e64 v1, s1, 2.0 673; VI-NEXT: v_add_f32_e64 v0, s0, 1.0 674; VI-NEXT: flat_store_dwordx3 v[0:1], v[0:2] 675; VI-NEXT: s_endpgm 676; 677; GFX11-LABEL: ps_mesa_inreg_v3f32: 678; GFX11: ; %bb.0: 679; GFX11-NEXT: v_add_f32_e64 v2, s2, 4.0 680; GFX11-NEXT: v_add_f32_e64 v1, s1, 2.0 681; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 682; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off 683; GFX11-NEXT: s_endpgm 684 %add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0> 685 store <3 x float> 
%add, ptr addrspace(1) undef 686 ret void 687} 688 689define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) { 690; SI-LABEL: ps_mesa_inreg_v5i32: 691; SI: ; %bb.0: 692; SI-NEXT: s_add_i32 s5, s3, 4 693; SI-NEXT: s_add_i32 s6, s2, 3 694; SI-NEXT: s_add_i32 s1, s1, 2 695; SI-NEXT: s_add_i32 s0, s0, 1 696; SI-NEXT: s_add_i32 s4, s4, 5 697; SI-NEXT: s_mov_b32 s3, 0xf000 698; SI-NEXT: s_mov_b32 s2, -1 699; SI-NEXT: v_mov_b32_e32 v0, s4 700; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 701; SI-NEXT: s_waitcnt expcnt(0) 702; SI-NEXT: v_mov_b32_e32 v0, s0 703; SI-NEXT: v_mov_b32_e32 v1, s1 704; SI-NEXT: v_mov_b32_e32 v2, s6 705; SI-NEXT: v_mov_b32_e32 v3, s5 706; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 707; SI-NEXT: s_endpgm 708; 709; VI-LABEL: ps_mesa_inreg_v5i32: 710; VI: ; %bb.0: 711; VI-NEXT: s_add_i32 s4, s4, 5 712; VI-NEXT: s_add_i32 s3, s3, 4 713; VI-NEXT: s_add_i32 s2, s2, 3 714; VI-NEXT: s_add_i32 s1, s1, 2 715; VI-NEXT: s_add_i32 s0, s0, 1 716; VI-NEXT: v_mov_b32_e32 v0, s4 717; VI-NEXT: flat_store_dword v[0:1], v0 718; VI-NEXT: v_mov_b32_e32 v0, s0 719; VI-NEXT: v_mov_b32_e32 v1, s1 720; VI-NEXT: v_mov_b32_e32 v2, s2 721; VI-NEXT: v_mov_b32_e32 v3, s3 722; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] 723; VI-NEXT: s_endpgm 724; 725; GFX11-LABEL: ps_mesa_inreg_v5i32: 726; GFX11: ; %bb.0: 727; GFX11-NEXT: s_add_i32 s3, s3, 4 728; GFX11-NEXT: s_add_i32 s2, s2, 3 729; GFX11-NEXT: s_add_i32 s1, s1, 2 730; GFX11-NEXT: s_add_i32 s4, s4, 5 731; GFX11-NEXT: s_add_i32 s0, s0, 1 732; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1 733; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 734; GFX11-NEXT: v_mov_b32_e32 v2, s2 735; GFX11-NEXT: s_clause 0x1 736; GFX11-NEXT: global_store_b32 v[0:1], v4, off 737; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off 738; GFX11-NEXT: s_endpgm 739 %add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5> 740 store <5 x i32> %add, ptr addrspace(1) undef 741 ret void 742} 743 744define 
amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) { 745; SI-LABEL: ps_mesa_inreg_v5f32: 746; SI: ; %bb.0: 747; SI-NEXT: v_add_f32_e64 v3, s3, -1.0 748; SI-NEXT: v_add_f32_e64 v2, s2, 4.0 749; SI-NEXT: v_add_f32_e64 v1, s1, 2.0 750; SI-NEXT: v_add_f32_e64 v0, s0, 1.0 751; SI-NEXT: v_add_f32_e64 v4, s4, 0.5 752; SI-NEXT: s_mov_b32 s3, 0xf000 753; SI-NEXT: s_mov_b32 s2, -1 754; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 755; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 756; SI-NEXT: s_endpgm 757; 758; VI-LABEL: ps_mesa_inreg_v5f32: 759; VI: ; %bb.0: 760; VI-NEXT: v_add_f32_e64 v3, s3, -1.0 761; VI-NEXT: v_add_f32_e64 v2, s2, 4.0 762; VI-NEXT: v_add_f32_e64 v1, s1, 2.0 763; VI-NEXT: v_add_f32_e64 v0, s0, 1.0 764; VI-NEXT: v_add_f32_e64 v4, s4, 0.5 765; VI-NEXT: flat_store_dword v[0:1], v4 766; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] 767; VI-NEXT: s_endpgm 768; 769; GFX11-LABEL: ps_mesa_inreg_v5f32: 770; GFX11: ; %bb.0: 771; GFX11-NEXT: v_add_f32_e64 v3, s3, -1.0 772; GFX11-NEXT: v_add_f32_e64 v2, s2, 4.0 773; GFX11-NEXT: v_add_f32_e64 v1, s1, 2.0 774; GFX11-NEXT: v_add_f32_e64 v4, s4, 0.5 775; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 776; GFX11-NEXT: s_clause 0x1 777; GFX11-NEXT: global_store_b32 v[0:1], v4, off 778; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off 779; GFX11-NEXT: s_endpgm 780 %add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5> 781 store <5 x float> %add, ptr addrspace(1) undef 782 ret void 783} 784 785define amdgpu_ps void @ps_mesa_v3i32(<3 x i32> %arg0) { 786; SI-LABEL: ps_mesa_v3i32: 787; SI: ; %bb.0: 788; SI-NEXT: v_add_i32_e32 v1, vcc, 2, v1 789; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 790; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 791; SI-NEXT: s_mov_b32 s3, 0xf000 792; SI-NEXT: s_mov_b32 s2, -1 793; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 794; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 795; SI-NEXT: s_endpgm 796; 797; VI-LABEL: ps_mesa_v3i32: 798; VI: ; %bb.0: 799; VI-NEXT: 
v_add_u32_e32 v2, vcc, 3, v2 800; VI-NEXT: v_add_u32_e32 v1, vcc, 2, v1 801; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 802; VI-NEXT: flat_store_dwordx3 v[0:1], v[0:2] 803; VI-NEXT: s_endpgm 804; 805; GFX11-LABEL: ps_mesa_v3i32: 806; GFX11: ; %bb.0: 807; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 808; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v1 809; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0 810; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off 811; GFX11-NEXT: s_endpgm 812 %add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3> 813 store <3 x i32> %add, ptr addrspace(1) undef 814 ret void 815} 816 817define amdgpu_ps void @ps_mesa_v3f32(<3 x float> %arg0) { 818; SI-LABEL: ps_mesa_v3f32: 819; SI: ; %bb.0: 820; SI-NEXT: v_add_f32_e32 v1, 2.0, v1 821; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 822; SI-NEXT: v_add_f32_e32 v2, 4.0, v2 823; SI-NEXT: s_mov_b32 s3, 0xf000 824; SI-NEXT: s_mov_b32 s2, -1 825; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 826; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 827; SI-NEXT: s_endpgm 828; 829; VI-LABEL: ps_mesa_v3f32: 830; VI: ; %bb.0: 831; VI-NEXT: v_add_f32_e32 v2, 4.0, v2 832; VI-NEXT: v_add_f32_e32 v1, 2.0, v1 833; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 834; VI-NEXT: flat_store_dwordx3 v[0:1], v[0:2] 835; VI-NEXT: s_endpgm 836; 837; GFX11-LABEL: ps_mesa_v3f32: 838; GFX11: ; %bb.0: 839; GFX11-NEXT: v_dual_add_f32 v2, 4.0, v2 :: v_dual_add_f32 v1, 2.0, v1 840; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 841; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off 842; GFX11-NEXT: s_endpgm 843 %add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0> 844 store <3 x float> %add, ptr addrspace(1) undef 845 ret void 846} 847 848define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) { 849; SI-LABEL: ps_mesa_v5i32: 850; SI: ; %bb.0: 851; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v3 852; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 853; SI-NEXT: v_add_i32_e32 v1, vcc, 2, v1 854; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 855; SI-NEXT: v_add_i32_e32 v4, vcc, 5, v4 856; SI-NEXT: s_mov_b32 
s3, 0xf000 857; SI-NEXT: s_mov_b32 s2, -1 858; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 859; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 860; SI-NEXT: s_endpgm 861; 862; VI-LABEL: ps_mesa_v5i32: 863; VI: ; %bb.0: 864; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v3 865; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 866; VI-NEXT: v_add_u32_e32 v1, vcc, 2, v1 867; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 868; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v4 869; VI-NEXT: flat_store_dword v[0:1], v4 870; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] 871; VI-NEXT: s_endpgm 872; 873; GFX11-LABEL: ps_mesa_v5i32: 874; GFX11: ; %bb.0: 875; GFX11-NEXT: v_add_nc_u32_e32 v3, 4, v3 876; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 877; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v1 878; GFX11-NEXT: v_add_nc_u32_e32 v4, 5, v4 879; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0 880; GFX11-NEXT: s_clause 0x1 881; GFX11-NEXT: global_store_b32 v[0:1], v4, off 882; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off 883; GFX11-NEXT: s_endpgm 884 %add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5> 885 store <5 x i32> %add, ptr addrspace(1) undef 886 ret void 887} 888 889define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) { 890; SI-LABEL: ps_mesa_v5f32: 891; SI: ; %bb.0: 892; SI-NEXT: v_add_f32_e32 v3, -1.0, v3 893; SI-NEXT: v_add_f32_e32 v2, 4.0, v2 894; SI-NEXT: v_add_f32_e32 v1, 2.0, v1 895; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 896; SI-NEXT: v_add_f32_e32 v4, 0.5, v4 897; SI-NEXT: s_mov_b32 s3, 0xf000 898; SI-NEXT: s_mov_b32 s2, -1 899; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 900; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 901; SI-NEXT: s_endpgm 902; 903; VI-LABEL: ps_mesa_v5f32: 904; VI: ; %bb.0: 905; VI-NEXT: v_add_f32_e32 v3, -1.0, v3 906; VI-NEXT: v_add_f32_e32 v2, 4.0, v2 907; VI-NEXT: v_add_f32_e32 v1, 2.0, v1 908; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 909; VI-NEXT: v_add_f32_e32 v4, 0.5, v4 910; VI-NEXT: flat_store_dword v[0:1], v4 911; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] 912; 
VI-NEXT: s_endpgm 913; 914; GFX11-LABEL: ps_mesa_v5f32: 915; GFX11: ; %bb.0: 916; GFX11-NEXT: v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2 917; GFX11-NEXT: v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4 918; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 919; GFX11-NEXT: s_clause 0x1 920; GFX11-NEXT: global_store_b32 v[0:1], v4, off 921; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off 922; GFX11-NEXT: s_endpgm 923 %add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5> 924 store <5 x float> %add, ptr addrspace(1) undef 925 ret void 926} 927 928define amdgpu_ps void @ps_mesa_i16(i16 %arg0) { 929; SI-LABEL: ps_mesa_i16: 930; SI: ; %bb.0: 931; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v0 932; SI-NEXT: s_mov_b32 s3, 0xf000 933; SI-NEXT: s_mov_b32 s2, -1 934; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 935; SI-NEXT: s_endpgm 936; 937; VI-LABEL: ps_mesa_i16: 938; VI: ; %bb.0: 939; VI-NEXT: v_add_u16_e32 v0, v0, v0 940; VI-NEXT: flat_store_short v[0:1], v0 941; VI-NEXT: s_endpgm 942; 943; GFX11-LABEL: ps_mesa_i16: 944; GFX11: ; %bb.0: 945; GFX11-NEXT: v_add_nc_u16 v0, v0, v0 946; GFX11-NEXT: global_store_b16 v[0:1], v0, off 947; GFX11-NEXT: s_endpgm 948 %add = add i16 %arg0, %arg0 949 store i16 %add, ptr addrspace(1) undef 950 ret void 951} 952 953define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) { 954; SI-LABEL: ps_mesa_inreg_i16: 955; SI: ; %bb.0: 956; SI-NEXT: s_add_i32 s0, s0, s0 957; SI-NEXT: s_mov_b32 s3, 0xf000 958; SI-NEXT: s_mov_b32 s2, -1 959; SI-NEXT: v_mov_b32_e32 v0, s0 960; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 961; SI-NEXT: s_endpgm 962; 963; VI-LABEL: ps_mesa_inreg_i16: 964; VI: ; %bb.0: 965; VI-NEXT: s_and_b32 s0, 0xffff, s0 966; VI-NEXT: s_add_i32 s0, s0, s0 967; VI-NEXT: v_mov_b32_e32 v0, s0 968; VI-NEXT: flat_store_short v[0:1], v0 969; VI-NEXT: s_endpgm 970; 971; GFX11-LABEL: ps_mesa_inreg_i16: 972; GFX11: ; %bb.0: 973; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 974; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 975; GFX11-NEXT: s_add_i32 s0, s0, s0 976; GFX11-NEXT: v_mov_b32_e32 v0, s0 977; GFX11-NEXT: global_store_b16 v[0:1], v0, off 978; GFX11-NEXT: s_endpgm 979 %add = add i16 %arg0, %arg0 980 store i16 %add, ptr addrspace(1) undef 981 ret void 982} 983 984define amdgpu_ps i16 @ret_ps_mesa_i16() { 985; GCN-LABEL: ret_ps_mesa_i16: 986; GCN: ; %bb.0: 987; GCN-NEXT: s_movk_i32 s0, 0x7b 988; GCN-NEXT: ; return to shader part epilog 989 ret i16 123 990} 991 992define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) { 993; SI-LABEL: amd_kernel_i8: 994; SI: ; %bb.0: ; %entry 995; SI-NEXT: s_load_dword s0, s[4:5], 0x9 996; SI-NEXT: s_mov_b32 s3, 0xf000 997; SI-NEXT: s_waitcnt lgkmcnt(0) 998; SI-NEXT: s_add_i32 s0, s0, s0 999; SI-NEXT: s_mov_b32 s2, -1 1000; SI-NEXT: v_mov_b32_e32 v0, s0 1001; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1002; SI-NEXT: s_endpgm 1003; 1004; VI-LABEL: amd_kernel_i8: 1005; VI: ; %bb.0: ; %entry 1006; VI-NEXT: s_load_dword s0, s[4:5], 0x24 1007; VI-NEXT: s_waitcnt lgkmcnt(0) 1008; VI-NEXT: s_add_i32 s0, s0, s0 1009; VI-NEXT: v_mov_b32_e32 v0, s0 1010; VI-NEXT: flat_store_byte v[0:1], v0 1011; VI-NEXT: s_endpgm 1012; 1013; GFX11-LABEL: amd_kernel_i8: 1014; GFX11: ; %bb.0: ; %entry 1015; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 1016; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1017; GFX11-NEXT: s_add_i32 s0, s0, s0 1018; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1019; GFX11-NEXT: v_mov_b32_e32 v0, s0 1020; GFX11-NEXT: global_store_b8 v[0:1], v0, off 1021; GFX11-NEXT: s_endpgm 1022entry: 1023 %add = add i8 %arg0, %arg0 1024 store i8 %add, ptr addrspace(1) undef 1025 ret void 1026} 1027 1028define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) { 1029; SI-LABEL: amd_kernel_v2i8: 1030; SI: ; %bb.0: ; %entry 1031; SI-NEXT: s_load_dword s1, s[4:5], 0x9 1032; SI-NEXT: s_mov_b32 s0, 0 1033; SI-NEXT: s_mov_b32 s3, 0xf000 1034; SI-NEXT: s_waitcnt lgkmcnt(0) 1035; SI-NEXT: s_bfe_u32 s2, s1, 0x80008 1036; 
SI-NEXT: s_add_i32 s1, s1, s1 1037; SI-NEXT: s_and_b32 s1, s1, 0xff 1038; SI-NEXT: s_add_i32 s2, s2, s2 1039; SI-NEXT: s_lshl_b32 s2, s2, 8 1040; SI-NEXT: s_or_b32 s4, s1, s2 1041; SI-NEXT: s_mov_b32 s2, -1 1042; SI-NEXT: s_mov_b32 s1, s0 1043; SI-NEXT: v_mov_b32_e32 v0, s4 1044; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1045; SI-NEXT: s_endpgm 1046; 1047; VI-LABEL: amd_kernel_v2i8: 1048; VI: ; %bb.0: ; %entry 1049; VI-NEXT: s_load_dword s0, s[4:5], 0x24 1050; VI-NEXT: v_mov_b32_e32 v0, 0 1051; VI-NEXT: v_mov_b32_e32 v1, 0 1052; VI-NEXT: s_waitcnt lgkmcnt(0) 1053; VI-NEXT: s_bfe_u32 s1, s0, 0x80008 1054; VI-NEXT: s_add_i32 s0, s0, s0 1055; VI-NEXT: s_add_i32 s1, s1, s1 1056; VI-NEXT: s_and_b32 s0, s0, 0xff 1057; VI-NEXT: s_lshl_b32 s1, s1, 8 1058; VI-NEXT: s_or_b32 s0, s0, s1 1059; VI-NEXT: v_mov_b32_e32 v2, s0 1060; VI-NEXT: flat_store_short v[0:1], v2 1061; VI-NEXT: s_endpgm 1062; 1063; GFX11-LABEL: amd_kernel_v2i8: 1064; GFX11: ; %bb.0: ; %entry 1065; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 1066; GFX11-NEXT: v_mov_b32_e32 v0, 0 1067; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1068; GFX11-NEXT: s_bfe_u32 s1, s0, 0x80008 1069; GFX11-NEXT: s_add_i32 s0, s0, s0 1070; GFX11-NEXT: s_add_i32 s1, s1, s1 1071; GFX11-NEXT: s_and_b32 s0, s0, 0xff 1072; GFX11-NEXT: s_lshl_b32 s1, s1, 8 1073; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1074; GFX11-NEXT: s_or_b32 s0, s0, s1 1075; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 1076; GFX11-NEXT: global_store_b16 v[0:1], v2, off 1077; GFX11-NEXT: s_endpgm 1078entry: 1079 %add = add <2 x i8> %arg0, %arg0 1080 store <2 x i8> %add, ptr addrspace(1) null 1081 ret void 1082} 1083 1084define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) { 1085; SI-LABEL: amd_kernel_v4i8: 1086; SI: ; %bb.0: ; %entry 1087; SI-NEXT: s_load_dword s1, s[4:5], 0x9 1088; SI-NEXT: s_mov_b32 s0, 0 1089; SI-NEXT: s_mov_b32 s3, 0xf000 1090; SI-NEXT: s_waitcnt lgkmcnt(0) 1091; SI-NEXT: s_lshr_b32 s2, s1, 16 
1092; SI-NEXT: s_lshr_b32 s4, s1, 24 1093; SI-NEXT: s_bfe_u32 s5, s1, 0x80008 1094; SI-NEXT: s_add_i32 s1, s1, s1 1095; SI-NEXT: s_add_i32 s4, s4, s4 1096; SI-NEXT: s_add_i32 s2, s2, s2 1097; SI-NEXT: s_and_b32 s1, s1, 0xff 1098; SI-NEXT: s_add_i32 s5, s5, s5 1099; SI-NEXT: s_lshl_b32 s4, s4, 24 1100; SI-NEXT: s_and_b32 s2, s2, 0xff 1101; SI-NEXT: s_lshl_b32 s5, s5, 8 1102; SI-NEXT: s_lshl_b32 s2, s2, 16 1103; SI-NEXT: s_or_b32 s1, s1, s5 1104; SI-NEXT: s_or_b32 s2, s4, s2 1105; SI-NEXT: s_and_b32 s1, s1, 0xffff 1106; SI-NEXT: s_or_b32 s4, s1, s2 1107; SI-NEXT: s_mov_b32 s2, -1 1108; SI-NEXT: s_mov_b32 s1, s0 1109; SI-NEXT: v_mov_b32_e32 v0, s4 1110; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1111; SI-NEXT: s_endpgm 1112; 1113; VI-LABEL: amd_kernel_v4i8: 1114; VI: ; %bb.0: ; %entry 1115; VI-NEXT: s_load_dword s0, s[4:5], 0x24 1116; VI-NEXT: v_mov_b32_e32 v0, 0 1117; VI-NEXT: v_mov_b32_e32 v1, 0 1118; VI-NEXT: s_waitcnt lgkmcnt(0) 1119; VI-NEXT: s_lshr_b32 s2, s0, 16 1120; VI-NEXT: s_lshr_b32 s1, s0, 24 1121; VI-NEXT: s_add_i32 s2, s2, s2 1122; VI-NEXT: s_bfe_u32 s3, s0, 0x80008 1123; VI-NEXT: s_add_i32 s1, s1, s1 1124; VI-NEXT: s_and_b32 s2, s2, 0xff 1125; VI-NEXT: s_add_i32 s3, s3, s3 1126; VI-NEXT: s_add_i32 s0, s0, s0 1127; VI-NEXT: s_lshl_b32 s1, s1, 24 1128; VI-NEXT: s_lshl_b32 s2, s2, 16 1129; VI-NEXT: s_or_b32 s1, s1, s2 1130; VI-NEXT: s_and_b32 s0, s0, 0xff 1131; VI-NEXT: s_lshl_b32 s2, s3, 8 1132; VI-NEXT: s_or_b32 s0, s0, s2 1133; VI-NEXT: s_and_b32 s0, s0, 0xffff 1134; VI-NEXT: s_or_b32 s0, s0, s1 1135; VI-NEXT: v_mov_b32_e32 v2, s0 1136; VI-NEXT: flat_store_dword v[0:1], v2 1137; VI-NEXT: s_endpgm 1138; 1139; GFX11-LABEL: amd_kernel_v4i8: 1140; GFX11: ; %bb.0: ; %entry 1141; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 1142; GFX11-NEXT: v_mov_b32_e32 v0, 0 1143; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1144; GFX11-NEXT: s_lshr_b32 s1, s0, 16 1145; GFX11-NEXT: s_lshr_b32 s2, s0, 24 1146; GFX11-NEXT: s_add_i32 s3, s0, s0 1147; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80008 
1148; GFX11-NEXT: s_add_i32 s2, s2, s2 1149; GFX11-NEXT: s_add_i32 s0, s0, s0 1150; GFX11-NEXT: s_add_i32 s1, s1, s1 1151; GFX11-NEXT: s_and_b32 s3, s3, 0xff 1152; GFX11-NEXT: s_lshl_b32 s0, s0, 8 1153; GFX11-NEXT: s_lshl_b32 s2, s2, 8 1154; GFX11-NEXT: s_and_b32 s1, s1, 0xff 1155; GFX11-NEXT: s_or_b32 s0, s3, s0 1156; GFX11-NEXT: s_or_b32 s1, s1, s2 1157; GFX11-NEXT: s_and_b32 s0, s0, 0xffff 1158; GFX11-NEXT: s_lshl_b32 s1, s1, 16 1159; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1160; GFX11-NEXT: s_or_b32 s0, s0, s1 1161; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 1162; GFX11-NEXT: global_store_b32 v[0:1], v2, off 1163; GFX11-NEXT: s_endpgm 1164entry: 1165 %add = add <4 x i8> %arg0, %arg0 1166 store <4 x i8> %add, ptr addrspace(1) null 1167 ret void 1168} 1169 1170define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) { 1171; SI-LABEL: amd_kernel_v3i8: 1172; SI: ; %bb.0: ; %entry 1173; SI-NEXT: s_load_dword s4, s[4:5], 0x9 1174; SI-NEXT: s_mov_b32 s1, 0 1175; SI-NEXT: s_mov_b32 s0, 2 1176; SI-NEXT: s_mov_b32 s3, 0xf000 1177; SI-NEXT: s_mov_b32 s2, -1 1178; SI-NEXT: s_waitcnt lgkmcnt(0) 1179; SI-NEXT: s_lshr_b32 s5, s4, 16 1180; SI-NEXT: s_bfe_u32 s6, s4, 0x80008 1181; SI-NEXT: s_add_i32 s4, s4, s4 1182; SI-NEXT: s_and_b32 s4, s4, 0xff 1183; SI-NEXT: s_add_i32 s6, s6, s6 1184; SI-NEXT: s_add_i32 s5, s5, s5 1185; SI-NEXT: s_lshl_b32 s6, s6, 8 1186; SI-NEXT: v_mov_b32_e32 v0, s5 1187; SI-NEXT: s_or_b32 s4, s4, s6 1188; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1189; SI-NEXT: s_mov_b32 s0, s1 1190; SI-NEXT: s_waitcnt expcnt(0) 1191; SI-NEXT: v_mov_b32_e32 v0, s4 1192; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1193; SI-NEXT: s_endpgm 1194; 1195; VI-LABEL: amd_kernel_v3i8: 1196; VI: ; %bb.0: ; %entry 1197; VI-NEXT: s_load_dword s0, s[4:5], 0x24 1198; VI-NEXT: v_mov_b32_e32 v0, 2 1199; VI-NEXT: v_mov_b32_e32 v1, 0 1200; VI-NEXT: s_waitcnt lgkmcnt(0) 1201; VI-NEXT: s_lshr_b32 s1, s0, 16 1202; VI-NEXT: 
s_bfe_u32 s2, s0, 0x80008 1203; VI-NEXT: s_add_i32 s0, s0, s0 1204; VI-NEXT: s_add_i32 s1, s1, s1 1205; VI-NEXT: s_add_i32 s2, s2, s2 1206; VI-NEXT: s_and_b32 s0, s0, 0xff 1207; VI-NEXT: s_lshl_b32 s2, s2, 8 1208; VI-NEXT: v_mov_b32_e32 v2, s1 1209; VI-NEXT: s_or_b32 s0, s0, s2 1210; VI-NEXT: flat_store_byte v[0:1], v2 1211; VI-NEXT: v_mov_b32_e32 v0, 0 1212; VI-NEXT: v_mov_b32_e32 v1, 0 1213; VI-NEXT: v_mov_b32_e32 v2, s0 1214; VI-NEXT: flat_store_short v[0:1], v2 1215; VI-NEXT: s_endpgm 1216; 1217; GFX11-LABEL: amd_kernel_v3i8: 1218; GFX11: ; %bb.0: ; %entry 1219; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 1220; GFX11-NEXT: v_mov_b32_e32 v0, 2 1221; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1222; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80008 1223; GFX11-NEXT: s_lshr_b32 s1, s0, 16 1224; GFX11-NEXT: s_add_i32 s0, s0, s0 1225; GFX11-NEXT: s_add_i32 s2, s2, s2 1226; GFX11-NEXT: s_and_b32 s0, s0, 0xff 1227; GFX11-NEXT: s_lshl_b32 s2, s2, 8 1228; GFX11-NEXT: s_add_i32 s1, s1, s1 1229; GFX11-NEXT: s_or_b32 s0, s0, s2 1230; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s1 1231; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, s0 1232; GFX11-NEXT: v_mov_b32_e32 v3, 0 1233; GFX11-NEXT: s_clause 0x1 1234; GFX11-NEXT: global_store_b8 v[0:1], v4, off 1235; GFX11-NEXT: global_store_b16 v[2:3], v5, off 1236; GFX11-NEXT: s_endpgm 1237entry: 1238 %add = add <3 x i8> %arg0, %arg0 1239 store <3 x i8> %add, ptr addrspace(1) null 1240 ret void 1241} 1242 1243define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { 1244; SI-LABEL: amd_kernel_v5i8: 1245; SI: ; %bb.0: ; %entry 1246; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 1247; SI-NEXT: s_mov_b32 s1, 0 1248; SI-NEXT: s_mov_b32 s0, 4 1249; SI-NEXT: s_mov_b32 s3, 0xf000 1250; SI-NEXT: s_mov_b32 s2, -1 1251; SI-NEXT: s_waitcnt lgkmcnt(0) 1252; SI-NEXT: s_lshr_b32 s6, s4, 16 1253; SI-NEXT: s_lshr_b32 s7, s4, 24 1254; SI-NEXT: s_bfe_u32 s8, s4, 0x80008 1255; SI-NEXT: s_add_i32 s4, s4, s4 1256; SI-NEXT: s_add_i32 s5, s5, s5 1257; SI-NEXT: 
s_add_i32 s7, s7, s7 1258; SI-NEXT: s_add_i32 s6, s6, s6 1259; SI-NEXT: s_and_b32 s4, s4, 0xff 1260; SI-NEXT: s_add_i32 s8, s8, s8 1261; SI-NEXT: v_mov_b32_e32 v0, s5 1262; SI-NEXT: s_lshl_b32 s5, s7, 24 1263; SI-NEXT: s_and_b32 s6, s6, 0xff 1264; SI-NEXT: s_lshl_b32 s7, s8, 8 1265; SI-NEXT: s_lshl_b32 s6, s6, 16 1266; SI-NEXT: s_or_b32 s4, s4, s7 1267; SI-NEXT: s_or_b32 s5, s5, s6 1268; SI-NEXT: s_and_b32 s4, s4, 0xffff 1269; SI-NEXT: s_or_b32 s4, s4, s5 1270; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1271; SI-NEXT: s_mov_b32 s0, s1 1272; SI-NEXT: s_waitcnt expcnt(0) 1273; SI-NEXT: v_mov_b32_e32 v0, s4 1274; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1275; SI-NEXT: s_endpgm 1276; 1277; VI-LABEL: amd_kernel_v5i8: 1278; VI: ; %bb.0: ; %entry 1279; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1280; VI-NEXT: v_mov_b32_e32 v0, 4 1281; VI-NEXT: v_mov_b32_e32 v1, 0 1282; VI-NEXT: s_waitcnt lgkmcnt(0) 1283; VI-NEXT: s_lshr_b32 s3, s0, 16 1284; VI-NEXT: s_lshr_b32 s2, s0, 24 1285; VI-NEXT: s_add_i32 s3, s3, s3 1286; VI-NEXT: s_bfe_u32 s4, s0, 0x80008 1287; VI-NEXT: s_add_i32 s2, s2, s2 1288; VI-NEXT: s_and_b32 s3, s3, 0xff 1289; VI-NEXT: s_add_i32 s4, s4, s4 1290; VI-NEXT: s_add_i32 s0, s0, s0 1291; VI-NEXT: s_lshl_b32 s2, s2, 24 1292; VI-NEXT: s_lshl_b32 s3, s3, 16 1293; VI-NEXT: s_and_b32 s1, s1, 0xff 1294; VI-NEXT: s_or_b32 s2, s2, s3 1295; VI-NEXT: s_and_b32 s0, s0, 0xff 1296; VI-NEXT: s_lshl_b32 s3, s4, 8 1297; VI-NEXT: s_add_i32 s1, s1, s1 1298; VI-NEXT: s_or_b32 s0, s0, s3 1299; VI-NEXT: s_and_b32 s0, s0, 0xffff 1300; VI-NEXT: v_mov_b32_e32 v2, s1 1301; VI-NEXT: s_or_b32 s0, s0, s2 1302; VI-NEXT: flat_store_byte v[0:1], v2 1303; VI-NEXT: v_mov_b32_e32 v0, 0 1304; VI-NEXT: v_mov_b32_e32 v1, 0 1305; VI-NEXT: v_mov_b32_e32 v2, s0 1306; VI-NEXT: flat_store_dword v[0:1], v2 1307; VI-NEXT: s_endpgm 1308; 1309; GFX11-LABEL: amd_kernel_v5i8: 1310; GFX11: ; %bb.0: ; %entry 1311; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1312; GFX11-NEXT: v_mov_b32_e32 v0, 4 1313; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) 1314; GFX11-NEXT: s_lshr_b32 s2, s0, 16 1315; GFX11-NEXT: s_lshr_b32 s3, s0, 24 1316; GFX11-NEXT: s_add_i32 s4, s0, s0 1317; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80008 1318; GFX11-NEXT: s_add_i32 s3, s3, s3 1319; GFX11-NEXT: s_add_i32 s0, s0, s0 1320; GFX11-NEXT: s_add_i32 s2, s2, s2 1321; GFX11-NEXT: s_and_b32 s4, s4, 0xff 1322; GFX11-NEXT: s_lshl_b32 s0, s0, 8 1323; GFX11-NEXT: s_lshl_b32 s3, s3, 8 1324; GFX11-NEXT: s_and_b32 s2, s2, 0xff 1325; GFX11-NEXT: s_or_b32 s0, s4, s0 1326; GFX11-NEXT: s_or_b32 s2, s2, s3 1327; GFX11-NEXT: s_and_b32 s0, s0, 0xffff 1328; GFX11-NEXT: s_lshl_b32 s2, s2, 16 1329; GFX11-NEXT: s_add_i32 s1, s1, s1 1330; GFX11-NEXT: s_or_b32 s0, s0, s2 1331; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s1 1332; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, s0 1333; GFX11-NEXT: v_mov_b32_e32 v3, 0 1334; GFX11-NEXT: s_clause 0x1 1335; GFX11-NEXT: global_store_b8 v[0:1], v4, off 1336; GFX11-NEXT: global_store_b32 v[2:3], v5, off 1337; GFX11-NEXT: s_endpgm 1338entry: 1339 %add = add <5 x i8> %arg0, %arg0 1340 store <5 x i8> %add, ptr addrspace(1) null 1341 ret void 1342} 1343 1344define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) { 1345; SI-LABEL: amd_kernel_v8i8: 1346; SI: ; %bb.0: ; %entry 1347; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 1348; SI-NEXT: s_mov_b32 s0, 0 1349; SI-NEXT: s_mov_b32 s3, 0xf000 1350; SI-NEXT: s_waitcnt lgkmcnt(0) 1351; SI-NEXT: s_lshr_b32 s1, s4, 16 1352; SI-NEXT: s_lshr_b32 s2, s4, 24 1353; SI-NEXT: s_lshr_b32 s6, s5, 16 1354; SI-NEXT: s_lshr_b32 s7, s5, 24 1355; SI-NEXT: s_bfe_u32 s8, s4, 0x80008 1356; SI-NEXT: s_bfe_u32 s9, s5, 0x80008 1357; SI-NEXT: s_add_i32 s5, s5, s5 1358; SI-NEXT: s_add_i32 s4, s4, s4 1359; SI-NEXT: s_add_i32 s7, s7, s7 1360; SI-NEXT: s_add_i32 s6, s6, s6 1361; SI-NEXT: s_and_b32 s5, s5, 0xff 1362; SI-NEXT: s_add_i32 s9, s9, s9 1363; SI-NEXT: s_add_i32 s2, s2, s2 1364; SI-NEXT: s_add_i32 s1, s1, s1 1365; SI-NEXT: s_and_b32 s4, s4, 0xff 1366; 
SI-NEXT: s_add_i32 s8, s8, s8 1367; SI-NEXT: s_lshl_b32 s7, s7, 24 1368; SI-NEXT: s_and_b32 s6, s6, 0xff 1369; SI-NEXT: s_lshl_b32 s9, s9, 8 1370; SI-NEXT: s_lshl_b32 s2, s2, 24 1371; SI-NEXT: s_and_b32 s1, s1, 0xff 1372; SI-NEXT: s_lshl_b32 s8, s8, 8 1373; SI-NEXT: s_lshl_b32 s6, s6, 16 1374; SI-NEXT: s_or_b32 s5, s5, s9 1375; SI-NEXT: s_lshl_b32 s1, s1, 16 1376; SI-NEXT: s_or_b32 s4, s4, s8 1377; SI-NEXT: s_or_b32 s6, s7, s6 1378; SI-NEXT: s_and_b32 s5, s5, 0xffff 1379; SI-NEXT: s_or_b32 s1, s2, s1 1380; SI-NEXT: s_and_b32 s2, s4, 0xffff 1381; SI-NEXT: s_or_b32 s4, s5, s6 1382; SI-NEXT: s_or_b32 s5, s2, s1 1383; SI-NEXT: s_mov_b32 s2, -1 1384; SI-NEXT: s_mov_b32 s1, s0 1385; SI-NEXT: v_mov_b32_e32 v0, s5 1386; SI-NEXT: v_mov_b32_e32 v1, s4 1387; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1388; SI-NEXT: s_endpgm 1389; 1390; VI-LABEL: amd_kernel_v8i8: 1391; VI: ; %bb.0: ; %entry 1392; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1393; VI-NEXT: v_mov_b32_e32 v2, 0 1394; VI-NEXT: v_mov_b32_e32 v3, 0 1395; VI-NEXT: s_waitcnt lgkmcnt(0) 1396; VI-NEXT: s_lshr_b32 s3, s1, 16 1397; VI-NEXT: s_lshr_b32 s2, s1, 24 1398; VI-NEXT: s_add_i32 s3, s3, s3 1399; VI-NEXT: s_bfe_u32 s6, s1, 0x80008 1400; VI-NEXT: s_add_i32 s2, s2, s2 1401; VI-NEXT: s_and_b32 s3, s3, 0xff 1402; VI-NEXT: s_add_i32 s6, s6, s6 1403; VI-NEXT: s_add_i32 s1, s1, s1 1404; VI-NEXT: s_lshl_b32 s2, s2, 24 1405; VI-NEXT: s_lshl_b32 s3, s3, 16 1406; VI-NEXT: s_lshr_b32 s5, s0, 16 1407; VI-NEXT: s_or_b32 s2, s2, s3 1408; VI-NEXT: s_and_b32 s1, s1, 0xff 1409; VI-NEXT: s_lshl_b32 s3, s6, 8 1410; VI-NEXT: s_lshr_b32 s4, s0, 24 1411; VI-NEXT: s_add_i32 s5, s5, s5 1412; VI-NEXT: s_or_b32 s1, s1, s3 1413; VI-NEXT: s_bfe_u32 s7, s0, 0x80008 1414; VI-NEXT: s_add_i32 s4, s4, s4 1415; VI-NEXT: s_and_b32 s1, s1, 0xffff 1416; VI-NEXT: s_and_b32 s3, s5, 0xff 1417; VI-NEXT: s_add_i32 s7, s7, s7 1418; VI-NEXT: s_add_i32 s0, s0, s0 1419; VI-NEXT: s_or_b32 s1, s1, s2 1420; VI-NEXT: s_lshl_b32 s2, s4, 24 1421; VI-NEXT: 
s_lshl_b32 s3, s3, 16 1422; VI-NEXT: s_or_b32 s2, s2, s3 1423; VI-NEXT: s_and_b32 s0, s0, 0xff 1424; VI-NEXT: s_lshl_b32 s3, s7, 8 1425; VI-NEXT: s_or_b32 s0, s0, s3 1426; VI-NEXT: s_and_b32 s0, s0, 0xffff 1427; VI-NEXT: s_or_b32 s0, s0, s2 1428; VI-NEXT: v_mov_b32_e32 v0, s0 1429; VI-NEXT: v_mov_b32_e32 v1, s1 1430; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1431; VI-NEXT: s_endpgm 1432; 1433; GFX11-LABEL: amd_kernel_v8i8: 1434; GFX11: ; %bb.0: ; %entry 1435; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1436; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1437; GFX11-NEXT: s_lshr_b32 s2, s0, 16 1438; GFX11-NEXT: s_lshr_b32 s3, s0, 24 1439; GFX11-NEXT: s_lshr_b32 s4, s1, 16 1440; GFX11-NEXT: s_lshr_b32 s5, s1, 24 1441; GFX11-NEXT: s_bfe_u32 s6, s0, 0x80008 1442; GFX11-NEXT: s_bfe_u32 s7, s1, 0x80008 1443; GFX11-NEXT: s_add_i32 s1, s1, s1 1444; GFX11-NEXT: s_add_i32 s0, s0, s0 1445; GFX11-NEXT: s_add_i32 s7, s7, s7 1446; GFX11-NEXT: s_add_i32 s5, s5, s5 1447; GFX11-NEXT: s_add_i32 s4, s4, s4 1448; GFX11-NEXT: s_add_i32 s6, s6, s6 1449; GFX11-NEXT: s_add_i32 s3, s3, s3 1450; GFX11-NEXT: s_add_i32 s2, s2, s2 1451; GFX11-NEXT: s_and_b32 s1, s1, 0xff 1452; GFX11-NEXT: s_and_b32 s0, s0, 0xff 1453; GFX11-NEXT: s_lshl_b32 s7, s7, 8 1454; GFX11-NEXT: s_lshl_b32 s5, s5, 8 1455; GFX11-NEXT: s_and_b32 s4, s4, 0xff 1456; GFX11-NEXT: s_lshl_b32 s6, s6, 8 1457; GFX11-NEXT: s_lshl_b32 s3, s3, 8 1458; GFX11-NEXT: s_and_b32 s2, s2, 0xff 1459; GFX11-NEXT: s_or_b32 s1, s1, s7 1460; GFX11-NEXT: s_or_b32 s4, s4, s5 1461; GFX11-NEXT: s_or_b32 s0, s0, s6 1462; GFX11-NEXT: s_or_b32 s2, s2, s3 1463; GFX11-NEXT: s_and_b32 s1, s1, 0xffff 1464; GFX11-NEXT: s_and_b32 s0, s0, 0xffff 1465; GFX11-NEXT: s_lshl_b32 s2, s2, 16 1466; GFX11-NEXT: s_lshl_b32 s3, s4, 16 1467; GFX11-NEXT: s_or_b32 s0, s0, s2 1468; GFX11-NEXT: s_or_b32 s1, s1, s3 1469; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1470; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 1471; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 
0 1472; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off 1473; GFX11-NEXT: s_endpgm 1474entry: 1475 %add = add <8 x i8> %arg0, %arg0 1476 store <8 x i8> %add, ptr addrspace(1) null 1477 ret void 1478} 1479 1480define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) { 1481; SI-LABEL: amd_kernel_v16i8: 1482; SI: ; %bb.0: ; %entry 1483; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1484; SI-NEXT: s_mov_b32 s4, 0 1485; SI-NEXT: s_mov_b32 s7, 0xf000 1486; SI-NEXT: s_waitcnt lgkmcnt(0) 1487; SI-NEXT: s_lshr_b32 s5, s0, 16 1488; SI-NEXT: s_lshr_b32 s6, s0, 24 1489; SI-NEXT: s_lshr_b32 s8, s1, 16 1490; SI-NEXT: s_lshr_b32 s9, s1, 24 1491; SI-NEXT: s_lshr_b32 s10, s2, 16 1492; SI-NEXT: s_lshr_b32 s11, s2, 24 1493; SI-NEXT: s_lshr_b32 s12, s3, 16 1494; SI-NEXT: s_lshr_b32 s13, s3, 24 1495; SI-NEXT: s_bfe_u32 s14, s0, 0x80008 1496; SI-NEXT: s_bfe_u32 s15, s1, 0x80008 1497; SI-NEXT: s_bfe_u32 s16, s2, 0x80008 1498; SI-NEXT: s_bfe_u32 s17, s3, 0x80008 1499; SI-NEXT: s_add_i32 s3, s3, s3 1500; SI-NEXT: s_add_i32 s2, s2, s2 1501; SI-NEXT: s_add_i32 s1, s1, s1 1502; SI-NEXT: s_add_i32 s0, s0, s0 1503; SI-NEXT: s_add_i32 s13, s13, s13 1504; SI-NEXT: s_add_i32 s12, s12, s12 1505; SI-NEXT: s_and_b32 s3, s3, 0xff 1506; SI-NEXT: s_add_i32 s17, s17, s17 1507; SI-NEXT: s_add_i32 s11, s11, s11 1508; SI-NEXT: s_add_i32 s10, s10, s10 1509; SI-NEXT: s_and_b32 s2, s2, 0xff 1510; SI-NEXT: s_add_i32 s16, s16, s16 1511; SI-NEXT: s_add_i32 s9, s9, s9 1512; SI-NEXT: s_add_i32 s8, s8, s8 1513; SI-NEXT: s_and_b32 s1, s1, 0xff 1514; SI-NEXT: s_add_i32 s15, s15, s15 1515; SI-NEXT: s_add_i32 s6, s6, s6 1516; SI-NEXT: s_add_i32 s5, s5, s5 1517; SI-NEXT: s_and_b32 s0, s0, 0xff 1518; SI-NEXT: s_add_i32 s14, s14, s14 1519; SI-NEXT: s_lshl_b32 s13, s13, 24 1520; SI-NEXT: s_and_b32 s12, s12, 0xff 1521; SI-NEXT: s_lshl_b32 s17, s17, 8 1522; SI-NEXT: s_lshl_b32 s11, s11, 24 1523; SI-NEXT: s_and_b32 s10, s10, 0xff 1524; SI-NEXT: s_lshl_b32 s16, s16, 8 1525; SI-NEXT: s_lshl_b32 s9, s9, 24 1526; SI-NEXT: 
s_and_b32 s8, s8, 0xff 1527; SI-NEXT: s_lshl_b32 s15, s15, 8 1528; SI-NEXT: s_lshl_b32 s6, s6, 24 1529; SI-NEXT: s_and_b32 s5, s5, 0xff 1530; SI-NEXT: s_lshl_b32 s14, s14, 8 1531; SI-NEXT: s_lshl_b32 s12, s12, 16 1532; SI-NEXT: s_or_b32 s3, s3, s17 1533; SI-NEXT: s_lshl_b32 s10, s10, 16 1534; SI-NEXT: s_or_b32 s2, s2, s16 1535; SI-NEXT: s_lshl_b32 s8, s8, 16 1536; SI-NEXT: s_or_b32 s1, s1, s15 1537; SI-NEXT: s_lshl_b32 s5, s5, 16 1538; SI-NEXT: s_or_b32 s0, s0, s14 1539; SI-NEXT: s_or_b32 s12, s13, s12 1540; SI-NEXT: s_and_b32 s3, s3, 0xffff 1541; SI-NEXT: s_or_b32 s10, s11, s10 1542; SI-NEXT: s_and_b32 s2, s2, 0xffff 1543; SI-NEXT: s_or_b32 s8, s9, s8 1544; SI-NEXT: s_and_b32 s1, s1, 0xffff 1545; SI-NEXT: s_or_b32 s5, s6, s5 1546; SI-NEXT: s_and_b32 s0, s0, 0xffff 1547; SI-NEXT: s_or_b32 s3, s3, s12 1548; SI-NEXT: s_or_b32 s2, s2, s10 1549; SI-NEXT: s_or_b32 s1, s1, s8 1550; SI-NEXT: s_or_b32 s0, s0, s5 1551; SI-NEXT: s_mov_b32 s6, -1 1552; SI-NEXT: s_mov_b32 s5, s4 1553; SI-NEXT: v_mov_b32_e32 v0, s0 1554; SI-NEXT: v_mov_b32_e32 v1, s1 1555; SI-NEXT: v_mov_b32_e32 v2, s2 1556; SI-NEXT: v_mov_b32_e32 v3, s3 1557; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1558; SI-NEXT: s_endpgm 1559; 1560; VI-LABEL: amd_kernel_v16i8: 1561; VI: ; %bb.0: ; %entry 1562; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1563; VI-NEXT: v_mov_b32_e32 v4, 0 1564; VI-NEXT: v_mov_b32_e32 v5, 0 1565; VI-NEXT: s_waitcnt lgkmcnt(0) 1566; VI-NEXT: s_lshr_b32 s5, s3, 16 1567; VI-NEXT: s_lshr_b32 s4, s3, 24 1568; VI-NEXT: s_add_i32 s5, s5, s5 1569; VI-NEXT: s_bfe_u32 s12, s3, 0x80008 1570; VI-NEXT: s_add_i32 s4, s4, s4 1571; VI-NEXT: s_and_b32 s5, s5, 0xff 1572; VI-NEXT: s_add_i32 s12, s12, s12 1573; VI-NEXT: s_add_i32 s3, s3, s3 1574; VI-NEXT: s_lshl_b32 s4, s4, 24 1575; VI-NEXT: s_lshl_b32 s5, s5, 16 1576; VI-NEXT: s_lshr_b32 s7, s2, 16 1577; VI-NEXT: s_or_b32 s4, s4, s5 1578; VI-NEXT: s_and_b32 s3, s3, 0xff 1579; VI-NEXT: s_lshl_b32 s5, s12, 8 1580; VI-NEXT: s_lshr_b32 s6, s2, 24 1581; 
VI-NEXT: s_add_i32 s7, s7, s7 1582; VI-NEXT: s_or_b32 s3, s3, s5 1583; VI-NEXT: s_bfe_u32 s13, s2, 0x80008 1584; VI-NEXT: s_add_i32 s6, s6, s6 1585; VI-NEXT: s_and_b32 s3, s3, 0xffff 1586; VI-NEXT: s_and_b32 s5, s7, 0xff 1587; VI-NEXT: s_add_i32 s13, s13, s13 1588; VI-NEXT: s_add_i32 s2, s2, s2 1589; VI-NEXT: s_or_b32 s3, s3, s4 1590; VI-NEXT: s_lshl_b32 s4, s6, 24 1591; VI-NEXT: s_lshl_b32 s5, s5, 16 1592; VI-NEXT: s_lshr_b32 s9, s1, 16 1593; VI-NEXT: s_or_b32 s4, s4, s5 1594; VI-NEXT: s_and_b32 s2, s2, 0xff 1595; VI-NEXT: s_lshl_b32 s5, s13, 8 1596; VI-NEXT: s_lshr_b32 s8, s1, 24 1597; VI-NEXT: s_add_i32 s9, s9, s9 1598; VI-NEXT: s_or_b32 s2, s2, s5 1599; VI-NEXT: s_bfe_u32 s14, s1, 0x80008 1600; VI-NEXT: s_add_i32 s8, s8, s8 1601; VI-NEXT: s_and_b32 s2, s2, 0xffff 1602; VI-NEXT: s_and_b32 s5, s9, 0xff 1603; VI-NEXT: s_add_i32 s14, s14, s14 1604; VI-NEXT: s_add_i32 s1, s1, s1 1605; VI-NEXT: s_or_b32 s2, s2, s4 1606; VI-NEXT: s_lshl_b32 s4, s8, 24 1607; VI-NEXT: s_lshl_b32 s5, s5, 16 1608; VI-NEXT: s_lshr_b32 s11, s0, 16 1609; VI-NEXT: s_or_b32 s4, s4, s5 1610; VI-NEXT: s_and_b32 s1, s1, 0xff 1611; VI-NEXT: s_lshl_b32 s5, s14, 8 1612; VI-NEXT: s_lshr_b32 s10, s0, 24 1613; VI-NEXT: s_add_i32 s11, s11, s11 1614; VI-NEXT: s_or_b32 s1, s1, s5 1615; VI-NEXT: s_bfe_u32 s15, s0, 0x80008 1616; VI-NEXT: s_add_i32 s10, s10, s10 1617; VI-NEXT: s_and_b32 s1, s1, 0xffff 1618; VI-NEXT: s_and_b32 s5, s11, 0xff 1619; VI-NEXT: s_add_i32 s15, s15, s15 1620; VI-NEXT: s_add_i32 s0, s0, s0 1621; VI-NEXT: s_or_b32 s1, s1, s4 1622; VI-NEXT: s_lshl_b32 s4, s10, 24 1623; VI-NEXT: s_lshl_b32 s5, s5, 16 1624; VI-NEXT: s_or_b32 s4, s4, s5 1625; VI-NEXT: s_and_b32 s0, s0, 0xff 1626; VI-NEXT: s_lshl_b32 s5, s15, 8 1627; VI-NEXT: s_or_b32 s0, s0, s5 1628; VI-NEXT: s_and_b32 s0, s0, 0xffff 1629; VI-NEXT: s_or_b32 s0, s0, s4 1630; VI-NEXT: v_mov_b32_e32 v0, s0 1631; VI-NEXT: v_mov_b32_e32 v1, s1 1632; VI-NEXT: v_mov_b32_e32 v2, s2 1633; VI-NEXT: v_mov_b32_e32 v3, s3 1634; VI-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] 1635; VI-NEXT: s_endpgm 1636; 1637; GFX11-LABEL: amd_kernel_v16i8: 1638; GFX11: ; %bb.0: ; %entry 1639; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1640; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1641; GFX11-NEXT: s_lshr_b32 s6, s1, 16 1642; GFX11-NEXT: s_lshr_b32 s7, s1, 24 1643; GFX11-NEXT: s_lshr_b32 s8, s2, 16 1644; GFX11-NEXT: s_lshr_b32 s9, s2, 24 1645; GFX11-NEXT: s_lshr_b32 s10, s3, 16 1646; GFX11-NEXT: s_lshr_b32 s11, s3, 24 1647; GFX11-NEXT: s_lshr_b32 s4, s0, 16 1648; GFX11-NEXT: s_lshr_b32 s5, s0, 24 1649; GFX11-NEXT: s_bfe_u32 s12, s0, 0x80008 1650; GFX11-NEXT: s_bfe_u32 s13, s1, 0x80008 1651; GFX11-NEXT: s_bfe_u32 s14, s2, 0x80008 1652; GFX11-NEXT: s_bfe_u32 s15, s3, 0x80008 1653; GFX11-NEXT: s_add_i32 s11, s11, s11 1654; GFX11-NEXT: s_add_i32 s10, s10, s10 1655; GFX11-NEXT: s_add_i32 s9, s9, s9 1656; GFX11-NEXT: s_add_i32 s8, s8, s8 1657; GFX11-NEXT: s_add_i32 s7, s7, s7 1658; GFX11-NEXT: s_add_i32 s6, s6, s6 1659; GFX11-NEXT: s_add_i32 s3, s3, s3 1660; GFX11-NEXT: s_add_i32 s2, s2, s2 1661; GFX11-NEXT: s_add_i32 s15, s15, s15 1662; GFX11-NEXT: s_add_i32 s14, s14, s14 1663; GFX11-NEXT: s_lshl_b32 s11, s11, 8 1664; GFX11-NEXT: s_and_b32 s10, s10, 0xff 1665; GFX11-NEXT: s_lshl_b32 s9, s9, 8 1666; GFX11-NEXT: s_and_b32 s8, s8, 0xff 1667; GFX11-NEXT: s_add_i32 s1, s1, s1 1668; GFX11-NEXT: s_add_i32 s13, s13, s13 1669; GFX11-NEXT: s_lshl_b32 s7, s7, 8 1670; GFX11-NEXT: s_and_b32 s6, s6, 0xff 1671; GFX11-NEXT: s_add_i32 s0, s0, s0 1672; GFX11-NEXT: s_add_i32 s12, s12, s12 1673; GFX11-NEXT: s_add_i32 s5, s5, s5 1674; GFX11-NEXT: s_add_i32 s4, s4, s4 1675; GFX11-NEXT: s_and_b32 s3, s3, 0xff 1676; GFX11-NEXT: s_and_b32 s2, s2, 0xff 1677; GFX11-NEXT: s_lshl_b32 s15, s15, 8 1678; GFX11-NEXT: s_or_b32 s10, s10, s11 1679; GFX11-NEXT: s_lshl_b32 s11, s14, 8 1680; GFX11-NEXT: s_or_b32 s8, s8, s9 1681; GFX11-NEXT: s_and_b32 s1, s1, 0xff 1682; GFX11-NEXT: s_lshl_b32 s9, s13, 8 1683; GFX11-NEXT: s_or_b32 s6, s6, s7 1684; GFX11-NEXT: s_and_b32 
s0, s0, 0xff 1685; GFX11-NEXT: s_lshl_b32 s7, s12, 8 1686; GFX11-NEXT: s_lshl_b32 s5, s5, 8 1687; GFX11-NEXT: s_and_b32 s4, s4, 0xff 1688; GFX11-NEXT: s_or_b32 s3, s3, s15 1689; GFX11-NEXT: s_or_b32 s2, s2, s11 1690; GFX11-NEXT: s_or_b32 s1, s1, s9 1691; GFX11-NEXT: s_or_b32 s0, s0, s7 1692; GFX11-NEXT: s_or_b32 s4, s4, s5 1693; GFX11-NEXT: s_and_b32 s3, s3, 0xffff 1694; GFX11-NEXT: s_lshl_b32 s10, s10, 16 1695; GFX11-NEXT: s_and_b32 s2, s2, 0xffff 1696; GFX11-NEXT: s_lshl_b32 s8, s8, 16 1697; GFX11-NEXT: s_and_b32 s1, s1, 0xffff 1698; GFX11-NEXT: s_and_b32 s0, s0, 0xffff 1699; GFX11-NEXT: s_lshl_b32 s4, s4, 16 1700; GFX11-NEXT: s_lshl_b32 s5, s6, 16 1701; GFX11-NEXT: s_or_b32 s3, s3, s10 1702; GFX11-NEXT: s_or_b32 s2, s2, s8 1703; GFX11-NEXT: s_or_b32 s0, s0, s4 1704; GFX11-NEXT: s_or_b32 s1, s1, s5 1705; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3 1706; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1707; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, 0 1708; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off 1709; GFX11-NEXT: s_endpgm 1710entry: 1711 %add = add <16 x i8> %arg0, %arg0 1712 store <16 x i8> %add, ptr addrspace(1) null 1713 ret void 1714} 1715 1716define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { 1717; SI-LABEL: amd_kernel_v32i8: 1718; SI: ; %bb.0: ; %entry 1719; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 1720; SI-NEXT: s_mov_b32 s9, 0 1721; SI-NEXT: s_mov_b32 s8, 16 1722; SI-NEXT: s_mov_b32 s11, 0xf000 1723; SI-NEXT: s_mov_b32 s10, -1 1724; SI-NEXT: s_waitcnt lgkmcnt(0) 1725; SI-NEXT: s_lshr_b32 s12, s4, 16 1726; SI-NEXT: s_lshr_b32 s13, s4, 24 1727; SI-NEXT: s_lshr_b32 s14, s5, 16 1728; SI-NEXT: s_lshr_b32 s15, s5, 24 1729; SI-NEXT: s_lshr_b32 s16, s6, 16 1730; SI-NEXT: s_lshr_b32 s17, s6, 24 1731; SI-NEXT: s_lshr_b32 s18, s7, 16 1732; SI-NEXT: s_lshr_b32 s19, s7, 24 1733; SI-NEXT: s_lshr_b32 s20, s0, 16 1734; SI-NEXT: s_lshr_b32 s21, s0, 24 1735; SI-NEXT: s_lshr_b32 s22, s1, 16 1736; SI-NEXT: 
s_lshr_b32 s23, s1, 24 1737; SI-NEXT: s_lshr_b32 s24, s2, 16 1738; SI-NEXT: s_lshr_b32 s25, s2, 24 1739; SI-NEXT: s_lshr_b32 s26, s3, 16 1740; SI-NEXT: s_lshr_b32 s27, s3, 24 1741; SI-NEXT: s_bfe_u32 s28, s4, 0x80008 1742; SI-NEXT: s_bfe_u32 s29, s5, 0x80008 1743; SI-NEXT: s_bfe_u32 s30, s6, 0x80008 1744; SI-NEXT: s_bfe_u32 s31, s7, 0x80008 1745; SI-NEXT: s_bfe_u32 s33, s0, 0x80008 1746; SI-NEXT: s_bfe_u32 s34, s1, 0x80008 1747; SI-NEXT: s_bfe_u32 s35, s2, 0x80008 1748; SI-NEXT: s_bfe_u32 s36, s3, 0x80008 1749; SI-NEXT: s_add_i32 s3, s3, s3 1750; SI-NEXT: s_add_i32 s2, s2, s2 1751; SI-NEXT: s_add_i32 s1, s1, s1 1752; SI-NEXT: s_add_i32 s0, s0, s0 1753; SI-NEXT: s_add_i32 s7, s7, s7 1754; SI-NEXT: s_add_i32 s6, s6, s6 1755; SI-NEXT: s_add_i32 s5, s5, s5 1756; SI-NEXT: s_add_i32 s4, s4, s4 1757; SI-NEXT: s_add_i32 s27, s27, s27 1758; SI-NEXT: s_add_i32 s26, s26, s26 1759; SI-NEXT: s_and_b32 s3, s3, 0xff 1760; SI-NEXT: s_add_i32 s36, s36, s36 1761; SI-NEXT: s_add_i32 s25, s25, s25 1762; SI-NEXT: s_add_i32 s24, s24, s24 1763; SI-NEXT: s_and_b32 s2, s2, 0xff 1764; SI-NEXT: s_add_i32 s35, s35, s35 1765; SI-NEXT: s_add_i32 s23, s23, s23 1766; SI-NEXT: s_add_i32 s22, s22, s22 1767; SI-NEXT: s_and_b32 s1, s1, 0xff 1768; SI-NEXT: s_add_i32 s34, s34, s34 1769; SI-NEXT: s_add_i32 s21, s21, s21 1770; SI-NEXT: s_add_i32 s20, s20, s20 1771; SI-NEXT: s_and_b32 s0, s0, 0xff 1772; SI-NEXT: s_add_i32 s33, s33, s33 1773; SI-NEXT: s_add_i32 s19, s19, s19 1774; SI-NEXT: s_add_i32 s18, s18, s18 1775; SI-NEXT: s_and_b32 s7, s7, 0xff 1776; SI-NEXT: s_add_i32 s31, s31, s31 1777; SI-NEXT: s_add_i32 s17, s17, s17 1778; SI-NEXT: s_add_i32 s16, s16, s16 1779; SI-NEXT: s_and_b32 s6, s6, 0xff 1780; SI-NEXT: s_add_i32 s30, s30, s30 1781; SI-NEXT: s_add_i32 s15, s15, s15 1782; SI-NEXT: s_add_i32 s14, s14, s14 1783; SI-NEXT: s_and_b32 s5, s5, 0xff 1784; SI-NEXT: s_add_i32 s29, s29, s29 1785; SI-NEXT: s_add_i32 s13, s13, s13 1786; SI-NEXT: s_add_i32 s12, s12, s12 1787; SI-NEXT: s_and_b32 s4, s4, 0xff 
1788; SI-NEXT: s_add_i32 s28, s28, s28 1789; SI-NEXT: s_lshl_b32 s27, s27, 24 1790; SI-NEXT: s_and_b32 s26, s26, 0xff 1791; SI-NEXT: s_lshl_b32 s36, s36, 8 1792; SI-NEXT: s_lshl_b32 s25, s25, 24 1793; SI-NEXT: s_and_b32 s24, s24, 0xff 1794; SI-NEXT: s_lshl_b32 s35, s35, 8 1795; SI-NEXT: s_lshl_b32 s23, s23, 24 1796; SI-NEXT: s_and_b32 s22, s22, 0xff 1797; SI-NEXT: s_lshl_b32 s34, s34, 8 1798; SI-NEXT: s_lshl_b32 s21, s21, 24 1799; SI-NEXT: s_and_b32 s20, s20, 0xff 1800; SI-NEXT: s_lshl_b32 s33, s33, 8 1801; SI-NEXT: s_lshl_b32 s19, s19, 24 1802; SI-NEXT: s_and_b32 s18, s18, 0xff 1803; SI-NEXT: s_lshl_b32 s31, s31, 8 1804; SI-NEXT: s_lshl_b32 s17, s17, 24 1805; SI-NEXT: s_and_b32 s16, s16, 0xff 1806; SI-NEXT: s_lshl_b32 s30, s30, 8 1807; SI-NEXT: s_lshl_b32 s15, s15, 24 1808; SI-NEXT: s_and_b32 s14, s14, 0xff 1809; SI-NEXT: s_lshl_b32 s29, s29, 8 1810; SI-NEXT: s_lshl_b32 s13, s13, 24 1811; SI-NEXT: s_and_b32 s12, s12, 0xff 1812; SI-NEXT: s_lshl_b32 s28, s28, 8 1813; SI-NEXT: s_lshl_b32 s26, s26, 16 1814; SI-NEXT: s_or_b32 s3, s3, s36 1815; SI-NEXT: s_lshl_b32 s24, s24, 16 1816; SI-NEXT: s_or_b32 s2, s2, s35 1817; SI-NEXT: s_lshl_b32 s22, s22, 16 1818; SI-NEXT: s_or_b32 s1, s1, s34 1819; SI-NEXT: s_lshl_b32 s20, s20, 16 1820; SI-NEXT: s_or_b32 s0, s0, s33 1821; SI-NEXT: s_lshl_b32 s18, s18, 16 1822; SI-NEXT: s_or_b32 s7, s7, s31 1823; SI-NEXT: s_lshl_b32 s16, s16, 16 1824; SI-NEXT: s_or_b32 s6, s6, s30 1825; SI-NEXT: s_lshl_b32 s14, s14, 16 1826; SI-NEXT: s_or_b32 s5, s5, s29 1827; SI-NEXT: s_lshl_b32 s12, s12, 16 1828; SI-NEXT: s_or_b32 s4, s4, s28 1829; SI-NEXT: s_or_b32 s26, s27, s26 1830; SI-NEXT: s_and_b32 s3, s3, 0xffff 1831; SI-NEXT: s_or_b32 s24, s25, s24 1832; SI-NEXT: s_and_b32 s2, s2, 0xffff 1833; SI-NEXT: s_or_b32 s22, s23, s22 1834; SI-NEXT: s_and_b32 s1, s1, 0xffff 1835; SI-NEXT: s_or_b32 s20, s21, s20 1836; SI-NEXT: s_and_b32 s0, s0, 0xffff 1837; SI-NEXT: s_or_b32 s18, s19, s18 1838; SI-NEXT: s_and_b32 s7, s7, 0xffff 1839; SI-NEXT: s_or_b32 s16, s17, 
s16 1840; SI-NEXT: s_and_b32 s6, s6, 0xffff 1841; SI-NEXT: s_or_b32 s14, s15, s14 1842; SI-NEXT: s_and_b32 s5, s5, 0xffff 1843; SI-NEXT: s_or_b32 s12, s13, s12 1844; SI-NEXT: s_and_b32 s4, s4, 0xffff 1845; SI-NEXT: s_or_b32 s3, s3, s26 1846; SI-NEXT: s_or_b32 s2, s2, s24 1847; SI-NEXT: s_or_b32 s1, s1, s22 1848; SI-NEXT: s_or_b32 s7, s7, s18 1849; SI-NEXT: s_or_b32 s6, s6, s16 1850; SI-NEXT: s_or_b32 s5, s5, s14 1851; SI-NEXT: s_or_b32 s4, s4, s12 1852; SI-NEXT: s_or_b32 s0, s0, s20 1853; SI-NEXT: v_mov_b32_e32 v0, s4 1854; SI-NEXT: v_mov_b32_e32 v1, s5 1855; SI-NEXT: v_mov_b32_e32 v2, s6 1856; SI-NEXT: v_mov_b32_e32 v3, s7 1857; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 1858; SI-NEXT: s_waitcnt expcnt(0) 1859; SI-NEXT: v_mov_b32_e32 v0, s0 1860; SI-NEXT: v_mov_b32_e32 v1, s1 1861; SI-NEXT: v_mov_b32_e32 v2, s2 1862; SI-NEXT: v_mov_b32_e32 v3, s3 1863; SI-NEXT: s_mov_b32 s8, s9 1864; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 1865; SI-NEXT: s_endpgm 1866; 1867; VI-LABEL: amd_kernel_v32i8: 1868; VI: ; %bb.0: ; %entry 1869; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 1870; VI-NEXT: v_mov_b32_e32 v4, 16 1871; VI-NEXT: v_mov_b32_e32 v5, 0 1872; VI-NEXT: s_waitcnt lgkmcnt(0) 1873; VI-NEXT: s_lshr_b32 s9, s3, 16 1874; VI-NEXT: s_lshr_b32 s8, s3, 24 1875; VI-NEXT: s_add_i32 s9, s9, s9 1876; VI-NEXT: s_bfe_u32 s24, s3, 0x80008 1877; VI-NEXT: s_add_i32 s8, s8, s8 1878; VI-NEXT: s_and_b32 s9, s9, 0xff 1879; VI-NEXT: s_add_i32 s24, s24, s24 1880; VI-NEXT: s_add_i32 s3, s3, s3 1881; VI-NEXT: s_lshl_b32 s8, s8, 24 1882; VI-NEXT: s_lshl_b32 s9, s9, 16 1883; VI-NEXT: s_lshr_b32 s11, s2, 16 1884; VI-NEXT: s_or_b32 s8, s8, s9 1885; VI-NEXT: s_and_b32 s3, s3, 0xff 1886; VI-NEXT: s_lshl_b32 s9, s24, 8 1887; VI-NEXT: s_lshr_b32 s10, s2, 24 1888; VI-NEXT: s_add_i32 s11, s11, s11 1889; VI-NEXT: s_or_b32 s3, s3, s9 1890; VI-NEXT: s_bfe_u32 s25, s2, 0x80008 1891; VI-NEXT: s_add_i32 s10, s10, s10 1892; VI-NEXT: s_and_b32 s3, s3, 0xffff 1893; VI-NEXT: s_and_b32 s9, 
s11, 0xff 1894; VI-NEXT: s_add_i32 s25, s25, s25 1895; VI-NEXT: s_add_i32 s2, s2, s2 1896; VI-NEXT: s_or_b32 s3, s3, s8 1897; VI-NEXT: s_lshl_b32 s8, s10, 24 1898; VI-NEXT: s_lshl_b32 s9, s9, 16 1899; VI-NEXT: s_lshr_b32 s13, s1, 16 1900; VI-NEXT: s_or_b32 s8, s8, s9 1901; VI-NEXT: s_and_b32 s2, s2, 0xff 1902; VI-NEXT: s_lshl_b32 s9, s25, 8 1903; VI-NEXT: s_lshr_b32 s12, s1, 24 1904; VI-NEXT: s_add_i32 s13, s13, s13 1905; VI-NEXT: s_or_b32 s2, s2, s9 1906; VI-NEXT: s_bfe_u32 s26, s1, 0x80008 1907; VI-NEXT: s_add_i32 s12, s12, s12 1908; VI-NEXT: s_and_b32 s2, s2, 0xffff 1909; VI-NEXT: s_and_b32 s9, s13, 0xff 1910; VI-NEXT: s_add_i32 s26, s26, s26 1911; VI-NEXT: s_add_i32 s1, s1, s1 1912; VI-NEXT: s_or_b32 s2, s2, s8 1913; VI-NEXT: s_lshl_b32 s8, s12, 24 1914; VI-NEXT: s_lshl_b32 s9, s9, 16 1915; VI-NEXT: s_lshr_b32 s15, s0, 16 1916; VI-NEXT: s_or_b32 s8, s8, s9 1917; VI-NEXT: s_and_b32 s1, s1, 0xff 1918; VI-NEXT: s_lshl_b32 s9, s26, 8 1919; VI-NEXT: s_lshr_b32 s14, s0, 24 1920; VI-NEXT: s_add_i32 s15, s15, s15 1921; VI-NEXT: s_or_b32 s1, s1, s9 1922; VI-NEXT: s_bfe_u32 s27, s0, 0x80008 1923; VI-NEXT: s_add_i32 s14, s14, s14 1924; VI-NEXT: s_and_b32 s1, s1, 0xffff 1925; VI-NEXT: s_and_b32 s9, s15, 0xff 1926; VI-NEXT: s_add_i32 s27, s27, s27 1927; VI-NEXT: s_add_i32 s0, s0, s0 1928; VI-NEXT: s_or_b32 s1, s1, s8 1929; VI-NEXT: s_lshl_b32 s8, s14, 24 1930; VI-NEXT: s_lshl_b32 s9, s9, 16 1931; VI-NEXT: s_lshr_b32 s17, s7, 16 1932; VI-NEXT: s_or_b32 s8, s8, s9 1933; VI-NEXT: s_and_b32 s0, s0, 0xff 1934; VI-NEXT: s_lshl_b32 s9, s27, 8 1935; VI-NEXT: s_lshr_b32 s16, s7, 24 1936; VI-NEXT: s_add_i32 s17, s17, s17 1937; VI-NEXT: s_or_b32 s0, s0, s9 1938; VI-NEXT: s_bfe_u32 s28, s7, 0x80008 1939; VI-NEXT: s_add_i32 s16, s16, s16 1940; VI-NEXT: s_and_b32 s0, s0, 0xffff 1941; VI-NEXT: s_and_b32 s9, s17, 0xff 1942; VI-NEXT: s_add_i32 s28, s28, s28 1943; VI-NEXT: s_add_i32 s7, s7, s7 1944; VI-NEXT: s_or_b32 s0, s0, s8 1945; VI-NEXT: s_lshl_b32 s8, s16, 24 1946; VI-NEXT: s_lshl_b32 
s9, s9, 16 1947; VI-NEXT: s_lshr_b32 s19, s6, 16 1948; VI-NEXT: s_or_b32 s8, s8, s9 1949; VI-NEXT: s_and_b32 s7, s7, 0xff 1950; VI-NEXT: s_lshl_b32 s9, s28, 8 1951; VI-NEXT: s_lshr_b32 s18, s6, 24 1952; VI-NEXT: s_add_i32 s19, s19, s19 1953; VI-NEXT: s_or_b32 s7, s7, s9 1954; VI-NEXT: s_bfe_u32 s29, s6, 0x80008 1955; VI-NEXT: s_add_i32 s18, s18, s18 1956; VI-NEXT: s_and_b32 s7, s7, 0xffff 1957; VI-NEXT: s_and_b32 s9, s19, 0xff 1958; VI-NEXT: s_add_i32 s29, s29, s29 1959; VI-NEXT: s_add_i32 s6, s6, s6 1960; VI-NEXT: s_or_b32 s7, s7, s8 1961; VI-NEXT: s_lshl_b32 s8, s18, 24 1962; VI-NEXT: s_lshl_b32 s9, s9, 16 1963; VI-NEXT: s_lshr_b32 s21, s5, 16 1964; VI-NEXT: s_or_b32 s8, s8, s9 1965; VI-NEXT: s_and_b32 s6, s6, 0xff 1966; VI-NEXT: s_lshl_b32 s9, s29, 8 1967; VI-NEXT: s_lshr_b32 s20, s5, 24 1968; VI-NEXT: s_add_i32 s21, s21, s21 1969; VI-NEXT: s_or_b32 s6, s6, s9 1970; VI-NEXT: s_bfe_u32 s30, s5, 0x80008 1971; VI-NEXT: s_add_i32 s20, s20, s20 1972; VI-NEXT: s_and_b32 s6, s6, 0xffff 1973; VI-NEXT: s_and_b32 s9, s21, 0xff 1974; VI-NEXT: s_add_i32 s30, s30, s30 1975; VI-NEXT: s_add_i32 s5, s5, s5 1976; VI-NEXT: s_or_b32 s6, s6, s8 1977; VI-NEXT: s_lshl_b32 s8, s20, 24 1978; VI-NEXT: s_lshl_b32 s9, s9, 16 1979; VI-NEXT: s_lshr_b32 s23, s4, 16 1980; VI-NEXT: s_or_b32 s8, s8, s9 1981; VI-NEXT: s_and_b32 s5, s5, 0xff 1982; VI-NEXT: s_lshl_b32 s9, s30, 8 1983; VI-NEXT: s_lshr_b32 s22, s4, 24 1984; VI-NEXT: s_add_i32 s23, s23, s23 1985; VI-NEXT: s_or_b32 s5, s5, s9 1986; VI-NEXT: s_bfe_u32 s31, s4, 0x80008 1987; VI-NEXT: s_add_i32 s22, s22, s22 1988; VI-NEXT: s_and_b32 s5, s5, 0xffff 1989; VI-NEXT: s_and_b32 s9, s23, 0xff 1990; VI-NEXT: s_add_i32 s31, s31, s31 1991; VI-NEXT: s_add_i32 s4, s4, s4 1992; VI-NEXT: s_or_b32 s5, s5, s8 1993; VI-NEXT: s_lshl_b32 s8, s22, 24 1994; VI-NEXT: s_lshl_b32 s9, s9, 16 1995; VI-NEXT: s_or_b32 s8, s8, s9 1996; VI-NEXT: s_and_b32 s4, s4, 0xff 1997; VI-NEXT: s_lshl_b32 s9, s31, 8 1998; VI-NEXT: s_or_b32 s4, s4, s9 1999; VI-NEXT: s_and_b32 s4, 
s4, 0xffff 2000; VI-NEXT: s_or_b32 s4, s4, s8 2001; VI-NEXT: v_mov_b32_e32 v0, s4 2002; VI-NEXT: v_mov_b32_e32 v1, s5 2003; VI-NEXT: v_mov_b32_e32 v2, s6 2004; VI-NEXT: v_mov_b32_e32 v3, s7 2005; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2006; VI-NEXT: v_mov_b32_e32 v4, 0 2007; VI-NEXT: v_mov_b32_e32 v0, s0 2008; VI-NEXT: v_mov_b32_e32 v1, s1 2009; VI-NEXT: v_mov_b32_e32 v2, s2 2010; VI-NEXT: v_mov_b32_e32 v5, 0 2011; VI-NEXT: v_mov_b32_e32 v3, s3 2012; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2013; VI-NEXT: s_endpgm 2014; 2015; GFX11-LABEL: amd_kernel_v32i8: 2016; GFX11: ; %bb.0: ; %entry 2017; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 2018; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2019; GFX11-NEXT: s_lshr_b32 s16, s0, 16 2020; GFX11-NEXT: s_lshr_b32 s17, s0, 24 2021; GFX11-NEXT: s_lshr_b32 s20, s2, 16 2022; GFX11-NEXT: s_lshr_b32 s21, s2, 24 2023; GFX11-NEXT: s_lshr_b32 s14, s7, 16 2024; GFX11-NEXT: s_lshr_b32 s15, s7, 24 2025; GFX11-NEXT: s_bfe_u32 s27, s7, 0x80008 2026; GFX11-NEXT: s_add_i32 s17, s17, s17 2027; GFX11-NEXT: s_add_i32 s16, s16, s16 2028; GFX11-NEXT: s_lshr_b32 s18, s1, 16 2029; GFX11-NEXT: s_lshr_b32 s19, s1, 24 2030; GFX11-NEXT: s_lshr_b32 s22, s3, 16 2031; GFX11-NEXT: s_lshr_b32 s23, s3, 24 2032; GFX11-NEXT: s_bfe_u32 s29, s1, 0x80008 2033; GFX11-NEXT: s_bfe_u32 s30, s3, 0x80008 2034; GFX11-NEXT: s_add_i32 s21, s21, s21 2035; GFX11-NEXT: s_add_i32 s20, s20, s20 2036; GFX11-NEXT: s_lshl_b32 s17, s17, 8 2037; GFX11-NEXT: s_and_b32 s16, s16, 0xff 2038; GFX11-NEXT: s_add_i32 s7, s7, s7 2039; GFX11-NEXT: s_add_i32 s27, s27, s27 2040; GFX11-NEXT: s_add_i32 s15, s15, s15 2041; GFX11-NEXT: s_add_i32 s14, s14, s14 2042; GFX11-NEXT: s_add_i32 s3, s3, s3 2043; GFX11-NEXT: s_add_i32 s30, s30, s30 2044; GFX11-NEXT: s_add_i32 s23, s23, s23 2045; GFX11-NEXT: s_add_i32 s22, s22, s22 2046; GFX11-NEXT: s_lshl_b32 s21, s21, 8 2047; GFX11-NEXT: s_and_b32 s20, s20, 0xff 2048; GFX11-NEXT: s_add_i32 s1, s1, s1 2049; GFX11-NEXT: s_add_i32 s29, s29, s29 2050; GFX11-NEXT: 
s_add_i32 s19, s19, s19 2051; GFX11-NEXT: s_add_i32 s18, s18, s18 2052; GFX11-NEXT: s_lshr_b32 s10, s5, 16 2053; GFX11-NEXT: s_lshr_b32 s11, s5, 24 2054; GFX11-NEXT: s_lshr_b32 s12, s6, 16 2055; GFX11-NEXT: s_lshr_b32 s13, s6, 24 2056; GFX11-NEXT: s_or_b32 s16, s16, s17 2057; GFX11-NEXT: s_and_b32 s7, s7, 0xff 2058; GFX11-NEXT: s_lshl_b32 s17, s27, 8 2059; GFX11-NEXT: s_lshl_b32 s15, s15, 8 2060; GFX11-NEXT: s_and_b32 s14, s14, 0xff 2061; GFX11-NEXT: s_and_b32 s3, s3, 0xff 2062; GFX11-NEXT: s_lshl_b32 s30, s30, 8 2063; GFX11-NEXT: s_lshl_b32 s23, s23, 8 2064; GFX11-NEXT: s_and_b32 s22, s22, 0xff 2065; GFX11-NEXT: s_or_b32 s20, s20, s21 2066; GFX11-NEXT: s_and_b32 s1, s1, 0xff 2067; GFX11-NEXT: s_lshl_b32 s21, s29, 8 2068; GFX11-NEXT: s_lshl_b32 s19, s19, 8 2069; GFX11-NEXT: s_and_b32 s18, s18, 0xff 2070; GFX11-NEXT: s_lshr_b32 s8, s4, 16 2071; GFX11-NEXT: s_lshr_b32 s9, s4, 24 2072; GFX11-NEXT: s_bfe_u32 s24, s4, 0x80008 2073; GFX11-NEXT: s_bfe_u32 s25, s5, 0x80008 2074; GFX11-NEXT: s_bfe_u32 s26, s6, 0x80008 2075; GFX11-NEXT: s_or_b32 s7, s7, s17 2076; GFX11-NEXT: s_or_b32 s14, s14, s15 2077; GFX11-NEXT: s_add_i32 s13, s13, s13 2078; GFX11-NEXT: s_add_i32 s12, s12, s12 2079; GFX11-NEXT: s_add_i32 s11, s11, s11 2080; GFX11-NEXT: s_add_i32 s10, s10, s10 2081; GFX11-NEXT: s_bfe_u32 s28, s0, 0x80008 2082; GFX11-NEXT: s_or_b32 s3, s3, s30 2083; GFX11-NEXT: s_or_b32 s22, s22, s23 2084; GFX11-NEXT: s_bfe_u32 s23, s2, 0x80008 2085; GFX11-NEXT: s_or_b32 s1, s1, s21 2086; GFX11-NEXT: s_or_b32 s18, s18, s19 2087; GFX11-NEXT: s_and_b32 s7, s7, 0xffff 2088; GFX11-NEXT: s_lshl_b32 s14, s14, 16 2089; GFX11-NEXT: s_add_i32 s6, s6, s6 2090; GFX11-NEXT: s_add_i32 s26, s26, s26 2091; GFX11-NEXT: s_lshl_b32 s13, s13, 8 2092; GFX11-NEXT: s_and_b32 s12, s12, 0xff 2093; GFX11-NEXT: s_add_i32 s5, s5, s5 2094; GFX11-NEXT: s_add_i32 s25, s25, s25 2095; GFX11-NEXT: s_lshl_b32 s11, s11, 8 2096; GFX11-NEXT: s_and_b32 s10, s10, 0xff 2097; GFX11-NEXT: s_add_i32 s4, s4, s4 2098; GFX11-NEXT: 
s_add_i32 s24, s24, s24 2099; GFX11-NEXT: s_add_i32 s9, s9, s9 2100; GFX11-NEXT: s_add_i32 s8, s8, s8 2101; GFX11-NEXT: s_and_b32 s3, s3, 0xffff 2102; GFX11-NEXT: s_lshl_b32 s22, s22, 16 2103; GFX11-NEXT: s_add_i32 s2, s2, s2 2104; GFX11-NEXT: s_add_i32 s23, s23, s23 2105; GFX11-NEXT: s_and_b32 s1, s1, 0xffff 2106; GFX11-NEXT: s_lshl_b32 s18, s18, 16 2107; GFX11-NEXT: s_add_i32 s0, s0, s0 2108; GFX11-NEXT: s_add_i32 s28, s28, s28 2109; GFX11-NEXT: s_or_b32 s7, s7, s14 2110; GFX11-NEXT: s_and_b32 s6, s6, 0xff 2111; GFX11-NEXT: s_lshl_b32 s14, s26, 8 2112; GFX11-NEXT: s_or_b32 s12, s12, s13 2113; GFX11-NEXT: s_and_b32 s5, s5, 0xff 2114; GFX11-NEXT: s_lshl_b32 s13, s25, 8 2115; GFX11-NEXT: s_or_b32 s10, s10, s11 2116; GFX11-NEXT: s_and_b32 s4, s4, 0xff 2117; GFX11-NEXT: s_lshl_b32 s11, s24, 8 2118; GFX11-NEXT: s_lshl_b32 s9, s9, 8 2119; GFX11-NEXT: s_and_b32 s8, s8, 0xff 2120; GFX11-NEXT: s_or_b32 s3, s3, s22 2121; GFX11-NEXT: s_and_b32 s2, s2, 0xff 2122; GFX11-NEXT: s_lshl_b32 s22, s23, 8 2123; GFX11-NEXT: s_or_b32 s1, s1, s18 2124; GFX11-NEXT: s_and_b32 s0, s0, 0xff 2125; GFX11-NEXT: s_lshl_b32 s18, s28, 8 2126; GFX11-NEXT: s_or_b32 s6, s6, s14 2127; GFX11-NEXT: s_or_b32 s5, s5, s13 2128; GFX11-NEXT: s_or_b32 s4, s4, s11 2129; GFX11-NEXT: s_or_b32 s8, s8, s9 2130; GFX11-NEXT: s_or_b32 s2, s2, s22 2131; GFX11-NEXT: s_or_b32 s0, s0, s18 2132; GFX11-NEXT: s_and_b32 s6, s6, 0xffff 2133; GFX11-NEXT: s_lshl_b32 s12, s12, 16 2134; GFX11-NEXT: s_and_b32 s5, s5, 0xffff 2135; GFX11-NEXT: s_and_b32 s4, s4, 0xffff 2136; GFX11-NEXT: s_lshl_b32 s8, s8, 16 2137; GFX11-NEXT: s_lshl_b32 s9, s10, 16 2138; GFX11-NEXT: s_and_b32 s2, s2, 0xffff 2139; GFX11-NEXT: s_lshl_b32 s20, s20, 16 2140; GFX11-NEXT: s_and_b32 s0, s0, 0xffff 2141; GFX11-NEXT: s_lshl_b32 s16, s16, 16 2142; GFX11-NEXT: s_or_b32 s6, s6, s12 2143; GFX11-NEXT: s_or_b32 s4, s4, s8 2144; GFX11-NEXT: s_or_b32 s5, s5, s9 2145; GFX11-NEXT: s_or_b32 s2, s2, s20 2146; GFX11-NEXT: s_or_b32 s0, s0, s16 2147; GFX11-NEXT: 
v_dual_mov_b32 v8, 16 :: v_dual_mov_b32 v5, s1 2148; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 2149; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 2150; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v4, s0 2151; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v7, s3 2152; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v11, 0 2153; GFX11-NEXT: s_clause 0x1 2154; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off 2155; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off 2156; GFX11-NEXT: s_endpgm 2157entry: 2158 %add = add <32 x i8> %arg0, %arg0 2159 store <32 x i8> %add, ptr addrspace(1) null 2160 ret void 2161} 2162 2163define amdgpu_cs void @amdgpu_cs_i1(i1 %arg0) { 2164; SI-LABEL: amdgpu_cs_i1: 2165; SI: ; %bb.0: 2166; SI-NEXT: v_and_b32_e32 v0, 1, v0 2167; SI-NEXT: s_mov_b32 s3, 0xf000 2168; SI-NEXT: s_mov_b32 s2, -1 2169; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 2170; SI-NEXT: s_endpgm 2171; 2172; VI-LABEL: amdgpu_cs_i1: 2173; VI: ; %bb.0: 2174; VI-NEXT: v_and_b32_e32 v0, 1, v0 2175; VI-NEXT: flat_store_byte v[0:1], v0 2176; VI-NEXT: s_endpgm 2177; 2178; GFX11-LABEL: amdgpu_cs_i1: 2179; GFX11: ; %bb.0: 2180; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 2181; GFX11-NEXT: global_store_b8 v[0:1], v0, off 2182; GFX11-NEXT: s_endpgm 2183 store i1 %arg0, ptr addrspace(1) undef 2184 ret void 2185} 2186 2187define amdgpu_cs void @amdgpu_cs_v8i1(<8 x i1> %arg0) { 2188; SI-LABEL: amdgpu_cs_v8i1: 2189; SI: ; %bb.0: 2190; SI-NEXT: v_lshlrev_b32_e32 v7, 3, v7 2191; SI-NEXT: v_and_b32_e32 v6, 1, v6 2192; SI-NEXT: v_lshlrev_b32_e32 v5, 1, v5 2193; SI-NEXT: v_and_b32_e32 v4, 1, v4 2194; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v3 2195; SI-NEXT: v_and_b32_e32 v2, 1, v2 2196; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v1 2197; SI-NEXT: v_and_b32_e32 v0, 1, v0 2198; SI-NEXT: s_mov_b32 s3, 0xf000 2199; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v6 2200; SI-NEXT: v_or_b32_e32 v4, v4, v5 2201; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v2 2202; SI-NEXT: v_or_b32_e32 v0, v0, v1 
2203; SI-NEXT: v_or_b32_e32 v1, v7, v6 2204; SI-NEXT: v_and_b32_e32 v4, 3, v4 2205; SI-NEXT: v_or_b32_e32 v2, v3, v2 2206; SI-NEXT: v_and_b32_e32 v0, 3, v0 2207; SI-NEXT: v_or_b32_e32 v1, v4, v1 2208; SI-NEXT: v_or_b32_e32 v0, v0, v2 2209; SI-NEXT: v_lshlrev_b32_e32 v1, 4, v1 2210; SI-NEXT: v_and_b32_e32 v0, 15, v0 2211; SI-NEXT: v_or_b32_e32 v0, v0, v1 2212; SI-NEXT: s_mov_b32 s2, -1 2213; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 2214; SI-NEXT: s_endpgm 2215; 2216; VI-LABEL: amdgpu_cs_v8i1: 2217; VI: ; %bb.0: 2218; VI-NEXT: v_and_b32_e32 v6, 1, v6 2219; VI-NEXT: v_lshlrev_b16_e32 v5, 1, v5 2220; VI-NEXT: v_and_b32_e32 v4, 1, v4 2221; VI-NEXT: v_and_b32_e32 v2, 1, v2 2222; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1 2223; VI-NEXT: v_and_b32_e32 v0, 1, v0 2224; VI-NEXT: v_lshlrev_b16_e32 v7, 3, v7 2225; VI-NEXT: v_lshlrev_b16_e32 v6, 2, v6 2226; VI-NEXT: v_or_b32_e32 v4, v4, v5 2227; VI-NEXT: v_lshlrev_b16_e32 v3, 3, v3 2228; VI-NEXT: v_lshlrev_b16_e32 v2, 2, v2 2229; VI-NEXT: v_or_b32_e32 v0, v0, v1 2230; VI-NEXT: v_or_b32_e32 v6, v7, v6 2231; VI-NEXT: v_and_b32_e32 v4, 3, v4 2232; VI-NEXT: v_or_b32_e32 v2, v3, v2 2233; VI-NEXT: v_and_b32_e32 v0, 3, v0 2234; VI-NEXT: v_or_b32_e32 v4, v4, v6 2235; VI-NEXT: v_or_b32_e32 v0, v0, v2 2236; VI-NEXT: v_lshlrev_b16_e32 v4, 4, v4 2237; VI-NEXT: v_and_b32_e32 v0, 15, v0 2238; VI-NEXT: v_or_b32_e32 v0, v0, v4 2239; VI-NEXT: flat_store_byte v[0:1], v0 2240; VI-NEXT: s_endpgm 2241; 2242; GFX11-LABEL: amdgpu_cs_v8i1: 2243; GFX11: ; %bb.0: 2244; GFX11-NEXT: v_and_b32_e32 v6, 1, v6 2245; GFX11-NEXT: v_lshlrev_b16 v5, 1, v5 2246; GFX11-NEXT: v_and_b32_e32 v4, 1, v4 2247; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 2248; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1 2249; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 2250; GFX11-NEXT: v_lshlrev_b16 v7, 3, v7 2251; GFX11-NEXT: v_lshlrev_b16 v6, 2, v6 2252; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 2253; GFX11-NEXT: v_lshlrev_b16 v3, 3, v3 2254; GFX11-NEXT: v_lshlrev_b16 v2, 2, v2 2255; GFX11-NEXT: v_or_b32_e32 v0, v0, 
v1 2256; GFX11-NEXT: v_or_b32_e32 v1, v7, v6 2257; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 2258; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 2259; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 2260; GFX11-NEXT: v_and_b32_e32 v0, 3, v0 2261; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2262; GFX11-NEXT: v_or_b32_e32 v1, v4, v1 2263; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 2264; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2265; GFX11-NEXT: v_lshlrev_b16 v1, 4, v1 2266; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 2267; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2268; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 2269; GFX11-NEXT: global_store_b8 v[0:1], v0, off 2270; GFX11-NEXT: s_endpgm 2271 store <8 x i1> %arg0, ptr addrspace(1) undef 2272 ret void 2273} 2274 2275define amdgpu_cs void @amdgpu_cs_v16i1(<16 x i1> %arg0) { 2276; SI-LABEL: amdgpu_cs_v16i1: 2277; SI: ; %bb.0: 2278; SI-NEXT: v_lshlrev_b32_e32 v15, 3, v15 2279; SI-NEXT: v_and_b32_e32 v14, 1, v14 2280; SI-NEXT: v_lshlrev_b32_e32 v13, 1, v13 2281; SI-NEXT: v_and_b32_e32 v12, 1, v12 2282; SI-NEXT: v_lshlrev_b32_e32 v11, 3, v11 2283; SI-NEXT: v_and_b32_e32 v10, 1, v10 2284; SI-NEXT: v_lshlrev_b32_e32 v9, 1, v9 2285; SI-NEXT: v_and_b32_e32 v8, 1, v8 2286; SI-NEXT: v_lshlrev_b32_e32 v7, 3, v7 2287; SI-NEXT: v_and_b32_e32 v6, 1, v6 2288; SI-NEXT: v_lshlrev_b32_e32 v5, 1, v5 2289; SI-NEXT: v_and_b32_e32 v4, 1, v4 2290; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v3 2291; SI-NEXT: v_and_b32_e32 v2, 1, v2 2292; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v1 2293; SI-NEXT: v_and_b32_e32 v0, 1, v0 2294; SI-NEXT: s_mov_b32 s3, 0xf000 2295; SI-NEXT: v_lshlrev_b32_e32 v14, 2, v14 2296; SI-NEXT: v_or_b32_e32 v12, v12, v13 2297; SI-NEXT: v_lshlrev_b32_e32 v10, 2, v10 2298; SI-NEXT: v_or_b32_e32 v8, v8, v9 2299; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v6 2300; SI-NEXT: v_or_b32_e32 v4, v4, v5 2301; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v2 2302; SI-NEXT: v_or_b32_e32 v0, 
v0, v1 2303; SI-NEXT: v_or_b32_e32 v1, v15, v14 2304; SI-NEXT: v_and_b32_e32 v5, 3, v12 2305; SI-NEXT: v_or_b32_e32 v9, v11, v10 2306; SI-NEXT: v_and_b32_e32 v8, 3, v8 2307; SI-NEXT: v_or_b32_e32 v6, v7, v6 2308; SI-NEXT: v_and_b32_e32 v4, 3, v4 2309; SI-NEXT: v_or_b32_e32 v2, v3, v2 2310; SI-NEXT: v_and_b32_e32 v0, 3, v0 2311; SI-NEXT: v_or_b32_e32 v1, v5, v1 2312; SI-NEXT: v_or_b32_e32 v3, v8, v9 2313; SI-NEXT: v_or_b32_e32 v4, v4, v6 2314; SI-NEXT: v_or_b32_e32 v0, v0, v2 2315; SI-NEXT: v_lshlrev_b32_e32 v1, 12, v1 2316; SI-NEXT: v_and_b32_e32 v2, 15, v3 2317; SI-NEXT: v_lshlrev_b32_e32 v3, 4, v4 2318; SI-NEXT: v_and_b32_e32 v0, 15, v0 2319; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 2320; SI-NEXT: v_or_b32_e32 v0, v0, v3 2321; SI-NEXT: v_or_b32_e32 v1, v1, v2 2322; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 2323; SI-NEXT: v_or_b32_e32 v0, v0, v1 2324; SI-NEXT: s_mov_b32 s2, -1 2325; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 2326; SI-NEXT: s_endpgm 2327; 2328; VI-LABEL: amdgpu_cs_v16i1: 2329; VI: ; %bb.0: 2330; VI-NEXT: v_and_b32_e32 v14, 1, v14 2331; VI-NEXT: v_lshlrev_b16_e32 v13, 1, v13 2332; VI-NEXT: v_and_b32_e32 v12, 1, v12 2333; VI-NEXT: v_and_b32_e32 v10, 1, v10 2334; VI-NEXT: v_lshlrev_b16_e32 v9, 1, v9 2335; VI-NEXT: v_and_b32_e32 v8, 1, v8 2336; VI-NEXT: v_and_b32_e32 v6, 1, v6 2337; VI-NEXT: v_lshlrev_b16_e32 v5, 1, v5 2338; VI-NEXT: v_and_b32_e32 v4, 1, v4 2339; VI-NEXT: v_and_b32_e32 v2, 1, v2 2340; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1 2341; VI-NEXT: v_and_b32_e32 v0, 1, v0 2342; VI-NEXT: v_lshlrev_b16_e32 v15, 3, v15 2343; VI-NEXT: v_lshlrev_b16_e32 v14, 2, v14 2344; VI-NEXT: v_or_b32_e32 v12, v12, v13 2345; VI-NEXT: v_lshlrev_b16_e32 v11, 3, v11 2346; VI-NEXT: v_lshlrev_b16_e32 v10, 2, v10 2347; VI-NEXT: v_or_b32_e32 v8, v8, v9 2348; VI-NEXT: v_lshlrev_b16_e32 v7, 3, v7 2349; VI-NEXT: v_lshlrev_b16_e32 v6, 2, v6 2350; VI-NEXT: v_or_b32_e32 v4, v4, v5 2351; VI-NEXT: v_lshlrev_b16_e32 v3, 3, v3 2352; VI-NEXT: v_lshlrev_b16_e32 v2, 2, v2 2353; VI-NEXT: 
v_or_b32_e32 v0, v0, v1 2354; VI-NEXT: v_or_b32_e32 v14, v15, v14 2355; VI-NEXT: v_and_b32_e32 v12, 3, v12 2356; VI-NEXT: v_or_b32_e32 v10, v11, v10 2357; VI-NEXT: v_and_b32_e32 v8, 3, v8 2358; VI-NEXT: v_or_b32_e32 v6, v7, v6 2359; VI-NEXT: v_and_b32_e32 v4, 3, v4 2360; VI-NEXT: v_or_b32_e32 v2, v3, v2 2361; VI-NEXT: v_and_b32_e32 v0, 3, v0 2362; VI-NEXT: v_or_b32_e32 v12, v12, v14 2363; VI-NEXT: v_or_b32_e32 v8, v8, v10 2364; VI-NEXT: v_mov_b32_e32 v9, 15 2365; VI-NEXT: v_or_b32_e32 v4, v4, v6 2366; VI-NEXT: v_or_b32_e32 v0, v0, v2 2367; VI-NEXT: v_lshlrev_b16_e32 v12, 12, v12 2368; VI-NEXT: v_and_b32_sdwa v8, v8, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2369; VI-NEXT: v_lshlrev_b16_e32 v4, 4, v4 2370; VI-NEXT: v_and_b32_e32 v0, 15, v0 2371; VI-NEXT: v_or_b32_e32 v8, v12, v8 2372; VI-NEXT: v_or_b32_e32 v0, v0, v4 2373; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2374; VI-NEXT: flat_store_short v[0:1], v0 2375; VI-NEXT: s_endpgm 2376; 2377; GFX11-LABEL: amdgpu_cs_v16i1: 2378; GFX11: ; %bb.0: 2379; GFX11-NEXT: v_and_b32_e32 v10, 1, v10 2380; GFX11-NEXT: v_lshlrev_b16 v9, 1, v9 2381; GFX11-NEXT: v_and_b32_e32 v8, 1, v8 2382; GFX11-NEXT: v_and_b32_e32 v6, 1, v6 2383; GFX11-NEXT: v_lshlrev_b16 v5, 1, v5 2384; GFX11-NEXT: v_and_b32_e32 v4, 1, v4 2385; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 2386; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1 2387; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 2388; GFX11-NEXT: v_and_b32_e32 v14, 1, v14 2389; GFX11-NEXT: v_lshlrev_b16 v13, 1, v13 2390; GFX11-NEXT: v_and_b32_e32 v12, 1, v12 2391; GFX11-NEXT: v_lshlrev_b16 v11, 3, v11 2392; GFX11-NEXT: v_lshlrev_b16 v10, 2, v10 2393; GFX11-NEXT: v_or_b32_e32 v8, v8, v9 2394; GFX11-NEXT: v_lshlrev_b16 v7, 3, v7 2395; GFX11-NEXT: v_lshlrev_b16 v6, 2, v6 2396; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 2397; GFX11-NEXT: v_lshlrev_b16 v3, 3, v3 2398; GFX11-NEXT: v_lshlrev_b16 v2, 2, v2 2399; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 2400; 
GFX11-NEXT: v_lshlrev_b16 v15, 3, v15 2401; GFX11-NEXT: v_lshlrev_b16 v14, 2, v14 2402; GFX11-NEXT: v_or_b32_e32 v12, v12, v13 2403; GFX11-NEXT: v_or_b32_e32 v10, v11, v10 2404; GFX11-NEXT: v_and_b32_e32 v1, 3, v8 2405; GFX11-NEXT: v_or_b32_e32 v5, v7, v6 2406; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 2407; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 2408; GFX11-NEXT: v_and_b32_e32 v0, 3, v0 2409; GFX11-NEXT: v_or_b32_e32 v3, v15, v14 2410; GFX11-NEXT: v_and_b32_e32 v6, 3, v12 2411; GFX11-NEXT: v_or_b32_e32 v1, v1, v10 2412; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 2413; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 2414; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 2415; GFX11-NEXT: v_or_b32_e32 v2, v6, v3 2416; GFX11-NEXT: v_and_b32_e32 v1, 15, v1 2417; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 2418; GFX11-NEXT: v_lshlrev_b16 v3, 4, v4 2419; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 2420; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 2421; GFX11-NEXT: v_lshlrev_b16 v2, 12, v2 2422; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 2423; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2424; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 2425; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 2426; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 2427; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 2428; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 2429; GFX11-NEXT: global_store_b16 v[0:1], v0, off 2430; GFX11-NEXT: s_endpgm 2431 store <16 x i1> %arg0, ptr addrspace(1) undef 2432 ret void 2433} 2434 2435define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { 2436; SI-LABEL: amdgpu_cs_v32i1: 2437; SI: ; %bb.0: 2438; SI-NEXT: v_lshlrev_b32_e32 v29, 1, v29 2439; SI-NEXT: v_and_b32_e32 v28, 1, v28 2440; SI-NEXT: v_lshlrev_b32_e32 v25, 1, v25 2441; SI-NEXT: v_and_b32_e32 v24, 1, v24 2442; SI-NEXT: v_lshlrev_b32_e32 v21, 1, v21 2443; SI-NEXT: v_and_b32_e32 v20, 1, v20 2444; 
SI-NEXT: v_lshlrev_b32_e32 v17, 1, v17 2445; SI-NEXT: v_and_b32_e32 v16, 1, v16 2446; SI-NEXT: v_lshlrev_b32_e32 v13, 1, v13 2447; SI-NEXT: v_and_b32_e32 v12, 1, v12 2448; SI-NEXT: v_lshlrev_b32_e32 v9, 1, v9 2449; SI-NEXT: v_and_b32_e32 v8, 1, v8 2450; SI-NEXT: v_lshlrev_b32_e32 v5, 1, v5 2451; SI-NEXT: v_and_b32_e32 v4, 1, v4 2452; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v1 2453; SI-NEXT: v_and_b32_e32 v0, 1, v0 2454; SI-NEXT: v_lshlrev_b32_e32 v31, 3, v31 2455; SI-NEXT: v_and_b32_e32 v30, 1, v30 2456; SI-NEXT: v_lshlrev_b32_e32 v27, 3, v27 2457; SI-NEXT: v_and_b32_e32 v26, 1, v26 2458; SI-NEXT: v_lshlrev_b32_e32 v23, 3, v23 2459; SI-NEXT: v_and_b32_e32 v22, 1, v22 2460; SI-NEXT: v_lshlrev_b32_e32 v19, 3, v19 2461; SI-NEXT: v_and_b32_e32 v18, 1, v18 2462; SI-NEXT: v_lshlrev_b32_e32 v15, 3, v15 2463; SI-NEXT: v_and_b32_e32 v14, 1, v14 2464; SI-NEXT: v_lshlrev_b32_e32 v11, 3, v11 2465; SI-NEXT: v_and_b32_e32 v10, 1, v10 2466; SI-NEXT: v_lshlrev_b32_e32 v7, 3, v7 2467; SI-NEXT: v_and_b32_e32 v6, 1, v6 2468; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v3 2469; SI-NEXT: v_and_b32_e32 v2, 1, v2 2470; SI-NEXT: s_mov_b32 s3, 0xf000 2471; SI-NEXT: v_or_b32_e32 v28, v28, v29 2472; SI-NEXT: v_or_b32_e32 v24, v24, v25 2473; SI-NEXT: v_or_b32_e32 v20, v20, v21 2474; SI-NEXT: v_or_b32_e32 v16, v16, v17 2475; SI-NEXT: v_or_b32_e32 v12, v12, v13 2476; SI-NEXT: v_or_b32_e32 v8, v8, v9 2477; SI-NEXT: v_or_b32_e32 v4, v4, v5 2478; SI-NEXT: v_or_b32_e32 v0, v0, v1 2479; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v30 2480; SI-NEXT: v_lshlrev_b32_e32 v5, 2, v26 2481; SI-NEXT: v_lshlrev_b32_e32 v9, 2, v22 2482; SI-NEXT: v_lshlrev_b32_e32 v13, 2, v18 2483; SI-NEXT: v_lshlrev_b32_e32 v14, 2, v14 2484; SI-NEXT: v_lshlrev_b32_e32 v10, 2, v10 2485; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v6 2486; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v2 2487; SI-NEXT: v_or_b32_e32 v1, v31, v1 2488; SI-NEXT: v_or_b32_e32 v5, v27, v5 2489; SI-NEXT: v_or_b32_e32 v9, v23, v9 2490; SI-NEXT: v_or_b32_e32 v13, v19, v13 2491; SI-NEXT: v_and_b32_e32 
v17, 3, v28 2492; SI-NEXT: v_and_b32_e32 v18, 3, v24 2493; SI-NEXT: v_and_b32_e32 v19, 3, v20 2494; SI-NEXT: v_and_b32_e32 v16, 3, v16 2495; SI-NEXT: v_or_b32_e32 v14, v15, v14 2496; SI-NEXT: v_and_b32_e32 v12, 3, v12 2497; SI-NEXT: v_or_b32_e32 v10, v11, v10 2498; SI-NEXT: v_and_b32_e32 v8, 3, v8 2499; SI-NEXT: v_or_b32_e32 v6, v7, v6 2500; SI-NEXT: v_and_b32_e32 v4, 3, v4 2501; SI-NEXT: v_or_b32_e32 v2, v3, v2 2502; SI-NEXT: v_and_b32_e32 v0, 3, v0 2503; SI-NEXT: v_or_b32_e32 v1, v17, v1 2504; SI-NEXT: v_or_b32_e32 v3, v18, v5 2505; SI-NEXT: v_or_b32_e32 v5, v19, v9 2506; SI-NEXT: v_or_b32_e32 v7, v16, v13 2507; SI-NEXT: v_or_b32_e32 v9, v12, v14 2508; SI-NEXT: v_or_b32_e32 v8, v8, v10 2509; SI-NEXT: v_or_b32_e32 v4, v4, v6 2510; SI-NEXT: v_or_b32_e32 v0, v0, v2 2511; SI-NEXT: v_lshlrev_b32_e32 v1, 12, v1 2512; SI-NEXT: v_and_b32_e32 v2, 15, v3 2513; SI-NEXT: v_lshlrev_b32_e32 v3, 4, v5 2514; SI-NEXT: v_and_b32_e32 v5, 15, v7 2515; SI-NEXT: v_lshlrev_b32_e32 v6, 12, v9 2516; SI-NEXT: v_and_b32_e32 v7, 15, v8 2517; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 2518; SI-NEXT: v_and_b32_e32 v0, 15, v0 2519; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 2520; SI-NEXT: v_or_b32_e32 v3, v5, v3 2521; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 2522; SI-NEXT: v_or_b32_e32 v0, v0, v4 2523; SI-NEXT: v_or_b32_e32 v1, v1, v2 2524; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 2525; SI-NEXT: v_or_b32_e32 v3, v6, v5 2526; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 2527; SI-NEXT: v_or_b32_e32 v1, v2, v1 2528; SI-NEXT: v_or_b32_e32 v0, v0, v3 2529; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2530; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 2531; SI-NEXT: v_or_b32_e32 v0, v0, v1 2532; SI-NEXT: s_mov_b32 s2, -1 2533; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2534; SI-NEXT: s_endpgm 2535; 2536; VI-LABEL: amdgpu_cs_v32i1: 2537; VI: ; %bb.0: 2538; VI-NEXT: v_and_b32_e32 v6, 1, v6 2539; VI-NEXT: v_lshlrev_b16_e32 v5, 1, v5 2540; VI-NEXT: v_and_b32_e32 v4, 1, v4 2541; VI-NEXT: v_and_b32_e32 v2, 1, v2 2542; VI-NEXT: v_lshlrev_b16_e32 
v1, 1, v1 2543; VI-NEXT: v_and_b32_e32 v0, 1, v0 2544; VI-NEXT: v_lshlrev_b16_e32 v7, 3, v7 2545; VI-NEXT: v_lshlrev_b16_e32 v6, 2, v6 2546; VI-NEXT: v_or_b32_e32 v4, v4, v5 2547; VI-NEXT: v_lshlrev_b16_e32 v3, 3, v3 2548; VI-NEXT: v_lshlrev_b16_e32 v2, 2, v2 2549; VI-NEXT: v_or_b32_e32 v0, v0, v1 2550; VI-NEXT: v_or_b32_e32 v6, v7, v6 2551; VI-NEXT: v_and_b32_e32 v4, 3, v4 2552; VI-NEXT: v_or_b32_e32 v2, v3, v2 2553; VI-NEXT: v_and_b32_e32 v0, 3, v0 2554; VI-NEXT: v_or_b32_e32 v4, v4, v6 2555; VI-NEXT: v_or_b32_e32 v0, v0, v2 2556; VI-NEXT: v_lshlrev_b16_e32 v1, 4, v4 2557; VI-NEXT: v_and_b32_e32 v0, 15, v0 2558; VI-NEXT: v_and_b32_e32 v2, 1, v30 2559; VI-NEXT: v_or_b32_e32 v0, v0, v1 2560; VI-NEXT: v_lshlrev_b16_e32 v1, 3, v31 2561; VI-NEXT: v_lshlrev_b16_e32 v2, 2, v2 2562; VI-NEXT: v_or_b32_e32 v1, v1, v2 2563; VI-NEXT: v_lshlrev_b16_e32 v2, 1, v29 2564; VI-NEXT: v_and_b32_e32 v3, 1, v28 2565; VI-NEXT: v_or_b32_e32 v2, v3, v2 2566; VI-NEXT: v_and_b32_e32 v2, 3, v2 2567; VI-NEXT: v_and_b32_e32 v3, 1, v26 2568; VI-NEXT: v_or_b32_e32 v1, v2, v1 2569; VI-NEXT: v_lshlrev_b16_e32 v2, 3, v27 2570; VI-NEXT: v_lshlrev_b16_e32 v3, 2, v3 2571; VI-NEXT: v_and_b32_e32 v10, 1, v10 2572; VI-NEXT: v_lshlrev_b16_e32 v9, 1, v9 2573; VI-NEXT: v_and_b32_e32 v8, 1, v8 2574; VI-NEXT: v_or_b32_e32 v2, v2, v3 2575; VI-NEXT: v_lshlrev_b16_e32 v3, 1, v25 2576; VI-NEXT: v_and_b32_e32 v4, 1, v24 2577; VI-NEXT: v_lshlrev_b16_e32 v11, 3, v11 2578; VI-NEXT: v_lshlrev_b16_e32 v10, 2, v10 2579; VI-NEXT: v_or_b32_e32 v8, v8, v9 2580; VI-NEXT: v_or_b32_e32 v3, v4, v3 2581; VI-NEXT: v_or_b32_e32 v10, v11, v10 2582; VI-NEXT: v_and_b32_e32 v8, 3, v8 2583; VI-NEXT: v_and_b32_e32 v3, 3, v3 2584; VI-NEXT: v_or_b32_e32 v8, v8, v10 2585; VI-NEXT: v_mov_b32_e32 v10, 15 2586; VI-NEXT: v_or_b32_e32 v2, v3, v2 2587; VI-NEXT: v_lshlrev_b16_e32 v1, 12, v1 2588; VI-NEXT: v_and_b32_sdwa v2, v2, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2589; VI-NEXT: v_and_b32_e32 v3, 1, v22 2590; 
VI-NEXT: v_or_b32_e32 v1, v1, v2 2591; VI-NEXT: v_lshlrev_b16_e32 v2, 3, v23 2592; VI-NEXT: v_lshlrev_b16_e32 v3, 2, v3 2593; VI-NEXT: v_or_b32_e32 v2, v2, v3 2594; VI-NEXT: v_lshlrev_b16_e32 v3, 1, v21 2595; VI-NEXT: v_and_b32_e32 v4, 1, v20 2596; VI-NEXT: v_or_b32_e32 v3, v4, v3 2597; VI-NEXT: v_and_b32_e32 v3, 3, v3 2598; VI-NEXT: v_and_b32_e32 v4, 1, v18 2599; VI-NEXT: v_or_b32_e32 v2, v3, v2 2600; VI-NEXT: v_lshlrev_b16_e32 v3, 3, v19 2601; VI-NEXT: v_lshlrev_b16_e32 v4, 2, v4 2602; VI-NEXT: v_and_b32_e32 v14, 1, v14 2603; VI-NEXT: v_lshlrev_b16_e32 v13, 1, v13 2604; VI-NEXT: v_and_b32_e32 v12, 1, v12 2605; VI-NEXT: v_or_b32_e32 v3, v3, v4 2606; VI-NEXT: v_lshlrev_b16_e32 v4, 1, v17 2607; VI-NEXT: v_and_b32_e32 v5, 1, v16 2608; VI-NEXT: v_lshlrev_b16_e32 v15, 3, v15 2609; VI-NEXT: v_lshlrev_b16_e32 v14, 2, v14 2610; VI-NEXT: v_or_b32_e32 v12, v12, v13 2611; VI-NEXT: v_or_b32_e32 v4, v5, v4 2612; VI-NEXT: v_or_b32_e32 v14, v15, v14 2613; VI-NEXT: v_and_b32_e32 v12, 3, v12 2614; VI-NEXT: v_and_b32_e32 v4, 3, v4 2615; VI-NEXT: v_or_b32_e32 v12, v12, v14 2616; VI-NEXT: v_or_b32_e32 v3, v4, v3 2617; VI-NEXT: v_lshlrev_b16_e32 v9, 12, v12 2618; VI-NEXT: v_and_b32_sdwa v8, v8, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2619; VI-NEXT: v_lshlrev_b16_e32 v2, 4, v2 2620; VI-NEXT: v_and_b32_e32 v3, 15, v3 2621; VI-NEXT: v_or_b32_e32 v8, v9, v8 2622; VI-NEXT: v_or_b32_e32 v2, v3, v2 2623; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2624; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2625; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2626; VI-NEXT: flat_store_dword v[0:1], v0 2627; VI-NEXT: s_endpgm 2628; 2629; GFX11-LABEL: amdgpu_cs_v32i1: 2630; GFX11: ; %bb.0: 2631; GFX11-NEXT: v_and_b32_e32 v10, 1, v10 2632; GFX11-NEXT: v_lshlrev_b16 v9, 1, v9 2633; GFX11-NEXT: v_and_b32_e32 
v8, 1, v8 2634; GFX11-NEXT: v_lshlrev_b16 v11, 3, v11 2635; GFX11-NEXT: v_and_b32_e32 v6, 1, v6 2636; GFX11-NEXT: v_lshlrev_b16 v10, 2, v10 2637; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 2638; GFX11-NEXT: v_or_b32_e32 v8, v8, v9 2639; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1 2640; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 2641; GFX11-NEXT: v_and_b32_e32 v14, 1, v14 2642; GFX11-NEXT: v_lshlrev_b16 v13, 1, v13 2643; GFX11-NEXT: v_and_b32_e32 v12, 1, v12 2644; GFX11-NEXT: v_lshlrev_b16 v5, 1, v5 2645; GFX11-NEXT: v_and_b32_e32 v4, 1, v4 2646; GFX11-NEXT: v_or_b32_e32 v9, v11, v10 2647; GFX11-NEXT: v_and_b32_e32 v8, 3, v8 2648; GFX11-NEXT: v_lshlrev_b16 v7, 3, v7 2649; GFX11-NEXT: v_lshlrev_b16 v6, 2, v6 2650; GFX11-NEXT: v_lshlrev_b16 v3, 3, v3 2651; GFX11-NEXT: v_lshlrev_b16 v2, 2, v2 2652; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 2653; GFX11-NEXT: v_lshlrev_b16 v15, 3, v15 2654; GFX11-NEXT: v_lshlrev_b16 v14, 2, v14 2655; GFX11-NEXT: v_or_b32_e32 v12, v12, v13 2656; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 2657; GFX11-NEXT: v_or_b32_e32 v5, v7, v6 2658; GFX11-NEXT: v_or_b32_e32 v6, v8, v9 2659; GFX11-NEXT: v_or_b32_e32 v2, v3, v2 2660; GFX11-NEXT: v_and_b32_e32 v0, 3, v0 2661; GFX11-NEXT: v_or_b32_e32 v13, v15, v14 2662; GFX11-NEXT: v_and_b32_e32 v12, 3, v12 2663; GFX11-NEXT: v_and_b32_e32 v3, 15, v6 2664; GFX11-NEXT: v_lshlrev_b16 v6, 1, v29 2665; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 2666; GFX11-NEXT: v_and_b32_e32 v2, 1, v26 2667; GFX11-NEXT: v_and_b32_e32 v7, 1, v28 2668; GFX11-NEXT: v_lshlrev_b16 v9, 1, v25 2669; GFX11-NEXT: v_and_b32_e32 v10, 1, v24 2670; GFX11-NEXT: v_and_b32_e32 v4, 3, v4 2671; GFX11-NEXT: v_or_b32_e32 v1, v12, v13 2672; GFX11-NEXT: v_lshlrev_b16 v8, 3, v27 2673; GFX11-NEXT: v_lshlrev_b16 v2, 2, v2 2674; GFX11-NEXT: v_or_b32_e32 v6, v7, v6 2675; GFX11-NEXT: v_or_b32_e32 v7, v10, v9 2676; GFX11-NEXT: v_and_b32_e32 v9, 1, v22 2677; GFX11-NEXT: v_lshlrev_b16 v10, 1, v21 2678; GFX11-NEXT: v_and_b32_e32 v12, 1, v20 2679; GFX11-NEXT: v_and_b32_e32 v13, 1, v18 2680; 
GFX11-NEXT: v_lshlrev_b16 v14, 1, v17 2681; GFX11-NEXT: v_and_b32_e32 v15, 1, v16 2682; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 2683; GFX11-NEXT: v_and_b32_e32 v5, 1, v30 2684; GFX11-NEXT: v_or_b32_e32 v2, v8, v2 2685; GFX11-NEXT: v_lshlrev_b16 v8, 3, v23 2686; GFX11-NEXT: v_lshlrev_b16 v9, 2, v9 2687; GFX11-NEXT: v_or_b32_e32 v10, v12, v10 2688; GFX11-NEXT: v_lshlrev_b16 v12, 3, v19 2689; GFX11-NEXT: v_lshlrev_b16 v13, 2, v13 2690; GFX11-NEXT: v_or_b32_e32 v14, v15, v14 2691; GFX11-NEXT: v_lshlrev_b16 v11, 3, v31 2692; GFX11-NEXT: v_lshlrev_b16 v5, 2, v5 2693; GFX11-NEXT: v_and_b32_e32 v7, 3, v7 2694; GFX11-NEXT: v_or_b32_e32 v8, v8, v9 2695; GFX11-NEXT: v_and_b32_e32 v9, 3, v10 2696; GFX11-NEXT: v_or_b32_e32 v10, v12, v13 2697; GFX11-NEXT: v_and_b32_e32 v12, 3, v14 2698; GFX11-NEXT: v_or_b32_e32 v5, v11, v5 2699; GFX11-NEXT: v_and_b32_e32 v6, 3, v6 2700; GFX11-NEXT: v_or_b32_e32 v2, v7, v2 2701; GFX11-NEXT: v_or_b32_e32 v7, v9, v8 2702; GFX11-NEXT: v_or_b32_e32 v8, v12, v10 2703; GFX11-NEXT: v_lshlrev_b16 v4, 4, v4 2704; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 2705; GFX11-NEXT: v_or_b32_e32 v5, v6, v5 2706; GFX11-NEXT: v_and_b32_e32 v2, 15, v2 2707; GFX11-NEXT: v_lshlrev_b16 v6, 4, v7 2708; GFX11-NEXT: v_and_b32_e32 v7, 15, v8 2709; GFX11-NEXT: v_lshlrev_b16 v1, 12, v1 2710; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 2711; GFX11-NEXT: v_or_b32_e32 v0, v0, v4 2712; GFX11-NEXT: v_lshlrev_b16 v4, 12, v5 2713; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2 2714; GFX11-NEXT: v_or_b32_e32 v5, v7, v6 2715; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 2716; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 2717; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 2718; GFX11-NEXT: v_or_b32_e32 v2, v4, v2 2719; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v5 2720; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2721; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 2722; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 2723; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) 
| instid1(VALU_DEP_2) 2724; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 2725; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2726; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2727; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 2728; GFX11-NEXT: global_store_b32 v[0:1], v0, off 2729; GFX11-NEXT: s_endpgm 2730 store <32 x i1> %arg0, ptr addrspace(1) undef 2731 ret void 2732}

; amdgpu_cs entry point taking a single i1 marked inreg and storing it.
; The expected output for all three RUN configurations reads the argument
; from s0, masks it to bit 0, copies it to v0 and emits a byte store.
; Review note: the store address is undef, and the expected code contains no
; instruction that sets up the address registers.
define amdgpu_cs void @amdgpu_cs_inreg_i1(i1 inreg %arg0) {
; SI-LABEL: amdgpu_cs_inreg_i1:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s0, s0, 1
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: amdgpu_cs_inreg_i1:
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s0, s0, 1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: flat_store_byte v[0:1], v0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: amdgpu_cs_inreg_i1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s0, s0, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
  store i1 %arg0, ptr addrspace(1) undef
  ret void
}

; amdgpu_cs entry point taking <8 x i1> marked inreg and storing it.
; The expected output reads the eight elements from s0-s7 and combines them
; with scalar and/shift/or instructions into one value that is written with a
; byte store. In the SI checks, s3 is shifted into s8 before s3 is overwritten
; with 0xf000 for the s[0:3] operand of the buffer store.
define amdgpu_cs void @amdgpu_cs_inreg_v8i1(<8 x i1> inreg %arg0) {
; SI-LABEL: amdgpu_cs_inreg_v8i1:
; SI: ; %bb.0:
; SI-NEXT: s_lshl_b32 s7, s7, 3
; SI-NEXT: s_and_b32 s6, s6, 1
; SI-NEXT: s_lshl_b32 s5, s5, 1
; SI-NEXT: s_and_b32 s4, s4, 1
; SI-NEXT: s_lshl_b32 s8, s3, 3
; SI-NEXT: s_and_b32 s2, s2, 1
; SI-NEXT: s_lshl_b32 s1, s1, 1
; SI-NEXT: s_and_b32 s0, s0, 1
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_lshl_b32 s6, s6, 2
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: s_lshl_b32 s2, s2, 2
; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: s_or_b32 s1, s7, s6
; SI-NEXT: s_and_b32 s4, s4, 3
; SI-NEXT: s_or_b32 s2, s8, s2
; SI-NEXT: s_and_b32 s0, s0, 3
; SI-NEXT: s_or_b32 s1, s4, s1
; SI-NEXT: s_or_b32 s0, s0, s2
; SI-NEXT: s_lshl_b32 s1, s1, 4
; SI-NEXT: s_and_b32 s0, s0, 15
; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: amdgpu_cs_inreg_v8i1:
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s6, s6, 1
; VI-NEXT: s_lshl_b32 s5, s5, 1
; VI-NEXT: s_and_b32 s4, s4, 1
; VI-NEXT: s_and_b32 s2, s2, 1
; VI-NEXT: s_lshl_b32 s1, s1, 1
; VI-NEXT: s_and_b32 s0, s0, 1
; VI-NEXT: s_lshl_b32 s7, s7, 3
; VI-NEXT: s_lshl_b32 s6, s6, 2
; VI-NEXT: s_or_b32 s4, s4, s5
; VI-NEXT: s_lshl_b32 s3, s3, 3
; VI-NEXT: s_lshl_b32 s2, s2, 2
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_or_b32 s6, s7, s6
; VI-NEXT: s_and_b32 s4, s4, 3
; VI-NEXT: s_or_b32 s2, s3, s2
; VI-NEXT: s_and_b32 s0, s0, 3
; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: s_or_b32 s0, s0, s2
; VI-NEXT: s_lshl_b32 s4, s4, 4
; VI-NEXT: s_and_b32 s0, s0, 15
; VI-NEXT: s_or_b32 s0, s0, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: flat_store_byte v[0:1], v0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: amdgpu_cs_inreg_v8i1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s6, s6, 1
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
; GFX11-NEXT: s_and_b32 s4, s4, 1
; GFX11-NEXT: s_and_b32 s2, s2, 1
; GFX11-NEXT: s_lshl_b32 s1, s1, 1
; GFX11-NEXT: s_and_b32 s0, s0, 1
; GFX11-NEXT: s_lshl_b32 s7, s7, 3
; GFX11-NEXT: s_lshl_b32 s6, s6, 2
; GFX11-NEXT: s_or_b32 s4, s4, s5
; GFX11-NEXT: s_lshl_b32 s3, s3, 3
; GFX11-NEXT: s_lshl_b32 s2, s2, 2
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_or_b32 s5, s7, s6
; GFX11-NEXT: s_and_b32 s4, s4, 3
; GFX11-NEXT: s_or_b32 s1, s3, s2
; GFX11-NEXT: s_and_b32 s0, s0, 3
; GFX11-NEXT: s_or_b32 s2, s4, s5
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_lshl_b32 s1, s2, 4
; GFX11-NEXT: s_and_b32 s0, s0, 15
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
  store <8 x i1> %arg0, ptr addrspace(1) undef
  ret void
}

; amdgpu_cs entry point taking <16 x i1> marked inreg and storing it.
; The expected output reads the sixteen elements from s0-s15, combines them
; with scalar and/shift/or instructions and writes the result with a 16-bit
; store. In the SI checks, s3 is shifted into s16 before s3 is overwritten
; with 0xf000 for the s[0:3] operand of the buffer store.
define amdgpu_cs void @amdgpu_cs_inreg_v16i1(<16 x i1> inreg %arg0) {
; SI-LABEL: amdgpu_cs_inreg_v16i1:
; SI: ; %bb.0:
; SI-NEXT: s_lshl_b32 s15, s15, 3
; SI-NEXT: s_and_b32 s14, s14, 1
; SI-NEXT: s_lshl_b32 s13, s13, 1
; SI-NEXT: s_and_b32 s12, s12, 1
; SI-NEXT: s_lshl_b32 s11, s11, 3
; SI-NEXT: s_and_b32 s10, s10, 1
; SI-NEXT: s_lshl_b32 s9, s9, 1
; SI-NEXT: s_and_b32 s8, s8, 1
; SI-NEXT: s_lshl_b32 s7, s7, 3
; SI-NEXT: s_and_b32 s6, s6, 1
; SI-NEXT: s_lshl_b32 s5, s5, 1
; SI-NEXT: s_and_b32 s4, s4, 1
; SI-NEXT: s_lshl_b32 s16, s3, 3
; SI-NEXT: s_and_b32 s2, s2, 1
; SI-NEXT: s_lshl_b32 s1, s1, 1
; SI-NEXT: s_and_b32 s0, s0, 1
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_lshl_b32 s14, s14, 2
; SI-NEXT: s_or_b32 s12, s12, s13
; SI-NEXT: s_lshl_b32 s10, s10, 2
; SI-NEXT: s_or_b32 s8, s8, s9
; SI-NEXT: s_lshl_b32 s6, s6, 2
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: s_lshl_b32 s2, s2, 2
; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: s_or_b32 s1, s15, s14
; SI-NEXT: s_and_b32 s5, s12, 3
; SI-NEXT: s_or_b32 s9, s11, s10
; SI-NEXT: s_and_b32 s8, s8, 3
; SI-NEXT: s_or_b32 s6, s7, s6
; SI-NEXT: s_and_b32 s4, s4, 3
; SI-NEXT: s_or_b32 s2, s16, s2
; SI-NEXT: s_and_b32 s0, s0, 3
; SI-NEXT: s_or_b32 s1, s5, s1
; SI-NEXT: s_or_b32 s5, s8, s9
; SI-NEXT: s_or_b32 s4, s4, s6
; SI-NEXT: s_or_b32 s0, s0, s2
; SI-NEXT: s_lshl_b32 s1, s1, 12
; SI-NEXT: s_and_b32 s2, s5, 15
; SI-NEXT: s_lshl_b32 s4, s4, 4
; SI-NEXT: s_and_b32 s0, s0, 15
; SI-NEXT: s_lshl_b32 s2, s2, 8
; SI-NEXT: s_or_b32 s0, s0, s4
; SI-NEXT: s_or_b32 s1, s1, s2
; SI-NEXT: s_and_b32 s0, s0, 0xff
; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: amdgpu_cs_inreg_v16i1:
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s10, s10, 1
; VI-NEXT: s_lshl_b32 s9, s9, 1
; VI-NEXT: s_and_b32 s8, s8, 1
; VI-NEXT: s_and_b32 s6, s6, 1
; VI-NEXT: s_lshl_b32 s5, s5, 1
; VI-NEXT: s_and_b32 s4, s4, 1
; VI-NEXT: s_and_b32 s2, s2, 1
; VI-NEXT: s_lshl_b32 s1, s1, 1
; VI-NEXT: s_and_b32 s0, s0, 1
; VI-NEXT: s_and_b32 s14, s14, 1
; VI-NEXT: s_lshl_b32 s13, s13, 1
; VI-NEXT: s_and_b32 s12, s12, 1
; VI-NEXT: s_lshl_b32 s11, s11, 3
; VI-NEXT: s_lshl_b32 s10, s10, 2
; VI-NEXT: s_or_b32 s8, s8, s9
; VI-NEXT: s_lshl_b32 s7, s7, 3
; VI-NEXT: s_lshl_b32 s6, s6, 2
; VI-NEXT: s_or_b32 s4, s4, s5
; VI-NEXT: s_lshl_b32 s3, s3, 3
; VI-NEXT: s_lshl_b32 s2, s2, 2
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_lshl_b32 s15, s15, 3
; VI-NEXT: s_lshl_b32 s14, s14, 2
; VI-NEXT: s_or_b32 s12, s12, s13
; VI-NEXT: s_or_b32 s10, s11, s10
; VI-NEXT: s_and_b32 s8, s8, 3
; VI-NEXT: s_or_b32 s6, s7, s6
; VI-NEXT: s_and_b32 s4, s4, 3
; VI-NEXT: s_or_b32 s2, s3, s2
; VI-NEXT: s_and_b32 s0, s0, 3
; VI-NEXT: s_or_b32 s14, s15, s14
; VI-NEXT: s_and_b32 s12, s12, 3
; VI-NEXT: s_or_b32 s8, s8, s10
; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: s_or_b32 s0, s0, s2
; VI-NEXT: s_or_b32 s12, s12, s14
; VI-NEXT: s_and_b32 s8, s8, 15
; VI-NEXT: s_lshl_b32 s4, s4, 4
; VI-NEXT: s_and_b32 s0, s0, 15
; VI-NEXT: s_lshl_b32 s12, s12, 12
; VI-NEXT: s_lshl_b32 s8, s8, 8
; VI-NEXT: s_or_b32 s0, s0, s4
; VI-NEXT: s_or_b32 s8, s12, s8
; VI-NEXT: s_and_b32 s0, s0, 0xff
; VI-NEXT: s_or_b32 s0, s0, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: flat_store_short v[0:1], v0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: amdgpu_cs_inreg_v16i1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s10, s10, 1
; GFX11-NEXT: s_lshl_b32 s9, s9, 1
; GFX11-NEXT: s_and_b32 s8, s8, 1
; GFX11-NEXT: s_and_b32 s6, s6, 1
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
; GFX11-NEXT: s_and_b32 s4, s4, 1
; GFX11-NEXT: s_and_b32 s2, s2, 1
; GFX11-NEXT: s_lshl_b32 s1, s1, 1
; GFX11-NEXT: s_and_b32 s0, s0, 1
; GFX11-NEXT: s_and_b32 s14, s14, 1
; GFX11-NEXT: s_lshl_b32 s13, s13, 1
; GFX11-NEXT: s_and_b32 s12, s12, 1
; GFX11-NEXT: s_lshl_b32 s11, s11, 3
; GFX11-NEXT: s_lshl_b32 s10, s10, 2
; GFX11-NEXT: s_or_b32 s8, s8, s9
; GFX11-NEXT: s_lshl_b32 s7, s7, 3
; GFX11-NEXT: s_lshl_b32 s6, s6, 2
; GFX11-NEXT: s_or_b32 s4, s4, s5
; GFX11-NEXT: s_lshl_b32 s3, s3, 3
; GFX11-NEXT: s_lshl_b32 s2, s2, 2
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_lshl_b32 s15, s15, 3
; GFX11-NEXT: s_lshl_b32 s14, s14, 2
; GFX11-NEXT: s_or_b32 s12, s12, s13
; GFX11-NEXT: s_or_b32 s9, s11, s10
; GFX11-NEXT: s_and_b32 s8, s8, 3
; GFX11-NEXT: s_or_b32 s5, s7, s6
; GFX11-NEXT: s_and_b32 s4, s4, 3
; GFX11-NEXT: s_or_b32 s1, s3, s2
; GFX11-NEXT: s_and_b32 s0, s0, 3
; GFX11-NEXT: s_or_b32 s13, s15, s14
; GFX11-NEXT: s_and_b32 s12, s12, 3
; GFX11-NEXT: s_or_b32 s8, s8, s9
; GFX11-NEXT: s_or_b32 s2, s4, s5
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_or_b32 s10, s12, s13
; GFX11-NEXT: s_and_b32 s8, s8, 15
; GFX11-NEXT: s_lshl_b32 s1, s2, 4
; GFX11-NEXT: s_and_b32 s0, s0, 15
; GFX11-NEXT: s_lshl_b32 s9, s10, 12
; GFX11-NEXT: s_lshl_b32 s2, s8, 8
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_or_b32 s1, s9, s2
; GFX11-NEXT: s_and_b32 s0, s0, 0xff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b16 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
  store <16 x i1> %arg0, ptr addrspace(1) undef
  ret void
}

; amdgpu_cs entry point taking <32 x i1> marked inreg and storing it.
; The expected output reads the thirty-two elements from s0-s31, combines them
; with scalar and/shift/or instructions and writes the result with a 32-bit
; store. In the SI checks, s3 is shifted into s33 before s3 is overwritten
; with 0xf000 for the s[0:3] operand of the buffer store.
define amdgpu_cs void @amdgpu_cs_inreg_v32i1(<32 x i1> inreg %arg0) {
; SI-LABEL: amdgpu_cs_inreg_v32i1:
; SI: ; %bb.0:
; SI-NEXT: s_lshl_b32 s31, s31, 3
; SI-NEXT: s_and_b32 s30, s30, 1
; SI-NEXT: s_lshl_b32 s29, s29, 1
; SI-NEXT: s_and_b32 s28, s28, 1
; SI-NEXT: s_lshl_b32 s27, s27, 3
; SI-NEXT: s_and_b32 s26, s26, 1
; SI-NEXT: s_lshl_b32 s25, s25, 1
; SI-NEXT: s_and_b32 s24, s24, 1
; SI-NEXT: s_lshl_b32 s23, s23, 3
; SI-NEXT: s_and_b32 s22, s22, 1
; SI-NEXT: s_lshl_b32 s21, s21, 1
; SI-NEXT: s_and_b32 s20, s20, 1
; SI-NEXT: s_lshl_b32 s19, s19, 3
; SI-NEXT: s_and_b32 s18, s18, 1
; SI-NEXT: s_lshl_b32 s17, s17, 1
; SI-NEXT: s_and_b32 s16, s16, 1
; SI-NEXT: s_lshl_b32 s15, s15, 3
; SI-NEXT: s_and_b32 s14, s14, 1
; SI-NEXT: s_lshl_b32 s13, s13, 1
; SI-NEXT: s_and_b32 s12, s12, 1
; SI-NEXT: s_lshl_b32 s11, s11, 3
; SI-NEXT: s_and_b32 s10, s10, 1
; SI-NEXT: s_lshl_b32 s9, s9, 1
; SI-NEXT: s_and_b32 s8, s8, 1
; SI-NEXT: s_lshl_b32 s7, s7, 3
; SI-NEXT: s_and_b32 s6, s6, 1
; SI-NEXT: s_lshl_b32 s5, s5, 1
; SI-NEXT: s_and_b32 s4, s4, 1
; SI-NEXT: s_lshl_b32 s33, s3, 3
; SI-NEXT: s_and_b32 s2, s2, 1
; SI-NEXT: s_lshl_b32 s1, s1, 1
; SI-NEXT: s_and_b32 s0, s0, 1
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_lshl_b32 s30, s30, 2
; SI-NEXT: s_or_b32 s28, s28, s29
; SI-NEXT: s_lshl_b32 s26, s26, 2
; SI-NEXT: s_or_b32 s24, s24, s25
; SI-NEXT: s_lshl_b32 s22, s22, 2
; SI-NEXT: s_or_b32 s20, s20, s21
; SI-NEXT: s_lshl_b32 s18, s18, 2
; SI-NEXT: s_or_b32 s16, s16, s17
; SI-NEXT: s_lshl_b32 s14, s14, 2
; SI-NEXT: s_or_b32 s12, s12, s13
; SI-NEXT: s_lshl_b32 s10, s10, 2
; SI-NEXT: s_or_b32 s8, s8, s9
; SI-NEXT: s_lshl_b32 s6, s6, 2
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: s_lshl_b32 s2, s2, 2
; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: s_or_b32 s1, s31, s30
; SI-NEXT: s_and_b32 s5, s28, 3
; SI-NEXT: s_or_b32 s9, s27, s26
; SI-NEXT: s_and_b32 s13, s24, 3
; SI-NEXT: s_or_b32 s17, s23, s22
; SI-NEXT: s_and_b32 s20, s20, 3
; SI-NEXT: s_or_b32 s18, s19, s18
; SI-NEXT: s_and_b32 s16, s16, 3
; SI-NEXT: s_or_b32 s14, s15, s14
; SI-NEXT: s_and_b32 s12, s12, 3
; SI-NEXT: s_or_b32 s10, s11, s10
; SI-NEXT: s_and_b32 s8, s8, 3
; SI-NEXT: s_or_b32 s6, s7, s6
; SI-NEXT: s_and_b32 s4, s4, 3
; SI-NEXT: s_or_b32 s2, s33, s2
; SI-NEXT: s_and_b32 s0, s0, 3
; SI-NEXT: s_or_b32 s1, s5, s1
; SI-NEXT: s_or_b32 s5, s13, s9
; SI-NEXT: s_or_b32 s7, s20, s17
; SI-NEXT: s_or_b32 s9, s16, s18
; SI-NEXT: s_or_b32 s11, s12, s14
; SI-NEXT: s_or_b32 s8, s8, s10
; SI-NEXT: s_or_b32 s4, s4, s6
; SI-NEXT: s_or_b32 s0, s0, s2
; SI-NEXT: s_lshl_b32 s1, s1, 12
; SI-NEXT: s_and_b32 s2, s5, 15
; SI-NEXT: s_lshl_b32 s5, s7, 4
; SI-NEXT: s_and_b32 s6, s9, 15
; SI-NEXT: s_lshl_b32 s7, s11, 12
; SI-NEXT: s_and_b32 s8, s8, 15
; SI-NEXT: s_lshl_b32 s4, s4, 4
; SI-NEXT: s_and_b32 s0, s0, 15
; SI-NEXT: s_lshl_b32 s2, s2, 8
; SI-NEXT: s_or_b32 s5, s6, s5
; SI-NEXT: s_lshl_b32 s6, s8, 8
; SI-NEXT: s_or_b32 s0, s0, s4
; SI-NEXT: s_or_b32 s1, s1, s2
; SI-NEXT: s_and_b32 s2, s5, 0xff
; SI-NEXT: s_or_b32 s4, s7, s6
; SI-NEXT: s_and_b32 s0, s0, 0xff
; SI-NEXT: s_or_b32 s1, s2, s1
; SI-NEXT: s_or_b32 s0, s0, s4
; SI-NEXT: s_lshl_b32 s1, s1, 16
; SI-NEXT: s_and_b32 s0, s0, 0xffff
; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: amdgpu_cs_inreg_v32i1:
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s26, s26, 1
; VI-NEXT: s_lshl_b32 s25, s25, 1
; VI-NEXT: s_and_b32 s24, s24, 1
; VI-NEXT: s_and_b32 s22, s22, 1
; VI-NEXT: s_lshl_b32 s21, s21, 1
; VI-NEXT: s_and_b32 s20, s20, 1
; VI-NEXT: s_and_b32 s18, s18, 1
; VI-NEXT: s_lshl_b32 s17, s17, 1
; VI-NEXT: s_and_b32 s16, s16, 1
; VI-NEXT: s_and_b32 s10, s10, 1
; VI-NEXT: s_lshl_b32 s9, s9, 1
; VI-NEXT: s_and_b32 s8, s8, 1
; VI-NEXT: s_and_b32 s6, s6, 1
; VI-NEXT: s_lshl_b32 s5, s5, 1
; VI-NEXT: s_and_b32 s4, s4, 1
; VI-NEXT: s_and_b32 s2, s2, 1
; VI-NEXT: s_lshl_b32 s1, s1, 1
; VI-NEXT: s_and_b32 s0, s0, 1
; VI-NEXT: s_and_b32 s30, s30, 1
; VI-NEXT: s_lshl_b32 s29, s29, 1
; VI-NEXT: s_and_b32 s28, s28, 1
; VI-NEXT: s_lshl_b32 s27, s27, 3
; VI-NEXT: s_lshl_b32 s26, s26, 2
; VI-NEXT: s_or_b32 s24, s24, s25
; VI-NEXT: s_lshl_b32 s23, s23, 3
; VI-NEXT: s_lshl_b32 s22, s22, 2
; VI-NEXT: s_or_b32 s20, s20, s21
; VI-NEXT: s_lshl_b32 s19, s19, 3
; VI-NEXT: s_lshl_b32 s18, s18, 2
; VI-NEXT: s_or_b32 s16, s16, s17
; VI-NEXT: s_and_b32 s14, s14, 1
; VI-NEXT: s_lshl_b32 s13, s13, 1
; VI-NEXT: s_and_b32 s12, s12, 1
; VI-NEXT: s_lshl_b32 s11, s11, 3
; VI-NEXT: s_lshl_b32 s10, s10, 2
; VI-NEXT: s_or_b32 s8, s8, s9
; VI-NEXT: s_lshl_b32 s7, s7, 3
; VI-NEXT: s_lshl_b32 s6, s6, 2
; VI-NEXT: s_or_b32 s4, s4, s5
; VI-NEXT: s_lshl_b32 s3, s3, 3
; VI-NEXT: s_lshl_b32 s2, s2, 2
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_lshl_b32 s31, s31, 3
; VI-NEXT: s_lshl_b32 s30, s30, 2
; VI-NEXT: s_or_b32 s28, s28, s29
; VI-NEXT: s_or_b32 s26, s27, s26
; VI-NEXT: s_and_b32 s24, s24, 3
; VI-NEXT: s_or_b32 s22, s23, s22
; VI-NEXT: s_and_b32 s20, s20, 3
; VI-NEXT: s_or_b32 s18, s19, s18
; VI-NEXT: s_and_b32 s16, s16, 3
; VI-NEXT: s_lshl_b32 s15, s15, 3
; VI-NEXT: s_lshl_b32 s14, s14, 2
; VI-NEXT: s_or_b32 s12, s12, s13
; VI-NEXT: s_or_b32 s10, s11, s10
; VI-NEXT: s_and_b32 s8, s8, 3
; VI-NEXT: s_or_b32 s6, s7, s6
; VI-NEXT: s_and_b32 s4, s4, 3
; VI-NEXT: s_or_b32 s2, s3, s2
; VI-NEXT: s_and_b32 s0, s0, 3
; VI-NEXT: s_or_b32 s30, s31, s30
; VI-NEXT: s_and_b32 s28, s28, 3
; VI-NEXT: s_or_b32 s24, s24, s26
; VI-NEXT: s_or_b32 s20, s20, s22
; VI-NEXT: s_or_b32 s16, s16, s18
; VI-NEXT: s_or_b32 s14, s15, s14
; VI-NEXT: s_and_b32 s12, s12, 3
; VI-NEXT: s_or_b32 s8, s8, s10
; VI-NEXT: s_or_b32 s4, s4, s6
; VI-NEXT: s_or_b32 s0, s0, s2
; VI-NEXT: s_or_b32 s28, s28, s30
; VI-NEXT: s_and_b32 s24, s24, 15
; VI-NEXT: s_lshl_b32 s20, s20, 4
; VI-NEXT: s_and_b32 s16, s16, 15
; VI-NEXT: s_or_b32 s12, s12, s14
; VI-NEXT: s_and_b32 s8, s8, 15
; VI-NEXT: s_lshl_b32 s4, s4, 4
; VI-NEXT: s_and_b32 s0, s0, 15
; VI-NEXT: s_lshl_b32 s28, s28, 12
; VI-NEXT: s_lshl_b32 s24, s24, 8
; VI-NEXT: s_or_b32 s16, s16, s20
; VI-NEXT: s_lshl_b32 s12, s12, 12
; VI-NEXT: s_lshl_b32 s8, s8, 8
; VI-NEXT: s_or_b32 s0, s0, s4
; VI-NEXT: s_or_b32 s24, s28, s24
; VI-NEXT: s_and_b32 s16, s16, 0xff
; VI-NEXT: s_or_b32 s8, s12, s8
; VI-NEXT: s_and_b32 s0, s0, 0xff
; VI-NEXT: s_or_b32 s16, s16, s24
; VI-NEXT: s_or_b32 s0, s0, s8
; VI-NEXT: s_lshl_b32 s16, s16, 16
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_or_b32 s0, s0, s16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: flat_store_dword v[0:1], v0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: amdgpu_cs_inreg_v32i1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s10, s10, 1
; GFX11-NEXT: s_lshl_b32 s9, s9, 1
; GFX11-NEXT: s_and_b32 s8, s8, 1
; GFX11-NEXT: s_and_b32 s14, s14, 1
; GFX11-NEXT: s_lshl_b32 s13, s13, 1
; GFX11-NEXT: s_and_b32 s12, s12, 1
; GFX11-NEXT: s_lshl_b32 s11, s11, 3
; GFX11-NEXT: s_lshl_b32 s10, s10, 2
; GFX11-NEXT: s_or_b32 s8, s8, s9
; GFX11-NEXT: s_and_b32 s6, s6, 1
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
; GFX11-NEXT: s_and_b32 s4, s4, 1
; GFX11-NEXT: s_and_b32 s2, s2, 1
; GFX11-NEXT: s_lshl_b32 s1, s1, 1
; GFX11-NEXT: s_and_b32 s0, s0, 1
; GFX11-NEXT: s_lshl_b32 s15, s15, 3
; GFX11-NEXT: s_lshl_b32 s14, s14, 2
; GFX11-NEXT: s_or_b32 s12, s12, s13
; GFX11-NEXT: s_or_b32 s9, s11, s10
; GFX11-NEXT: s_and_b32 s8, s8, 3
; GFX11-NEXT: s_lshl_b32 s7, s7, 3
; GFX11-NEXT: s_lshl_b32 s6, s6, 2
; GFX11-NEXT: s_or_b32 s4, s4, s5
; GFX11-NEXT: s_lshl_b32 s3, s3, 3
; GFX11-NEXT: s_lshl_b32 s2, s2, 2
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_or_b32 s13, s15, s14
; GFX11-NEXT: s_and_b32 s12, s12, 3
; GFX11-NEXT: s_or_b32 s8, s8, s9
; GFX11-NEXT: s_or_b32 s5, s7, s6
; GFX11-NEXT: s_and_b32 s4, s4, 3
; GFX11-NEXT: s_or_b32 s1, s3, s2
; GFX11-NEXT: s_and_b32 s0, s0, 3
; GFX11-NEXT: s_or_b32 s10, s12, s13
; GFX11-NEXT: s_and_b32 s8, s8, 15
; GFX11-NEXT: s_or_b32 s2, s4, s5
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_lshl_b32 s9, s10, 12
; GFX11-NEXT: s_lshl_b32 s1, s2, 4
; GFX11-NEXT: s_and_b32 s0, s0, 15
; GFX11-NEXT: s_lshl_b32 s2, s8, 8
; GFX11-NEXT: s_and_b32 s3, s30, 1
; GFX11-NEXT: s_lshl_b32 s4, s29, 1
; GFX11-NEXT: s_and_b32 s5, s28, 1
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_or_b32 s1, s9, s2
; GFX11-NEXT: s_lshl_b32 s2, s31, 3
; GFX11-NEXT: s_lshl_b32 s3, s3, 2
; GFX11-NEXT: s_or_b32 s4, s5, s4
; GFX11-NEXT: s_and_b32 s5, s26, 1
; GFX11-NEXT: s_lshl_b32 s6, s25, 1
; GFX11-NEXT: s_and_b32 s7, s24, 1
; GFX11-NEXT: s_or_b32 s2, s2, s3
; GFX11-NEXT: s_and_b32 s3, s4, 3
; GFX11-NEXT: s_lshl_b32 s4, s27, 3
; GFX11-NEXT: s_lshl_b32 s5, s5, 2
; GFX11-NEXT: s_or_b32 s6, s7, s6
; GFX11-NEXT: s_or_b32 s4, s4, s5
; GFX11-NEXT: s_and_b32 s5, s6, 3
; GFX11-NEXT: s_or_b32 s2, s3, s2
; GFX11-NEXT: s_or_b32 s3, s5, s4
; GFX11-NEXT: s_and_b32 s5, s22, 1
; GFX11-NEXT: s_lshl_b32 s6, s21, 1
; GFX11-NEXT: s_and_b32 s7, s20, 1
; GFX11-NEXT: s_lshl_b32 s4, s23, 3
; GFX11-NEXT: s_lshl_b32 s5, s5, 2
; GFX11-NEXT: s_or_b32 s6, s7, s6
; GFX11-NEXT: s_and_b32 s7, s18, 1
; GFX11-NEXT: s_lshl_b32 s8, s17, 1
; GFX11-NEXT: s_and_b32 s9, s16, 1
; GFX11-NEXT: s_or_b32 s4, s4, s5
; GFX11-NEXT: s_and_b32 s5, s6, 3
; GFX11-NEXT: s_lshl_b32 s6, s19, 3
; GFX11-NEXT: s_lshl_b32 s7, s7, 2
; GFX11-NEXT: s_or_b32 s8, s9, s8
; GFX11-NEXT: s_or_b32 s6, s6, s7
; GFX11-NEXT: s_and_b32 s7, s8, 3
; GFX11-NEXT: s_or_b32 s4, s5, s4
; GFX11-NEXT: s_or_b32 s5, s7, s6
; GFX11-NEXT: s_and_b32 s3, s3, 15
; GFX11-NEXT: s_lshl_b32 s4, s4, 4
; GFX11-NEXT: s_and_b32 s5, s5, 15
; GFX11-NEXT: s_lshl_b32 s2, s2, 12
; GFX11-NEXT: s_lshl_b32 s3, s3, 8
; GFX11-NEXT: s_or_b32 s4, s5, s4
; GFX11-NEXT: s_and_b32 s0, s0, 0xff
; GFX11-NEXT: s_or_b32 s2, s2, s3
; GFX11-NEXT: s_and_b32 s3, s4, 0xff
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: s_or_b32 s1, s3, s2
; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-NEXT: s_lshl_b32 s1, s1, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
  store <32 x i1> %arg0, ptr addrspace(1) undef
  ret void
}

; amdgpu_cs entry point taking an i1 marked signext (not inreg) and storing it.
; The expected output for all three RUN configurations reads the argument
; from v0 and still masks it to bit 0 (v_and with 1) before the byte store.
define amdgpu_cs void @amdgpu_cs_i1_sext(i1 signext %arg0) {
; SI-LABEL: amdgpu_cs_i1_sext:
; SI: ; %bb.0:
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: amdgpu_cs_i1_sext:
; VI: ; %bb.0:
; VI-NEXT: v_and_b32_e32 v0, 1, v0
; VI-NEXT: flat_store_byte v[0:1], v0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: amdgpu_cs_i1_sext:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
  store i1 %arg0, ptr addrspace(1) undef
  ret void
}

; amdgpu_cs entry point taking an i1 marked zeroext (not inreg) and storing it.
; Unlike the signext variant, the expected output contains no masking
; instruction: v0 is passed to the byte store unchanged.
define amdgpu_cs void @amdgpu_cs_i1_zext(i1 zeroext %arg0) {
; SI-LABEL: amdgpu_cs_i1_zext:
; SI: ; %bb.0:
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: amdgpu_cs_i1_zext:
; VI: ; %bb.0:
; VI-NEXT: flat_store_byte v[0:1], v0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: amdgpu_cs_i1_zext:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
  store i1 %arg0, ptr addrspace(1) undef
  ret void
}

; Attribute group referenced as #0 by function definitions earlier in this file.
attributes #0 = { nounwind noinline }