; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck --check-prefix=EG %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s

; Every function below spells a signed minimum as "icmp sle" followed by
; "select" and stores the result through %out. The check lines are generated
; by the script named in the first line of this file and should be regenerated
; with that script rather than edited by hand.

; Signed min of two i32 values. Both operands are loaded from global memory at
; an element index taken from the workitem id, and the result is stored at the
; same index from %out. The checks expect one integer min instruction
; (MIN_INT for the r600 run, v_min_i32 for the amdgcn runs).
define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; EG-LABEL: v_test_imin_sle_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MIN_INT T0.X, T0.X, T1.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_imin_sle_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_load_dword v5, v[0:1]
; CI-NEXT: flat_load_dword v2, v[2:3]
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_min_i32_e32 v2, v5, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: v_test_imin_sle_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_min_i32_e32 v2, v5, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imin_sle_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_i32_e32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_imin_sle_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_min_i32_e32 v1, v1, v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_imin_sle_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_min_i32_e32 v1, v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
  %a = load i32, ptr addrspace(1) %a.gep, align 4
  %b = load i32, ptr addrspace(1) %b.gep, align 4
  %cmp = icmp sle i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, ptr addrspace(1) %out.gep, align 4
  ret void
}

; Signed min of two i32 values that are passed directly as kernel arguments.
; The amdgcn checks expect s_min_i32 and the r600 checks expect MIN_INT.
define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
; EG-LABEL: s_test_imin_sle_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_min_i32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_min_i32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_min_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %cmp = icmp sle i32 %a, %b
  %val = select i1 %cmp, i32 %a, i32 %b
  store i32 %val, ptr addrspace(1) %out, align 4
  ret void
}

; Same pattern with <1 x i32> kernel arguments. The expected instruction
; sequences are the same as for the scalar i32 kernel-argument test.
define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) #0 {
; EG-LABEL: s_test_imin_sle_v1i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_v1i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_v1i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_v1i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_min_i32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_v1i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_min_i32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_v1i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_min_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %cmp = icmp sle <1 x i32> %a, %b
  %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
  store <1 x i32> %val, ptr addrspace(1) %out
  ret void
}

; Same pattern with <4 x i32> kernel arguments. The checks expect four integer
; min instructions, one per element (s_min_i32 for amdgcn, MIN_INT for r600),
; followed by a single store of all four elements.
define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b) #0 {
; EG-LABEL: s_test_imin_sle_v4i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MIN_INT * T0.W, KC0[4].X, KC0[5].X,
; EG-NEXT: MIN_INT * T0.Z, KC0[3].W, KC0[4].W,
; EG-NEXT: MIN_INT * T0.Y, KC0[3].Z, KC0[4].Z,
; EG-NEXT: MIN_INT * T0.X, KC0[3].Y, KC0[4].Y,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_v4i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s3, s3, s7
; CI-NEXT: s_min_i32 s2, s2, s6
; CI-NEXT: s_min_i32 s1, s1, s5
; CI-NEXT: s_min_i32 s0, s0, s4
; CI-NEXT: v_mov_b32_e32 v4, s8
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: v_mov_b32_e32 v5, s9
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s3, s3, s7
; VI-NEXT: s_min_i32 s2, s2, s6
; VI-NEXT: s_min_i32 s1, s1, s5
; VI-NEXT: s_min_i32 s0, s0, s4
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_min_i32 s3, s3, s7
; GFX9-NEXT: s_min_i32 s2, s2, s6
; GFX9-NEXT: s_min_i32 s1, s1, s5
; GFX9-NEXT: s_min_i32 s0, s0, s4
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_v4i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; GFX10-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_min_i32 s3, s3, s7
; GFX10-NEXT: s_min_i32 s2, s2, s6
; GFX10-NEXT: s_min_i32 s0, s0, s4
; GFX10-NEXT: s_min_i32 s1, s1, s5
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_v4i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x10
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_min_i32 s2, s11, s15
; GFX11-NEXT: s_min_i32 s3, s10, s14
; GFX11-NEXT: s_min_i32 s4, s8, s12
; GFX11-NEXT: s_min_i32 s5, s9, s13
; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: v_mov_b32_e32 v1, s5
; GFX11-NEXT: v_mov_b32_e32 v2, s3
; GFX11-NEXT: v_mov_b32_e32 v3, s2
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
  %cmp = icmp sle <4 x i32> %a, %b
  %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
  store <4 x i32> %val, ptr addrspace(1) %out
  ret void
}

; Signed min of two i8 kernel arguments, each preceded by an unnamed [8 x i32]
; argument. The amdgcn checks expect both operands to be sign-extended with
; s_sext_i32_i8 before s_min_i32 and the result written with a byte store; the
; r600 checks expect BFE_INT on both operands before MIN_INT.
define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], i8 %a, [8 x i32], i8 %b) #0 {
; EG-LABEL: s_test_imin_sle_i8:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T1.X, T0.X, 72, #3
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 108, #3
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45)
; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_i8:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i8 s2, s2
; CI-NEXT: s_sext_i32_i8 s3, s3
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_byte v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i8 s2, s2
; VI-NEXT: s_sext_i32_i8 s3, s3
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28
; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i8 s2, s2
; GFX9-NEXT: s_sext_i32_i8 s3, s3
; GFX9-NEXT: s_min_i32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28
; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sext_i32_i8 s2, s2
; GFX10-NEXT: s_sext_i32_i8 s3, s3
; GFX10-NEXT: s_min_i32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sext_i32_i8 s2, s2
; GFX11-NEXT: s_sext_i32_i8 s3, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_min_i32 s2, s2, s3
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %cmp = icmp sle i8 %a, %b
  %val = select i1 %cmp, i8 %a, i8 %b
  store i8 %val, ptr addrspace(1) %out
  ret void
}

; FIXME: Why vector and sdwa for last element?

; Signed min of two <4 x i8> kernel arguments, each preceded by an unnamed
; [8 x i32] argument. The kaveri and tonga checks expect four s_min_i32 on
; sign-extended bytes, repacked with scalar shift/and/or. The gfx900, gfx1010
; and gfx1100 checks expect four v_min_i16 with the result repacked in vector
; registers.
define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], <4 x i8> %b) #0 {
; EG-LABEL: s_test_imin_sle_v4i8:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @22, KC0[], KC1[]
; EG-NEXT: TEX 7 @6
; EG-NEXT: ALU 30, @23, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T5.X, T4.X, 74, #3
; EG-NEXT: VTX_READ_8 T6.X, T4.X, 108, #3
; EG-NEXT: VTX_READ_8 T7.X, T4.X, 72, #3
; EG-NEXT: VTX_READ_8 T8.X, T4.X, 111, #3
; EG-NEXT: VTX_READ_8 T9.X, T4.X, 75, #3
; EG-NEXT: VTX_READ_8 T10.X, T4.X, 109, #3
; EG-NEXT: VTX_READ_8 T11.X, T4.X, 73, #3
; EG-NEXT: VTX_READ_8 T4.X, T4.X, 110, #3
; EG-NEXT: ALU clause starting at 22:
; EG-NEXT: MOV * T4.X, 0.0,
; EG-NEXT: ALU clause starting at 23:
; EG-NEXT: BFE_INT T0.Z, T5.X, 0.0, literal.x,
; EG-NEXT: BFE_INT * T0.W, T4.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T4.X, T11.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T0.Y, T10.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: BFE_INT * T1.Z, T9.X, 0.0, literal.x, BS:VEC_201
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T1.W, T8.X, 0.0, literal.x,
; EG-NEXT: MIN_INT * T0.W, T0.Z, T0.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MIN_INT T0.Z, T1.Z, PV.W,
; EG-NEXT: AND_INT T0.W, PS, literal.x,
; EG-NEXT: MIN_INT * T1.W, T4.X, T0.Y,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: AND_INT T4.X, PS, literal.x,
; EG-NEXT: LSHL T0.Y, PV.W, literal.y,
; EG-NEXT: BFE_INT T1.Z, T7.X, 0.0, literal.z,
; EG-NEXT: BFE_INT T0.W, T6.X, 0.0, literal.z, BS:VEC_120/SCL_212
; EG-NEXT: LSHL * T1.W, PV.Z, literal.w,
; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
; EG-NEXT: MIN_INT T0.Z, PV.Z, PV.W,
; EG-NEXT: OR_INT T0.W, PS, PV.Y,
; EG-NEXT: LSHL * T1.W, PV.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT T0.W, PV.W, PS,
; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: OR_INT T4.X, PV.W, PS,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_v4i8:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s4, s2, 24
; CI-NEXT: s_sext_i32_i8 s5, s2
; CI-NEXT: s_bfe_i32 s6, s2, 0x80008
; CI-NEXT: s_bfe_i32 s2, s2, 0x80010
; CI-NEXT: s_ashr_i32 s7, s3, 24
; CI-NEXT: s_sext_i32_i8 s8, s3
; CI-NEXT: s_bfe_i32 s9, s3, 0x80008
; CI-NEXT: s_bfe_i32 s3, s3, 0x80010
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: s_min_i32 s4, s4, s7
; CI-NEXT: s_and_b32 s2, s2, 0xff
; CI-NEXT: s_lshl_b32 s4, s4, 24
; CI-NEXT: s_lshl_b32 s2, s2, 16
; CI-NEXT: s_or_b32 s2, s4, s2
; CI-NEXT: s_min_i32 s3, s6, s9
; CI-NEXT: s_min_i32 s4, s5, s8
; CI-NEXT: s_lshl_b32 s3, s3, 8
; CI-NEXT: s_and_b32 s4, s4, 0xff
; CI-NEXT: s_or_b32 s3, s4, s3
; CI-NEXT: s_and_b32 s3, s3, 0xffff
; CI-NEXT: s_or_b32 s2, s3, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_v4i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s4, s2, 24
; VI-NEXT: s_bfe_i32 s5, s2, 0x80010
; VI-NEXT: s_bfe_i32 s6, s2, 0x80008
; VI-NEXT: s_sext_i32_i8 s2, s2
; VI-NEXT: s_ashr_i32 s7, s3, 24
; VI-NEXT: s_bfe_i32 s8, s3, 0x80010
; VI-NEXT: s_bfe_i32 s9, s3, 0x80008
; VI-NEXT: s_sext_i32_i8 s3, s3
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: s_min_i32 s3, s6, s9
; VI-NEXT: s_min_i32 s5, s5, s8
; VI-NEXT: s_min_i32 s4, s4, s7
; VI-NEXT: s_and_b32 s5, s5, 0xff
; VI-NEXT: s_lshl_b32 s3, s3, 8
; VI-NEXT: s_and_b32 s2, s2, 0xff
; VI-NEXT: s_lshl_b32 s4, s4, 24
; VI-NEXT: s_lshl_b32 s5, s5, 16
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: s_or_b32 s4, s4, s5
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_or_b32 s2, s2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_v4i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s5, s2, 16
; GFX9-NEXT: s_lshr_b32 s8, s3, 16
; GFX9-NEXT: s_ashr_i32 s9, s3, 24
; GFX9-NEXT: s_ashr_i32 s6, s2, 24
; GFX9-NEXT: s_bfe_i32 s8, s8, 0x80000
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: s_bfe_i32 s5, s5, 0x80000
; GFX9-NEXT: s_sext_i32_i16 s7, s3
; GFX9-NEXT: v_min_i16_e32 v1, s6, v1
; GFX9-NEXT: v_mov_b32_e32 v2, s8
; GFX9-NEXT: s_sext_i32_i16 s4, s2
; GFX9-NEXT: s_lshr_b32 s7, s7, 8
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX9-NEXT: v_min_i16_e32 v2, s5, v2
; GFX9-NEXT: s_lshr_b32 s4, s4, 8
; GFX9-NEXT: s_bfe_i32 s3, s3, 0x80000
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: s_bfe_i32 s2, s2, 0x80000
; GFX9-NEXT: v_min_i16_e32 v2, s4, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX9-NEXT: v_min_i16_e32 v3, s2, v3
; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_v4i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28
; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sext_i32_i16 s4, s2
; GFX10-NEXT: s_sext_i32_i16 s7, s3
; GFX10-NEXT: s_ashr_i32 s6, s2, 24
; GFX10-NEXT: s_ashr_i32 s9, s3, 24
; GFX10-NEXT: s_lshr_b32 s4, s4, 8
; GFX10-NEXT: s_lshr_b32 s7, s7, 8
; GFX10-NEXT: v_min_i16 v0, s6, s9
; GFX10-NEXT: v_min_i16 v1, s4, s7
; GFX10-NEXT: s_lshr_b32 s5, s2, 16
; GFX10-NEXT: s_lshr_b32 s8, s3, 16
; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80000
; GFX10-NEXT: s_bfe_i32 s5, s5, 0x80000
; GFX10-NEXT: s_bfe_i32 s4, s8, 0x80000
; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80000
; GFX10-NEXT: v_min_i16 v2, s5, s4
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX10-NEXT: v_min_i16 v3, s2, s3
; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_v4i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x28
; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x4c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sext_i32_i16 s2, s0
; GFX11-NEXT: s_lshr_b32 s3, s0, 16
; GFX11-NEXT: s_sext_i32_i16 s7, s1
; GFX11-NEXT: s_lshr_b32 s8, s1, 16
; GFX11-NEXT: s_ashr_i32 s6, s0, 24
; GFX11-NEXT: s_bfe_i32 s0, s0, 0x80000
; GFX11-NEXT: s_ashr_i32 s9, s1, 24
; GFX11-NEXT: s_bfe_i32 s1, s1, 0x80000
; GFX11-NEXT: s_lshr_b32 s2, s2, 8
; GFX11-NEXT: s_bfe_i32 s3, s3, 0x80000
; GFX11-NEXT: s_lshr_b32 s7, s7, 8
; GFX11-NEXT: s_bfe_i32 s8, s8, 0x80000
; GFX11-NEXT: v_min_i16 v0, s6, s9
; GFX11-NEXT: v_min_i16 v1, s0, s1
; GFX11-NEXT: v_min_i16 v2, s3, s8
; GFX11-NEXT: v_min_i16 v3, s2, s7
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-NEXT: s_endpgm
  %cmp = icmp sle <4 x i8> %a, %b
  %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
  store <4 x i8> %val, ptr addrspace(1) %out
  ret void
}

; Signed min of two <2 x i16> kernel arguments. The kaveri and tonga checks
; expect each half to be sign-extended, two s_min_i32, and a scalar repack.
; The gfx900, gfx1010 and gfx1100 checks expect a single v_pk_min_i16.
define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #0 {
; EG-LABEL: s_test_imin_sle_v2i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[], KC1[]
; EG-NEXT: TEX 3 @6
; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3
; EG-NEXT: VTX_READ_16 T6.X, T4.X, 44, #3
; EG-NEXT: VTX_READ_16 T7.X, T4.X, 40, #3
; EG-NEXT: VTX_READ_16 T4.X, T4.X, 46, #3
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T4.X, 0.0,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T0.Y, T4.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: BFE_INT * T0.Z, T7.X, 0.0, literal.x, BS:VEC_201
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT * T0.W, T6.X, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: MIN_INT T0.W, T0.Z, PV.W,
; EG-NEXT: MIN_INT * T1.W, T5.X, T0.Y,
; EG-NEXT: LSHL T1.W, PS, literal.x,
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT T4.X, PV.W, PS,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s4, s2, 16
; CI-NEXT: s_sext_i32_i16 s2, s2
; CI-NEXT: s_ashr_i32 s5, s3, 16
; CI-NEXT: s_sext_i32_i16 s3, s3
; CI-NEXT: s_min_i32 s4, s4, s5
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: s_lshl_b32 s3, s4, 16
; CI-NEXT: s_and_b32 s2, s2, 0xffff
; CI-NEXT: s_or_b32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: s_test_imin_sle_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s4, s2, 16
; VI-NEXT: s_sext_i32_i16 s2, s2
; VI-NEXT: s_ashr_i32 s5, s3, 16
; VI-NEXT: s_sext_i32_i16 s3, s3
; VI-NEXT: s_min_i32 s4, s4, s5
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: s_lshl_b32 s3, s4, 16
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_pk_min_i16 v1, s2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_min_i16 v1, s2, s3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_min_i16 v1, s2, s3
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %cmp = icmp sle <2 x i16> %a, %b
  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %val, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr
addrspace(1) %out, <4 x i16> %a, <4 x i16> %b) #0 { 829; EG-LABEL: s_test_imin_sle_v4i16: 830; EG: ; %bb.0: 831; EG-NEXT: ALU 1, @28, KC0[], KC1[] 832; EG-NEXT: TEX 1 @12 833; EG-NEXT: ALU 9, @30, KC0[], KC1[] 834; EG-NEXT: TEX 1 @16 835; EG-NEXT: ALU 10, @40, KC0[], KC1[] 836; EG-NEXT: TEX 1 @20 837; EG-NEXT: ALU 10, @51, KC0[], KC1[] 838; EG-NEXT: TEX 1 @24 839; EG-NEXT: ALU 11, @62, KC0[CB0:0-32], KC1[] 840; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XY, T5.X, 1 841; EG-NEXT: CF_END 842; EG-NEXT: PAD 843; EG-NEXT: Fetch clause starting at 12: 844; EG-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3 845; EG-NEXT: VTX_READ_16 T7.X, T5.X, 58, #3 846; EG-NEXT: Fetch clause starting at 16: 847; EG-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3 848; EG-NEXT: VTX_READ_16 T7.X, T5.X, 56, #3 849; EG-NEXT: Fetch clause starting at 20: 850; EG-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3 851; EG-NEXT: VTX_READ_16 T7.X, T5.X, 54, #3 852; EG-NEXT: Fetch clause starting at 24: 853; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3 854; EG-NEXT: VTX_READ_16 T5.X, T5.X, 52, #3 855; EG-NEXT: ALU clause starting at 28: 856; EG-NEXT: MOV * T0.Y, T3.X, 857; EG-NEXT: MOV * T5.X, 0.0, 858; EG-NEXT: ALU clause starting at 30: 859; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x, 860; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212 861; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 862; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W, 863; EG-NEXT: LSHL T0.W, PV.W, literal.x, 864; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 865; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 866; EG-NEXT: OR_INT * T0.W, PS, PV.W, 867; EG-NEXT: MOV * T3.X, PV.W, 868; EG-NEXT: MOV * T0.Y, PV.X, 869; EG-NEXT: ALU clause starting at 40: 870; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x, 871; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212 872; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 873; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W, 874; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, 875; EG-NEXT: -65536(nan), 0(0.000000e+00) 876; EG-NEXT: AND_INT 
* T0.W, PV.W, literal.x, 877; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 878; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 879; EG-NEXT: MOV T3.X, PV.W, 880; EG-NEXT: MOV * T0.Y, T2.X, 881; EG-NEXT: ALU clause starting at 51: 882; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x, 883; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212 884; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 885; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W, 886; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, 887; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 888; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 889; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 890; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 891; EG-NEXT: MOV * T2.X, PV.W, 892; EG-NEXT: MOV * T0.Y, PV.X, 893; EG-NEXT: ALU clause starting at 62: 894; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x, 895; EG-NEXT: BFE_INT * T0.W, T5.X, 0.0, literal.x, BS:VEC_120/SCL_212 896; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 897; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W, 898; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x, 899; EG-NEXT: AND_INT T1.W, T0.Y, literal.y, 900; EG-NEXT: AND_INT * T0.W, PV.W, literal.z, 901; EG-NEXT: 2(2.802597e-45), -65536(nan) 902; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 903; EG-NEXT: OR_INT * T6.X, PV.W, PS, 904; EG-NEXT: MOV T2.X, PV.X, 905; EG-NEXT: MOV * T6.Y, T3.X, 906; 907; CI-LABEL: s_test_imin_sle_v4i16: 908; CI: ; %bb.0: 909; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 910; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 911; CI-NEXT: s_waitcnt lgkmcnt(0) 912; CI-NEXT: s_ashr_i32 s6, s0, 16 913; CI-NEXT: s_ashr_i32 s7, s1, 16 914; CI-NEXT: s_sext_i32_i16 s0, s0 915; CI-NEXT: s_sext_i32_i16 s1, s1 916; CI-NEXT: s_ashr_i32 s8, s2, 16 917; CI-NEXT: s_ashr_i32 s9, s3, 16 918; CI-NEXT: s_sext_i32_i16 s2, s2 919; CI-NEXT: s_sext_i32_i16 s3, s3 920; CI-NEXT: s_min_i32 s7, s7, s9 921; CI-NEXT: s_min_i32 s1, s1, s3 922; CI-NEXT: s_min_i32 s3, s6, s8 923; CI-NEXT: s_min_i32 s0, s0, s2 924; CI-NEXT: s_lshl_b32 s7, s7, 16 925; CI-NEXT: s_and_b32 s1, s1, 
0xffff 926; CI-NEXT: s_lshl_b32 s3, s3, 16 927; CI-NEXT: s_and_b32 s0, s0, 0xffff 928; CI-NEXT: s_or_b32 s1, s1, s7 929; CI-NEXT: s_or_b32 s0, s0, s3 930; CI-NEXT: v_mov_b32_e32 v2, s4 931; CI-NEXT: v_mov_b32_e32 v0, s0 932; CI-NEXT: v_mov_b32_e32 v1, s1 933; CI-NEXT: v_mov_b32_e32 v3, s5 934; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 935; CI-NEXT: s_endpgm 936; 937; VI-LABEL: s_test_imin_sle_v4i16: 938; VI: ; %bb.0: 939; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 940; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 941; VI-NEXT: s_waitcnt lgkmcnt(0) 942; VI-NEXT: s_ashr_i32 s6, s1, 16 943; VI-NEXT: s_sext_i32_i16 s1, s1 944; VI-NEXT: s_ashr_i32 s8, s3, 16 945; VI-NEXT: s_sext_i32_i16 s3, s3 946; VI-NEXT: s_ashr_i32 s7, s0, 16 947; VI-NEXT: s_sext_i32_i16 s0, s0 948; VI-NEXT: s_ashr_i32 s9, s2, 16 949; VI-NEXT: s_sext_i32_i16 s2, s2 950; VI-NEXT: s_min_i32 s6, s6, s8 951; VI-NEXT: s_min_i32 s1, s1, s3 952; VI-NEXT: s_min_i32 s7, s7, s9 953; VI-NEXT: s_min_i32 s0, s0, s2 954; VI-NEXT: s_lshl_b32 s2, s6, 16 955; VI-NEXT: s_and_b32 s1, s1, 0xffff 956; VI-NEXT: s_or_b32 s1, s1, s2 957; VI-NEXT: s_lshl_b32 s2, s7, 16 958; VI-NEXT: s_and_b32 s0, s0, 0xffff 959; VI-NEXT: s_or_b32 s0, s0, s2 960; VI-NEXT: v_mov_b32_e32 v2, s4 961; VI-NEXT: v_mov_b32_e32 v0, s0 962; VI-NEXT: v_mov_b32_e32 v1, s1 963; VI-NEXT: v_mov_b32_e32 v3, s5 964; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 965; VI-NEXT: s_endpgm 966; 967; GFX9-LABEL: s_test_imin_sle_v4i16: 968; GFX9: ; %bb.0: 969; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 970; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 971; GFX9-NEXT: v_mov_b32_e32 v2, 0 972; GFX9-NEXT: s_waitcnt lgkmcnt(0) 973; GFX9-NEXT: v_mov_b32_e32 v0, s3 974; GFX9-NEXT: v_mov_b32_e32 v3, s2 975; GFX9-NEXT: v_pk_min_i16 v1, s1, v0 976; GFX9-NEXT: v_pk_min_i16 v0, s0, v3 977; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 978; GFX9-NEXT: s_endpgm 979; 980; GFX10-LABEL: s_test_imin_sle_v4i16: 981; GFX10: ; %bb.0: 982; GFX10-NEXT: s_clause 0x1 983; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 984; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 985; GFX10-NEXT: v_mov_b32_e32 v2, 0 986; GFX10-NEXT: s_waitcnt lgkmcnt(0) 987; GFX10-NEXT: v_pk_min_i16 v1, s1, s3 988; GFX10-NEXT: v_pk_min_i16 v0, s0, s2 989; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 990; GFX10-NEXT: s_endpgm 991; 992; GFX11-LABEL: s_test_imin_sle_v4i16: 993; GFX11: ; %bb.0: 994; GFX11-NEXT: s_clause 0x1 995; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 996; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 997; GFX11-NEXT: v_mov_b32_e32 v2, 0 998; GFX11-NEXT: s_waitcnt lgkmcnt(0) 999; GFX11-NEXT: v_pk_min_i16 v1, s1, s3 1000; GFX11-NEXT: v_pk_min_i16 v0, s0, s2 1001; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] 1002; GFX11-NEXT: s_endpgm 1003 %cmp = icmp sle <4 x i16> %a, %b 1004 %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b 1005 store <4 x i16> %val, ptr addrspace(1) %out 1006 ret void 1007} 1008 1009define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 { 1010; EG-LABEL: v_test_imin_slt_i32: 1011; EG: ; %bb.0: 1012; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] 1013; EG-NEXT: TEX 1 @6 1014; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[] 1015; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1016; EG-NEXT: CF_END 1017; EG-NEXT: PAD 1018; EG-NEXT: Fetch clause starting at 6: 1019; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 1020; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1021; EG-NEXT: ALU clause starting at 10: 1022; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1023; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1024; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, 1025; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, 1026; EG-NEXT: ALU clause starting at 14: 1027; EG-NEXT: MIN_INT T0.X, T0.X, T1.X, 1028; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, 1029; EG-NEXT: LSHR * T1.X, PV.W, literal.x, 1030; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1031; 1032; CI-LABEL: v_test_imin_slt_i32: 1033; CI: ; %bb.0: 1034; 
CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1035; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 1036; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 1037; CI-NEXT: s_waitcnt lgkmcnt(0) 1038; CI-NEXT: v_mov_b32_e32 v1, s3 1039; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 1040; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1041; CI-NEXT: v_mov_b32_e32 v3, s5 1042; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 1043; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1044; CI-NEXT: flat_load_dword v5, v[0:1] 1045; CI-NEXT: flat_load_dword v2, v[2:3] 1046; CI-NEXT: v_mov_b32_e32 v1, s1 1047; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4 1048; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1049; CI-NEXT: s_waitcnt vmcnt(0) 1050; CI-NEXT: v_min_i32_e32 v2, v5, v2 1051; CI-NEXT: flat_store_dword v[0:1], v2 1052; CI-NEXT: s_endpgm 1053; 1054; VI-LABEL: v_test_imin_slt_i32: 1055; VI: ; %bb.0: 1056; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1057; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1058; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 1059; VI-NEXT: s_waitcnt lgkmcnt(0) 1060; VI-NEXT: v_mov_b32_e32 v1, s3 1061; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 1062; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1063; VI-NEXT: v_mov_b32_e32 v3, s5 1064; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 1065; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1066; VI-NEXT: flat_load_dword v5, v[0:1] 1067; VI-NEXT: flat_load_dword v2, v[2:3] 1068; VI-NEXT: v_mov_b32_e32 v1, s1 1069; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 1070; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1071; VI-NEXT: s_waitcnt vmcnt(0) 1072; VI-NEXT: v_min_i32_e32 v2, v5, v2 1073; VI-NEXT: flat_store_dword v[0:1], v2 1074; VI-NEXT: s_endpgm 1075; 1076; GFX9-LABEL: v_test_imin_slt_i32: 1077; GFX9: ; %bb.0: 1078; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1079; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1080; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1081; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1082; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1083; GFX9-NEXT: global_load_dword v2, v0, 
s[4:5] 1084; GFX9-NEXT: s_waitcnt vmcnt(0) 1085; GFX9-NEXT: v_min_i32_e32 v1, v1, v2 1086; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1087; GFX9-NEXT: s_endpgm 1088; 1089; GFX10-LABEL: v_test_imin_slt_i32: 1090; GFX10: ; %bb.0: 1091; GFX10-NEXT: s_clause 0x1 1092; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1093; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1094; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1095; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1096; GFX10-NEXT: s_clause 0x1 1097; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 1098; GFX10-NEXT: global_load_dword v2, v0, s[4:5] 1099; GFX10-NEXT: s_waitcnt vmcnt(0) 1100; GFX10-NEXT: v_min_i32_e32 v1, v1, v2 1101; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1102; GFX10-NEXT: s_endpgm 1103; 1104; GFX11-LABEL: v_test_imin_slt_i32: 1105; GFX11: ; %bb.0: 1106; GFX11-NEXT: s_clause 0x1 1107; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1108; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 1109; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1110; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1111; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1112; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1113; GFX11-NEXT: s_clause 0x1 1114; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1115; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] 1116; GFX11-NEXT: s_waitcnt vmcnt(0) 1117; GFX11-NEXT: v_min_i32_e32 v1, v1, v2 1118; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1119; GFX11-NEXT: s_endpgm 1120 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1121 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid 1122 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid 1123 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid 1124 %a = load i32, ptr addrspace(1) %a.gep, align 4 1125 %b = load i32, ptr addrspace(1) %b.gep, align 4 1126 %cmp = icmp slt i32 %a, %b 1127 %val = select i1 %cmp, i32 %a, i32 %b 1128 store i32 %val, ptr addrspace(1) %out.gep, align 4 1129 ret void 1130} 1131 1132define amdgpu_kernel void 
@v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 { 1133; EG-LABEL: v_test_imin_slt_i16: 1134; EG: ; %bb.0: 1135; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[] 1136; EG-NEXT: TEX 0 @8 1137; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] 1138; EG-NEXT: TEX 0 @10 1139; EG-NEXT: ALU 16, @15, KC0[CB0:0-32], KC1[] 1140; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X 1141; EG-NEXT: CF_END 1142; EG-NEXT: PAD 1143; EG-NEXT: Fetch clause starting at 8: 1144; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1145; EG-NEXT: Fetch clause starting at 10: 1146; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1 1147; EG-NEXT: ALU clause starting at 12: 1148; EG-NEXT: LSHL * T0.W, T0.X, 1, 1149; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W, 1150; EG-NEXT: ALU clause starting at 14: 1151; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, T0.W, 1152; EG-NEXT: ALU clause starting at 15: 1153; EG-NEXT: BFE_INT T0.Z, T0.X, 0.0, literal.x, 1154; EG-NEXT: BFE_INT T1.W, T1.X, 0.0, literal.x, BS:VEC_120/SCL_212 1155; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, 1156; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1157; EG-NEXT: AND_INT T2.W, PS, literal.x, 1158; EG-NEXT: MIN_INT * T1.W, PV.W, PV.Z, 1159; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1160; EG-NEXT: AND_INT T1.W, PS, literal.x, 1161; EG-NEXT: LSHL * T2.W, PV.W, literal.y, 1162; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1163; EG-NEXT: LSHL T1.X, PV.W, PS, 1164; EG-NEXT: LSHL * T1.W, literal.x, PS, 1165; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1166; EG-NEXT: MOV T1.Y, 0.0, 1167; EG-NEXT: MOV * T1.Z, 0.0, 1168; EG-NEXT: LSHR * T0.X, T0.W, literal.x, 1169; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1170; 1171; CI-LABEL: v_test_imin_slt_i16: 1172; CI: ; %bb.0: 1173; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1174; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 1175; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 1176; CI-NEXT: s_waitcnt lgkmcnt(0) 1177; CI-NEXT: v_mov_b32_e32 v1, s3 1178; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 1179; CI-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v1, vcc 1180; CI-NEXT: v_mov_b32_e32 v3, s5 1181; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 1182; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1183; CI-NEXT: flat_load_sshort v5, v[0:1] 1184; CI-NEXT: flat_load_sshort v2, v[2:3] 1185; CI-NEXT: v_mov_b32_e32 v1, s1 1186; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4 1187; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1188; CI-NEXT: s_waitcnt vmcnt(0) 1189; CI-NEXT: v_min_i32_e32 v2, v5, v2 1190; CI-NEXT: flat_store_short v[0:1], v2 1191; CI-NEXT: s_endpgm 1192; 1193; VI-LABEL: v_test_imin_slt_i16: 1194; VI: ; %bb.0: 1195; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1196; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1197; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 1198; VI-NEXT: s_waitcnt lgkmcnt(0) 1199; VI-NEXT: v_mov_b32_e32 v1, s3 1200; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 1201; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1202; VI-NEXT: v_mov_b32_e32 v3, s5 1203; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 1204; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1205; VI-NEXT: flat_load_ushort v5, v[0:1] 1206; VI-NEXT: flat_load_ushort v2, v[2:3] 1207; VI-NEXT: v_mov_b32_e32 v1, s1 1208; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 1209; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1210; VI-NEXT: s_waitcnt vmcnt(0) 1211; VI-NEXT: v_min_i16_e32 v2, v5, v2 1212; VI-NEXT: flat_store_short v[0:1], v2 1213; VI-NEXT: s_endpgm 1214; 1215; GFX9-LABEL: v_test_imin_slt_i16: 1216; GFX9: ; %bb.0: 1217; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1218; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1219; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1220; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1221; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] 1222; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] 1223; GFX9-NEXT: s_waitcnt vmcnt(0) 1224; GFX9-NEXT: v_min_i16_e32 v1, v1, v2 1225; GFX9-NEXT: global_store_short v0, v1, s[0:1] 1226; GFX9-NEXT: s_endpgm 1227; 1228; GFX10-LABEL: v_test_imin_slt_i16: 1229; GFX10: ; %bb.0: 1230; GFX10-NEXT: s_clause 0x1 1231; GFX10-NEXT: 
s_load_dwordx4 s[0:3], s[8:9], 0x0 1232; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1233; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1234; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1235; GFX10-NEXT: s_clause 0x1 1236; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] 1237; GFX10-NEXT: global_load_ushort v2, v0, s[4:5] 1238; GFX10-NEXT: s_waitcnt vmcnt(0) 1239; GFX10-NEXT: v_min_i16 v1, v1, v2 1240; GFX10-NEXT: global_store_short v0, v1, s[0:1] 1241; GFX10-NEXT: s_endpgm 1242; 1243; GFX11-LABEL: v_test_imin_slt_i16: 1244; GFX11: ; %bb.0: 1245; GFX11-NEXT: s_clause 0x1 1246; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1247; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 1248; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1249; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1250; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1251; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1252; GFX11-NEXT: s_clause 0x1 1253; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] 1254; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] 1255; GFX11-NEXT: s_waitcnt vmcnt(0) 1256; GFX11-NEXT: v_min_i16 v1, v1, v2 1257; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 1258; GFX11-NEXT: s_endpgm 1259 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1260 %a.gep = getelementptr inbounds i16, ptr addrspace(1) %aptr, i32 %tid 1261 %b.gep = getelementptr inbounds i16, ptr addrspace(1) %bptr, i32 %tid 1262 %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid 1263 1264 %a = load i16, ptr addrspace(1) %a.gep 1265 %b = load i16, ptr addrspace(1) %b.gep 1266 %cmp = icmp slt i16 %a, %b 1267 %val = select i1 %cmp, i16 %a, i16 %b 1268 store i16 %val, ptr addrspace(1) %out.gep 1269 ret void 1270} 1271 1272define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { 1273; EG-LABEL: s_test_imin_slt_i32: 1274; EG: ; %bb.0: 1275; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 1276; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 1277; EG-NEXT: CF_END 1278; EG-NEXT: PAD 1279; EG-NEXT: ALU clause starting at 4: 1280; EG-NEXT: LSHR 
T0.X, KC0[2].Y, literal.x, 1281; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, KC0[2].W, 1282; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1283; 1284; CI-LABEL: s_test_imin_slt_i32: 1285; CI: ; %bb.0: 1286; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1287; CI-NEXT: s_waitcnt lgkmcnt(0) 1288; CI-NEXT: s_min_i32 s2, s2, s3 1289; CI-NEXT: v_mov_b32_e32 v0, s0 1290; CI-NEXT: v_mov_b32_e32 v1, s1 1291; CI-NEXT: v_mov_b32_e32 v2, s2 1292; CI-NEXT: flat_store_dword v[0:1], v2 1293; CI-NEXT: s_endpgm 1294; 1295; VI-LABEL: s_test_imin_slt_i32: 1296; VI: ; %bb.0: 1297; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1298; VI-NEXT: s_waitcnt lgkmcnt(0) 1299; VI-NEXT: s_min_i32 s2, s2, s3 1300; VI-NEXT: v_mov_b32_e32 v0, s0 1301; VI-NEXT: v_mov_b32_e32 v1, s1 1302; VI-NEXT: v_mov_b32_e32 v2, s2 1303; VI-NEXT: flat_store_dword v[0:1], v2 1304; VI-NEXT: s_endpgm 1305; 1306; GFX9-LABEL: s_test_imin_slt_i32: 1307; GFX9: ; %bb.0: 1308; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1309; GFX9-NEXT: v_mov_b32_e32 v0, 0 1310; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1311; GFX9-NEXT: s_min_i32 s2, s2, s3 1312; GFX9-NEXT: v_mov_b32_e32 v1, s2 1313; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1314; GFX9-NEXT: s_endpgm 1315; 1316; GFX10-LABEL: s_test_imin_slt_i32: 1317; GFX10: ; %bb.0: 1318; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1319; GFX10-NEXT: v_mov_b32_e32 v0, 0 1320; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1321; GFX10-NEXT: s_min_i32 s2, s2, s3 1322; GFX10-NEXT: v_mov_b32_e32 v1, s2 1323; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1324; GFX10-NEXT: s_endpgm 1325; 1326; GFX11-LABEL: s_test_imin_slt_i32: 1327; GFX11: ; %bb.0: 1328; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1329; GFX11-NEXT: v_mov_b32_e32 v0, 0 1330; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1331; GFX11-NEXT: s_min_i32 s2, s2, s3 1332; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1333; GFX11-NEXT: v_mov_b32_e32 v1, s2 1334; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1335; GFX11-NEXT: s_endpgm 1336 %cmp = icmp slt i32 %a, %b 1337 %val = select 
i1 %cmp, i32 %a, i32 %b 1338 store i32 %val, ptr addrspace(1) %out, align 4 1339 ret void 1340} 1341 1342define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 { 1343; EG-LABEL: s_test_imin_slt_v2i32: 1344; EG: ; %bb.0: 1345; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 1346; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1347; EG-NEXT: CF_END 1348; EG-NEXT: PAD 1349; EG-NEXT: ALU clause starting at 4: 1350; EG-NEXT: MIN_INT * T0.Y, KC0[3].X, KC0[3].Z, 1351; EG-NEXT: MIN_INT * T0.X, KC0[2].W, KC0[3].Y, 1352; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1353; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1354; 1355; CI-LABEL: s_test_imin_slt_v2i32: 1356; CI: ; %bb.0: 1357; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 1358; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 1359; CI-NEXT: s_waitcnt lgkmcnt(0) 1360; CI-NEXT: s_min_i32 s1, s1, s3 1361; CI-NEXT: s_min_i32 s0, s0, s2 1362; CI-NEXT: v_mov_b32_e32 v2, s4 1363; CI-NEXT: v_mov_b32_e32 v0, s0 1364; CI-NEXT: v_mov_b32_e32 v1, s1 1365; CI-NEXT: v_mov_b32_e32 v3, s5 1366; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1367; CI-NEXT: s_endpgm 1368; 1369; VI-LABEL: s_test_imin_slt_v2i32: 1370; VI: ; %bb.0: 1371; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 1372; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 1373; VI-NEXT: s_waitcnt lgkmcnt(0) 1374; VI-NEXT: s_min_i32 s1, s1, s3 1375; VI-NEXT: s_min_i32 s0, s0, s2 1376; VI-NEXT: v_mov_b32_e32 v2, s4 1377; VI-NEXT: v_mov_b32_e32 v0, s0 1378; VI-NEXT: v_mov_b32_e32 v1, s1 1379; VI-NEXT: v_mov_b32_e32 v3, s5 1380; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1381; VI-NEXT: s_endpgm 1382; 1383; GFX9-LABEL: s_test_imin_slt_v2i32: 1384; GFX9: ; %bb.0: 1385; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 1386; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 1387; GFX9-NEXT: v_mov_b32_e32 v2, 0 1388; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1389; GFX9-NEXT: s_min_i32 s1, s1, s3 1390; GFX9-NEXT: s_min_i32 s0, s0, s2 1391; GFX9-NEXT: v_mov_b32_e32 v0, s0 1392; 
GFX9-NEXT: v_mov_b32_e32 v1, s1 1393; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 1394; GFX9-NEXT: s_endpgm 1395; 1396; GFX10-LABEL: s_test_imin_slt_v2i32: 1397; GFX10: ; %bb.0: 1398; GFX10-NEXT: s_clause 0x1 1399; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8 1400; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 1401; GFX10-NEXT: v_mov_b32_e32 v2, 0 1402; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1403; GFX10-NEXT: s_min_i32 s0, s0, s2 1404; GFX10-NEXT: s_min_i32 s1, s1, s3 1405; GFX10-NEXT: v_mov_b32_e32 v0, s0 1406; GFX10-NEXT: v_mov_b32_e32 v1, s1 1407; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 1408; GFX10-NEXT: s_endpgm 1409; 1410; GFX11-LABEL: s_test_imin_slt_v2i32: 1411; GFX11: ; %bb.0: 1412; GFX11-NEXT: s_clause 0x1 1413; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 1414; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 1415; GFX11-NEXT: v_mov_b32_e32 v2, 0 1416; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1417; GFX11-NEXT: s_min_i32 s0, s0, s2 1418; GFX11-NEXT: s_min_i32 s1, s1, s3 1419; GFX11-NEXT: v_mov_b32_e32 v0, s0 1420; GFX11-NEXT: v_mov_b32_e32 v1, s1 1421; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] 1422; GFX11-NEXT: s_endpgm 1423 %cmp = icmp slt <2 x i32> %a, %b 1424 %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b 1425 store <2 x i32> %val, ptr addrspace(1) %out 1426 ret void 1427} 1428 1429define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a) #0 { 1430; EG-LABEL: s_test_imin_slt_imm_i32: 1431; EG: ; %bb.0: 1432; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 1433; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 1434; EG-NEXT: CF_END 1435; EG-NEXT: PAD 1436; EG-NEXT: ALU clause starting at 4: 1437; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 1438; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, literal.y, 1439; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) 1440; 1441; CI-LABEL: s_test_imin_slt_imm_i32: 1442; CI: ; %bb.0: 1443; CI-NEXT: s_load_dword s2, s[8:9], 0x2 1444; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1445; CI-NEXT: s_waitcnt 
lgkmcnt(0) 1446; CI-NEXT: s_min_i32 s2, s2, 8 1447; CI-NEXT: v_mov_b32_e32 v0, s0 1448; CI-NEXT: v_mov_b32_e32 v1, s1 1449; CI-NEXT: v_mov_b32_e32 v2, s2 1450; CI-NEXT: flat_store_dword v[0:1], v2 1451; CI-NEXT: s_endpgm 1452; 1453; VI-LABEL: s_test_imin_slt_imm_i32: 1454; VI: ; %bb.0: 1455; VI-NEXT: s_load_dword s2, s[8:9], 0x8 1456; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1457; VI-NEXT: s_waitcnt lgkmcnt(0) 1458; VI-NEXT: s_min_i32 s2, s2, 8 1459; VI-NEXT: v_mov_b32_e32 v0, s0 1460; VI-NEXT: v_mov_b32_e32 v1, s1 1461; VI-NEXT: v_mov_b32_e32 v2, s2 1462; VI-NEXT: flat_store_dword v[0:1], v2 1463; VI-NEXT: s_endpgm 1464; 1465; GFX9-LABEL: s_test_imin_slt_imm_i32: 1466; GFX9: ; %bb.0: 1467; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 1468; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1469; GFX9-NEXT: v_mov_b32_e32 v0, 0 1470; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1471; GFX9-NEXT: s_min_i32 s2, s2, 8 1472; GFX9-NEXT: v_mov_b32_e32 v1, s2 1473; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1474; GFX9-NEXT: s_endpgm 1475; 1476; GFX10-LABEL: s_test_imin_slt_imm_i32: 1477; GFX10: ; %bb.0: 1478; GFX10-NEXT: s_clause 0x1 1479; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 1480; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1481; GFX10-NEXT: v_mov_b32_e32 v0, 0 1482; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1483; GFX10-NEXT: s_min_i32 s2, s2, 8 1484; GFX10-NEXT: v_mov_b32_e32 v1, s2 1485; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1486; GFX10-NEXT: s_endpgm 1487; 1488; GFX11-LABEL: s_test_imin_slt_imm_i32: 1489; GFX11: ; %bb.0: 1490; GFX11-NEXT: s_clause 0x1 1491; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 1492; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1493; GFX11-NEXT: v_mov_b32_e32 v0, 0 1494; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1495; GFX11-NEXT: s_min_i32 s2, s2, 8 1496; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1497; GFX11-NEXT: v_mov_b32_e32 v1, s2 1498; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1499; GFX11-NEXT: s_endpgm 1500 %cmp = icmp slt i32 %a, 8 1501 %val = select i1 %cmp, 
i32 %a, i32 8 1502 store i32 %val, ptr addrspace(1) %out, align 4 1503 ret void 1504} 1505 1506define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a) #0 { 1507; EG-LABEL: s_test_imin_sle_imm_i32: 1508; EG: ; %bb.0: 1509; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 1510; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 1511; EG-NEXT: CF_END 1512; EG-NEXT: PAD 1513; EG-NEXT: ALU clause starting at 4: 1514; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 1515; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, literal.y, 1516; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) 1517; 1518; CI-LABEL: s_test_imin_sle_imm_i32: 1519; CI: ; %bb.0: 1520; CI-NEXT: s_load_dword s2, s[8:9], 0x2 1521; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1522; CI-NEXT: s_waitcnt lgkmcnt(0) 1523; CI-NEXT: s_min_i32 s2, s2, 8 1524; CI-NEXT: v_mov_b32_e32 v0, s0 1525; CI-NEXT: v_mov_b32_e32 v1, s1 1526; CI-NEXT: v_mov_b32_e32 v2, s2 1527; CI-NEXT: flat_store_dword v[0:1], v2 1528; CI-NEXT: s_endpgm 1529; 1530; VI-LABEL: s_test_imin_sle_imm_i32: 1531; VI: ; %bb.0: 1532; VI-NEXT: s_load_dword s2, s[8:9], 0x8 1533; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1534; VI-NEXT: s_waitcnt lgkmcnt(0) 1535; VI-NEXT: s_min_i32 s2, s2, 8 1536; VI-NEXT: v_mov_b32_e32 v0, s0 1537; VI-NEXT: v_mov_b32_e32 v1, s1 1538; VI-NEXT: v_mov_b32_e32 v2, s2 1539; VI-NEXT: flat_store_dword v[0:1], v2 1540; VI-NEXT: s_endpgm 1541; 1542; GFX9-LABEL: s_test_imin_sle_imm_i32: 1543; GFX9: ; %bb.0: 1544; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8 1545; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1546; GFX9-NEXT: v_mov_b32_e32 v0, 0 1547; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1548; GFX9-NEXT: s_min_i32 s2, s2, 8 1549; GFX9-NEXT: v_mov_b32_e32 v1, s2 1550; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1551; GFX9-NEXT: s_endpgm 1552; 1553; GFX10-LABEL: s_test_imin_sle_imm_i32: 1554; GFX10: ; %bb.0: 1555; GFX10-NEXT: s_clause 0x1 1556; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 1557; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 1558; 
GFX10-NEXT: v_mov_b32_e32 v0, 0 1559; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1560; GFX10-NEXT: s_min_i32 s2, s2, 8 1561; GFX10-NEXT: v_mov_b32_e32 v1, s2 1562; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1563; GFX10-NEXT: s_endpgm 1564; 1565; GFX11-LABEL: s_test_imin_sle_imm_i32: 1566; GFX11: ; %bb.0: 1567; GFX11-NEXT: s_clause 0x1 1568; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 1569; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 1570; GFX11-NEXT: v_mov_b32_e32 v0, 0 1571; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1572; GFX11-NEXT: s_min_i32 s2, s2, 8 1573; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1574; GFX11-NEXT: v_mov_b32_e32 v1, s2 1575; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1576; GFX11-NEXT: s_endpgm 1577 %cmp = icmp sle i32 %a, 8 1578 %val = select i1 %cmp, i32 %a, i32 8 1579 store i32 %val, ptr addrspace(1) %out, align 4 1580 ret void 1581} 1582 1583define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { 1584; EG-LABEL: v_test_umin_ule_i32: 1585; EG: ; %bb.0: 1586; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] 1587; EG-NEXT: TEX 1 @6 1588; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[] 1589; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1590; EG-NEXT: CF_END 1591; EG-NEXT: PAD 1592; EG-NEXT: Fetch clause starting at 6: 1593; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 1594; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1595; EG-NEXT: ALU clause starting at 10: 1596; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1597; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1598; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, 1599; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, 1600; EG-NEXT: ALU clause starting at 14: 1601; EG-NEXT: MIN_UINT T0.X, T0.X, T1.X, 1602; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, 1603; EG-NEXT: LSHR * T1.X, PV.W, literal.x, 1604; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1605; 1606; CI-LABEL: v_test_umin_ule_i32: 1607; CI: ; %bb.0: 1608; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1609; CI-NEXT: s_load_dwordx2 s[4:5], 
s[8:9], 0x4 1610; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 1611; CI-NEXT: s_waitcnt lgkmcnt(0) 1612; CI-NEXT: v_mov_b32_e32 v1, s3 1613; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 1614; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1615; CI-NEXT: v_mov_b32_e32 v3, s5 1616; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 1617; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1618; CI-NEXT: flat_load_dword v5, v[0:1] 1619; CI-NEXT: flat_load_dword v2, v[2:3] 1620; CI-NEXT: v_mov_b32_e32 v1, s1 1621; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4 1622; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1623; CI-NEXT: s_waitcnt vmcnt(0) 1624; CI-NEXT: v_min_u32_e32 v2, v5, v2 1625; CI-NEXT: flat_store_dword v[0:1], v2 1626; CI-NEXT: s_endpgm 1627; 1628; VI-LABEL: v_test_umin_ule_i32: 1629; VI: ; %bb.0: 1630; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1631; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1632; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 1633; VI-NEXT: s_waitcnt lgkmcnt(0) 1634; VI-NEXT: v_mov_b32_e32 v1, s3 1635; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 1636; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1637; VI-NEXT: v_mov_b32_e32 v3, s5 1638; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 1639; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1640; VI-NEXT: flat_load_dword v5, v[0:1] 1641; VI-NEXT: flat_load_dword v2, v[2:3] 1642; VI-NEXT: v_mov_b32_e32 v1, s1 1643; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 1644; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1645; VI-NEXT: s_waitcnt vmcnt(0) 1646; VI-NEXT: v_min_u32_e32 v2, v5, v2 1647; VI-NEXT: flat_store_dword v[0:1], v2 1648; VI-NEXT: s_endpgm 1649; 1650; GFX9-LABEL: v_test_umin_ule_i32: 1651; GFX9: ; %bb.0: 1652; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1653; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1654; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1655; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1656; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1657; GFX9-NEXT: global_load_dword v2, v0, s[4:5] 1658; GFX9-NEXT: s_waitcnt vmcnt(0) 1659; GFX9-NEXT: v_min_u32_e32 v1, v1, 
v2 1660; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1661; GFX9-NEXT: s_endpgm 1662; 1663; GFX10-LABEL: v_test_umin_ule_i32: 1664; GFX10: ; %bb.0: 1665; GFX10-NEXT: s_clause 0x1 1666; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1667; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1668; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1669; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1670; GFX10-NEXT: s_clause 0x1 1671; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 1672; GFX10-NEXT: global_load_dword v2, v0, s[4:5] 1673; GFX10-NEXT: s_waitcnt vmcnt(0) 1674; GFX10-NEXT: v_min_u32_e32 v1, v1, v2 1675; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1676; GFX10-NEXT: s_endpgm 1677; 1678; GFX11-LABEL: v_test_umin_ule_i32: 1679; GFX11: ; %bb.0: 1680; GFX11-NEXT: s_clause 0x1 1681; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1682; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 1683; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1684; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1685; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1686; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1687; GFX11-NEXT: s_clause 0x1 1688; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1689; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] 1690; GFX11-NEXT: s_waitcnt vmcnt(0) 1691; GFX11-NEXT: v_min_u32_e32 v1, v1, v2 1692; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1693; GFX11-NEXT: s_endpgm 1694 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1695 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid 1696 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid 1697 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid 1698 %a = load i32, ptr addrspace(1) %a.gep, align 4 1699 %b = load i32, ptr addrspace(1) %b.gep, align 4 1700 %cmp = icmp ule i32 %a, %b 1701 %val = select i1 %cmp, i32 %a, i32 %b 1702 store i32 %val, ptr addrspace(1) %out.gep, align 4 1703 ret void 1704} 1705 1706define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { 
1707; EG-LABEL: v_test_umin_ule_v3i32: 1708; EG: ; %bb.0: 1709; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] 1710; EG-NEXT: TEX 1 @6 1711; EG-NEXT: ALU 9, @14, KC0[CB0:0-32], KC1[] 1712; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0 1713; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1714; EG-NEXT: CF_END 1715; EG-NEXT: Fetch clause starting at 6: 1716; EG-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1 1717; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 0, #1 1718; EG-NEXT: ALU clause starting at 10: 1719; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1720; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1721; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, 1722; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, 1723; EG-NEXT: ALU clause starting at 14: 1724; EG-NEXT: MIN_UINT * T0.Y, T2.Y, T1.Y, 1725; EG-NEXT: MIN_UINT T0.X, T2.X, T1.X, 1726; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, 1727; EG-NEXT: LSHR T1.X, PV.W, literal.x, 1728; EG-NEXT: MIN_UINT * T2.X, T2.Z, T1.Z, 1729; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1730; EG-NEXT: ADD_INT * T0.W, T0.W, literal.x, 1731; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1732; EG-NEXT: LSHR * T3.X, PV.W, literal.x, 1733; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1734; 1735; CI-LABEL: v_test_umin_ule_v3i32: 1736; CI: ; %bb.0: 1737; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1738; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 1739; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 1740; CI-NEXT: s_waitcnt lgkmcnt(0) 1741; CI-NEXT: v_mov_b32_e32 v1, s3 1742; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v6 1743; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1744; CI-NEXT: v_mov_b32_e32 v2, s5 1745; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v6 1746; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc 1747; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1] 1748; CI-NEXT: flat_load_dwordx3 v[3:5], v[3:4] 1749; CI-NEXT: v_mov_b32_e32 v7, s1 1750; CI-NEXT: v_add_i32_e32 v6, vcc, s0, v6 1751; CI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc 1752; CI-NEXT: s_waitcnt vmcnt(0) 1753; CI-NEXT: v_min_u32_e32 v2, v2, v5 1754; 
CI-NEXT: v_min_u32_e32 v1, v1, v4 1755; CI-NEXT: v_min_u32_e32 v0, v0, v3 1756; CI-NEXT: flat_store_dwordx3 v[6:7], v[0:2] 1757; CI-NEXT: s_endpgm 1758; 1759; VI-LABEL: v_test_umin_ule_v3i32: 1760; VI: ; %bb.0: 1761; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1762; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1763; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0 1764; VI-NEXT: s_waitcnt lgkmcnt(0) 1765; VI-NEXT: v_mov_b32_e32 v1, s3 1766; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 1767; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1768; VI-NEXT: v_mov_b32_e32 v2, s5 1769; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v6 1770; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc 1771; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1] 1772; VI-NEXT: flat_load_dwordx3 v[3:5], v[3:4] 1773; VI-NEXT: v_mov_b32_e32 v7, s1 1774; VI-NEXT: v_add_u32_e32 v6, vcc, s0, v6 1775; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc 1776; VI-NEXT: s_waitcnt vmcnt(0) 1777; VI-NEXT: v_min_u32_e32 v2, v2, v5 1778; VI-NEXT: v_min_u32_e32 v1, v1, v4 1779; VI-NEXT: v_min_u32_e32 v0, v0, v3 1780; VI-NEXT: flat_store_dwordx3 v[6:7], v[0:2] 1781; VI-NEXT: s_endpgm 1782; 1783; GFX9-LABEL: v_test_umin_ule_v3i32: 1784; GFX9: ; %bb.0: 1785; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1786; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1787; GFX9-NEXT: v_lshlrev_b32_e32 v6, 4, v0 1788; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1789; GFX9-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3] 1790; GFX9-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5] 1791; GFX9-NEXT: s_waitcnt vmcnt(0) 1792; GFX9-NEXT: v_min_u32_e32 v2, v2, v5 1793; GFX9-NEXT: v_min_u32_e32 v1, v1, v4 1794; GFX9-NEXT: v_min_u32_e32 v0, v0, v3 1795; GFX9-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] 1796; GFX9-NEXT: s_endpgm 1797; 1798; GFX10-LABEL: v_test_umin_ule_v3i32: 1799; GFX10: ; %bb.0: 1800; GFX10-NEXT: s_clause 0x1 1801; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1802; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1803; GFX10-NEXT: v_lshlrev_b32_e32 v6, 4, v0 1804; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) 1805; GFX10-NEXT: s_clause 0x1 1806; GFX10-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3] 1807; GFX10-NEXT: global_load_dwordx3 v[3:5], v6, s[4:5] 1808; GFX10-NEXT: s_waitcnt vmcnt(0) 1809; GFX10-NEXT: v_min_u32_e32 v2, v2, v5 1810; GFX10-NEXT: v_min_u32_e32 v1, v1, v4 1811; GFX10-NEXT: v_min_u32_e32 v0, v0, v3 1812; GFX10-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1] 1813; GFX10-NEXT: s_endpgm 1814; 1815; GFX11-LABEL: v_test_umin_ule_v3i32: 1816; GFX11: ; %bb.0: 1817; GFX11-NEXT: s_clause 0x1 1818; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1819; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 1820; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1821; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1822; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v0 1823; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1824; GFX11-NEXT: s_clause 0x1 1825; GFX11-NEXT: global_load_b96 v[0:2], v6, s[2:3] 1826; GFX11-NEXT: global_load_b96 v[3:5], v6, s[4:5] 1827; GFX11-NEXT: s_waitcnt vmcnt(0) 1828; GFX11-NEXT: v_min_u32_e32 v2, v2, v5 1829; GFX11-NEXT: v_min_u32_e32 v1, v1, v4 1830; GFX11-NEXT: v_min_u32_e32 v0, v0, v3 1831; GFX11-NEXT: global_store_b96 v6, v[0:2], s[0:1] 1832; GFX11-NEXT: s_endpgm 1833 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1834 %a.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %a.ptr, i32 %tid 1835 %b.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %b.ptr, i32 %tid 1836 %out.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %out, i32 %tid 1837 1838 %a = load <3 x i32>, ptr addrspace(1) %a.gep 1839 %b = load <3 x i32>, ptr addrspace(1) %b.gep 1840 %cmp = icmp ule <3 x i32> %a, %b 1841 %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b 1842 store <3 x i32> %val, ptr addrspace(1) %out.gep 1843 ret void 1844} 1845 1846; FIXME: The third <3 x i16> element is computed with a packed op (v_pk_min_u16 on GFX9+) whose high half is unused; reduce it to a scalar min. 1847 1848define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { 1849; EG-LABEL: v_test_umin_ule_v3i16: 1850; EG: 
; %bb.0: 1851; EG-NEXT: ALU 3, @20, KC0[CB0:0-32], KC1[] 1852; EG-NEXT: TEX 1 @8 1853; EG-NEXT: ALU 11, @24, KC0[CB0:0-32], KC1[] 1854; EG-NEXT: TEX 3 @12 1855; EG-NEXT: ALU 8, @36, KC0[], KC1[] 1856; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T8.X, 0 1857; EG-NEXT: MEM_RAT MSKOR T7.XW, T0.X 1858; EG-NEXT: CF_END 1859; EG-NEXT: Fetch clause starting at 8: 1860; EG-NEXT: VTX_READ_16 T7.X, T6.X, 4, #1 1861; EG-NEXT: VTX_READ_16 T8.X, T0.X, 4, #1 1862; EG-NEXT: Fetch clause starting at 12: 1863; EG-NEXT: VTX_READ_16 T8.X, T6.X, 0, #1 1864; EG-NEXT: VTX_READ_16 T9.X, T0.X, 0, #1 1865; EG-NEXT: VTX_READ_16 T6.X, T6.X, 2, #1 1866; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1 1867; EG-NEXT: ALU clause starting at 20: 1868; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1869; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1870; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, 1871; EG-NEXT: ADD_INT * T6.X, KC0[2].W, PV.W, 1872; EG-NEXT: ALU clause starting at 24: 1873; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, 1874; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x, 1875; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 1876; EG-NEXT: AND_INT * T2.W, PV.W, literal.x, 1877; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1878; EG-NEXT: LSHL T2.W, PV.W, literal.x, 1879; EG-NEXT: MIN_UINT * T3.W, T8.X, T7.X, 1880; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1881; EG-NEXT: LSHL T7.X, PS, PV.W, 1882; EG-NEXT: LSHL * T7.W, literal.x, PV.W, 1883; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1884; EG-NEXT: MOV * T7.Y, 0.0, 1885; EG-NEXT: ALU clause starting at 36: 1886; EG-NEXT: MOV T7.Z, 0.0, 1887; EG-NEXT: MIN_UINT * T2.W, T0.X, T6.X, 1888; EG-NEXT: LSHR T0.X, T1.W, literal.x, 1889; EG-NEXT: LSHL T1.W, PV.W, literal.y, 1890; EG-NEXT: MIN_UINT * T2.W, T9.X, T8.X, 1891; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) 1892; EG-NEXT: OR_INT T6.X, PV.W, PS, 1893; EG-NEXT: LSHR * T8.X, T0.W, literal.x, 1894; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1895; 1896; CI-LABEL: v_test_umin_ule_v3i16: 1897; CI: ; %bb.0: 1898; CI-NEXT: s_load_dwordx4 
s[0:3], s[8:9], 0x0 1899; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 1900; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1901; CI-NEXT: s_waitcnt lgkmcnt(0) 1902; CI-NEXT: v_mov_b32_e32 v1, s3 1903; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 1904; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1905; CI-NEXT: v_mov_b32_e32 v3, s5 1906; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 1907; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1908; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1909; CI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1910; CI-NEXT: v_mov_b32_e32 v5, s1 1911; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 1912; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 1913; CI-NEXT: v_add_i32_e32 v6, vcc, 4, v4 1914; CI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc 1915; CI-NEXT: s_waitcnt vmcnt(1) 1916; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 1917; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 1918; CI-NEXT: s_waitcnt vmcnt(0) 1919; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 1920; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 1921; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 1922; CI-NEXT: v_and_b32_e32 v3, 0xffff, v3 1923; CI-NEXT: v_min_u32_e32 v0, v0, v2 1924; CI-NEXT: v_min_u32_e32 v2, v8, v9 1925; CI-NEXT: v_min_u32_e32 v1, v1, v3 1926; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1927; CI-NEXT: v_or_b32_e32 v0, v0, v2 1928; CI-NEXT: flat_store_short v[6:7], v1 1929; CI-NEXT: flat_store_dword v[4:5], v0 1930; CI-NEXT: s_endpgm 1931; 1932; VI-LABEL: v_test_umin_ule_v3i16: 1933; VI: ; %bb.0: 1934; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1935; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1936; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1937; VI-NEXT: s_waitcnt lgkmcnt(0) 1938; VI-NEXT: v_mov_b32_e32 v1, s3 1939; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 1940; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1941; VI-NEXT: v_mov_b32_e32 v3, s5 1942; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 1943; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1944; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1945; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1946; VI-NEXT: 
v_mov_b32_e32 v5, s1 1947; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 1948; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 1949; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v4 1950; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc 1951; VI-NEXT: s_waitcnt vmcnt(0) 1952; VI-NEXT: v_min_u16_e32 v8, v0, v2 1953; VI-NEXT: v_min_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1954; VI-NEXT: v_min_u16_e32 v1, v1, v3 1955; VI-NEXT: v_or_b32_e32 v0, v8, v0 1956; VI-NEXT: flat_store_short v[6:7], v1 1957; VI-NEXT: flat_store_dword v[4:5], v0 1958; VI-NEXT: s_endpgm 1959; 1960; GFX9-LABEL: v_test_umin_ule_v3i16: 1961; GFX9: ; %bb.0: 1962; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1963; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1964; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1965; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1966; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] 1967; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] 1968; GFX9-NEXT: s_waitcnt vmcnt(0) 1969; GFX9-NEXT: v_pk_min_u16 v1, v1, v3 1970; GFX9-NEXT: v_pk_min_u16 v0, v0, v2 1971; GFX9-NEXT: global_store_short v4, v1, s[0:1] offset:4 1972; GFX9-NEXT: global_store_dword v4, v0, s[0:1] 1973; GFX9-NEXT: s_endpgm 1974; 1975; GFX10-LABEL: v_test_umin_ule_v3i16: 1976; GFX10: ; %bb.0: 1977; GFX10-NEXT: s_clause 0x1 1978; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1979; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 1980; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1981; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1982; GFX10-NEXT: s_clause 0x1 1983; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] 1984; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] 1985; GFX10-NEXT: s_waitcnt vmcnt(0) 1986; GFX10-NEXT: v_pk_min_u16 v1, v1, v3 1987; GFX10-NEXT: v_pk_min_u16 v0, v0, v2 1988; GFX10-NEXT: global_store_short v4, v1, s[0:1] offset:4 1989; GFX10-NEXT: global_store_dword v4, v0, s[0:1] 1990; GFX10-NEXT: s_endpgm 1991; 1992; GFX11-LABEL: v_test_umin_ule_v3i16: 1993; GFX11: ; %bb.0: 1994; GFX11-NEXT: 
s_clause 0x1 1995; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1996; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 1997; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1998; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1999; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 2000; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2001; GFX11-NEXT: s_clause 0x1 2002; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] 2003; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5] 2004; GFX11-NEXT: s_waitcnt vmcnt(0) 2005; GFX11-NEXT: v_pk_min_u16 v1, v1, v3 2006; GFX11-NEXT: v_pk_min_u16 v0, v0, v2 2007; GFX11-NEXT: s_clause 0x1 2008; GFX11-NEXT: global_store_b16 v4, v1, s[0:1] offset:4 2009; GFX11-NEXT: global_store_b32 v4, v0, s[0:1] 2010; GFX11-NEXT: s_endpgm 2011 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2012 %a.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %a.ptr, i32 %tid 2013 %b.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %b.ptr, i32 %tid 2014 %out.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %out, i32 %tid 2015 2016 %a = load <3 x i16>, ptr addrspace(1) %a.gep 2017 %b = load <3 x i16>, ptr addrspace(1) %b.gep 2018 %cmp = icmp ule <3 x i16> %a, %b 2019 %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b 2020 store <3 x i16> %val, ptr addrspace(1) %out.gep 2021 ret void 2022} 2023 2024define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { 2025; EG-LABEL: s_test_umin_ule_i32: 2026; EG: ; %bb.0: 2027; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 2028; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 2029; EG-NEXT: CF_END 2030; EG-NEXT: PAD 2031; EG-NEXT: ALU clause starting at 4: 2032; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 2033; EG-NEXT: MIN_UINT * T1.X, KC0[2].Z, KC0[2].W, 2034; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2035; 2036; CI-LABEL: s_test_umin_ule_i32: 2037; CI: ; %bb.0: 2038; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2039; CI-NEXT: s_waitcnt lgkmcnt(0) 2040; CI-NEXT: s_min_u32 s2, s2, s3 2041; CI-NEXT: v_mov_b32_e32 v0, 
s0 2042; CI-NEXT: v_mov_b32_e32 v1, s1 2043; CI-NEXT: v_mov_b32_e32 v2, s2 2044; CI-NEXT: flat_store_dword v[0:1], v2 2045; CI-NEXT: s_endpgm 2046; 2047; VI-LABEL: s_test_umin_ule_i32: 2048; VI: ; %bb.0: 2049; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2050; VI-NEXT: s_waitcnt lgkmcnt(0) 2051; VI-NEXT: s_min_u32 s2, s2, s3 2052; VI-NEXT: v_mov_b32_e32 v0, s0 2053; VI-NEXT: v_mov_b32_e32 v1, s1 2054; VI-NEXT: v_mov_b32_e32 v2, s2 2055; VI-NEXT: flat_store_dword v[0:1], v2 2056; VI-NEXT: s_endpgm 2057; 2058; GFX9-LABEL: s_test_umin_ule_i32: 2059; GFX9: ; %bb.0: 2060; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2061; GFX9-NEXT: v_mov_b32_e32 v0, 0 2062; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2063; GFX9-NEXT: s_min_u32 s2, s2, s3 2064; GFX9-NEXT: v_mov_b32_e32 v1, s2 2065; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2066; GFX9-NEXT: s_endpgm 2067; 2068; GFX10-LABEL: s_test_umin_ule_i32: 2069; GFX10: ; %bb.0: 2070; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2071; GFX10-NEXT: v_mov_b32_e32 v0, 0 2072; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2073; GFX10-NEXT: s_min_u32 s2, s2, s3 2074; GFX10-NEXT: v_mov_b32_e32 v1, s2 2075; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 2076; GFX10-NEXT: s_endpgm 2077; 2078; GFX11-LABEL: s_test_umin_ule_i32: 2079; GFX11: ; %bb.0: 2080; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2081; GFX11-NEXT: v_mov_b32_e32 v0, 0 2082; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2083; GFX11-NEXT: s_min_u32 s2, s2, s3 2084; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2085; GFX11-NEXT: v_mov_b32_e32 v1, s2 2086; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2087; GFX11-NEXT: s_endpgm 2088 %cmp = icmp ule i32 %a, %b 2089 %val = select i1 %cmp, i32 %a, i32 %b 2090 store i32 %val, ptr addrspace(1) %out, align 4 2091 ret void 2092} 2093 2094define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { 2095; EG-LABEL: v_test_umin_ult_i32: 2096; EG: ; %bb.0: 2097; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] 2098; 
EG-NEXT: TEX 1 @6 2099; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[] 2100; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 2101; EG-NEXT: CF_END 2102; EG-NEXT: PAD 2103; EG-NEXT: Fetch clause starting at 6: 2104; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 2105; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 2106; EG-NEXT: ALU clause starting at 10: 2107; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 2108; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2109; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, 2110; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, 2111; EG-NEXT: ALU clause starting at 14: 2112; EG-NEXT: MIN_UINT T0.X, T0.X, T1.X, 2113; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, 2114; EG-NEXT: LSHR * T1.X, PV.W, literal.x, 2115; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2116; 2117; CI-LABEL: v_test_umin_ult_i32: 2118; CI: ; %bb.0: 2119; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2120; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 2121; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 2122; CI-NEXT: s_waitcnt lgkmcnt(0) 2123; CI-NEXT: v_mov_b32_e32 v1, s3 2124; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 2125; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2126; CI-NEXT: v_mov_b32_e32 v3, s5 2127; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 2128; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2129; CI-NEXT: flat_load_dword v5, v[0:1] 2130; CI-NEXT: flat_load_dword v2, v[2:3] 2131; CI-NEXT: v_mov_b32_e32 v1, s1 2132; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4 2133; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2134; CI-NEXT: s_waitcnt vmcnt(0) 2135; CI-NEXT: v_min_u32_e32 v2, v5, v2 2136; CI-NEXT: flat_store_dword v[0:1], v2 2137; CI-NEXT: s_endpgm 2138; 2139; VI-LABEL: v_test_umin_ult_i32: 2140; VI: ; %bb.0: 2141; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2142; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 2143; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 2144; VI-NEXT: s_waitcnt lgkmcnt(0) 2145; VI-NEXT: v_mov_b32_e32 v1, s3 2146; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 2147; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2148; VI-NEXT: 
v_mov_b32_e32 v3, s5 2149; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 2150; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2151; VI-NEXT: flat_load_dword v5, v[0:1] 2152; VI-NEXT: flat_load_dword v2, v[2:3] 2153; VI-NEXT: v_mov_b32_e32 v1, s1 2154; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 2155; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2156; VI-NEXT: s_waitcnt vmcnt(0) 2157; VI-NEXT: v_min_u32_e32 v2, v5, v2 2158; VI-NEXT: flat_store_dword v[0:1], v2 2159; VI-NEXT: s_endpgm 2160; 2161; GFX9-LABEL: v_test_umin_ult_i32: 2162; GFX9: ; %bb.0: 2163; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2164; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 2165; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2166; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2167; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2168; GFX9-NEXT: global_load_dword v2, v0, s[4:5] 2169; GFX9-NEXT: s_waitcnt vmcnt(0) 2170; GFX9-NEXT: v_min_u32_e32 v1, v1, v2 2171; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2172; GFX9-NEXT: s_endpgm 2173; 2174; GFX10-LABEL: v_test_umin_ult_i32: 2175; GFX10: ; %bb.0: 2176; GFX10-NEXT: s_clause 0x1 2177; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2178; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 2179; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2180; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2181; GFX10-NEXT: s_clause 0x1 2182; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 2183; GFX10-NEXT: global_load_dword v2, v0, s[4:5] 2184; GFX10-NEXT: s_waitcnt vmcnt(0) 2185; GFX10-NEXT: v_min_u32_e32 v1, v1, v2 2186; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 2187; GFX10-NEXT: s_endpgm 2188; 2189; GFX11-LABEL: v_test_umin_ult_i32: 2190; GFX11: ; %bb.0: 2191; GFX11-NEXT: s_clause 0x1 2192; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2193; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 2194; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2195; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2196; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2197; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2198; GFX11-NEXT: s_clause 0x1 2199; GFX11-NEXT: 
global_load_b32 v1, v0, s[2:3] 2200; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] 2201; GFX11-NEXT: s_waitcnt vmcnt(0) 2202; GFX11-NEXT: v_min_u32_e32 v1, v1, v2 2203; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2204; GFX11-NEXT: s_endpgm 2205 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2206 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid 2207 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid 2208 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid 2209 %a = load i32, ptr addrspace(1) %a.gep, align 4 2210 %b = load i32, ptr addrspace(1) %b.gep, align 4 2211 %cmp = icmp ult i32 %a, %b 2212 %val = select i1 %cmp, i32 %a, i32 %b 2213 store i32 %val, ptr addrspace(1) %out.gep, align 4 2214 ret void 2215} 2216 2217define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { 2218; EG-LABEL: v_test_umin_ult_i8: 2219; EG: ; %bb.0: 2220; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] 2221; EG-NEXT: TEX 1 @6 2222; EG-NEXT: ALU 12, @12, KC0[CB0:0-32], KC1[] 2223; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X 2224; EG-NEXT: CF_END 2225; EG-NEXT: PAD 2226; EG-NEXT: Fetch clause starting at 6: 2227; EG-NEXT: VTX_READ_8 T2.X, T2.X, 0, #1 2228; EG-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1 2229; EG-NEXT: ALU clause starting at 10: 2230; EG-NEXT: ADD_INT T1.X, KC0[2].Z, T0.X, 2231; EG-NEXT: ADD_INT * T2.X, KC0[2].W, T0.X, 2232; EG-NEXT: ALU clause starting at 12: 2233; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.X, 2234; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 2235; EG-NEXT: MIN_UINT * T2.W, T1.X, T2.X, 2236; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2237; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 2238; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2239; EG-NEXT: LSHL T1.X, T2.W, PV.W, 2240; EG-NEXT: LSHL * T1.W, literal.x, PV.W, 2241; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2242; EG-NEXT: MOV T1.Y, 0.0, 2243; EG-NEXT: MOV * T1.Z, 0.0, 2244; EG-NEXT: LSHR * T0.X, T0.W, literal.x, 2245; 
EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2246; 2247; CI-LABEL: v_test_umin_ult_i8: 2248; CI: ; %bb.0: 2249; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2250; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4 2251; CI-NEXT: s_waitcnt lgkmcnt(0) 2252; CI-NEXT: v_mov_b32_e32 v2, s3 2253; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0 2254; CI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 2255; CI-NEXT: v_mov_b32_e32 v4, s5 2256; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v0 2257; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 2258; CI-NEXT: flat_load_ubyte v2, v[1:2] 2259; CI-NEXT: flat_load_ubyte v3, v[3:4] 2260; CI-NEXT: v_mov_b32_e32 v1, s1 2261; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 2262; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2263; CI-NEXT: s_waitcnt vmcnt(0) 2264; CI-NEXT: v_min_u32_e32 v2, v2, v3 2265; CI-NEXT: flat_store_byte v[0:1], v2 2266; CI-NEXT: s_endpgm 2267; 2268; VI-LABEL: v_test_umin_ult_i8: 2269; VI: ; %bb.0: 2270; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2271; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 2272; VI-NEXT: s_waitcnt lgkmcnt(0) 2273; VI-NEXT: v_mov_b32_e32 v2, s3 2274; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0 2275; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 2276; VI-NEXT: v_mov_b32_e32 v4, s5 2277; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v0 2278; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 2279; VI-NEXT: flat_load_ubyte v2, v[1:2] 2280; VI-NEXT: flat_load_ubyte v3, v[3:4] 2281; VI-NEXT: v_mov_b32_e32 v1, s1 2282; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 2283; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2284; VI-NEXT: s_waitcnt vmcnt(0) 2285; VI-NEXT: v_min_u16_e32 v2, v2, v3 2286; VI-NEXT: flat_store_byte v[0:1], v2 2287; VI-NEXT: s_endpgm 2288; 2289; GFX9-LABEL: v_test_umin_ult_i8: 2290; GFX9: ; %bb.0: 2291; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2292; GFX9-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 2293; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2294; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] 2295; GFX9-NEXT: global_load_ubyte v2, v0, s[4:5] 2296; GFX9-NEXT: 
s_waitcnt vmcnt(0) 2297; GFX9-NEXT: v_min_u16_e32 v1, v1, v2 2298; GFX9-NEXT: global_store_byte v0, v1, s[0:1] 2299; GFX9-NEXT: s_endpgm 2300; 2301; GFX10-LABEL: v_test_umin_ult_i8: 2302; GFX10: ; %bb.0: 2303; GFX10-NEXT: s_clause 0x1 2304; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2305; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10 2306; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2307; GFX10-NEXT: s_clause 0x1 2308; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] 2309; GFX10-NEXT: global_load_ubyte v2, v0, s[4:5] 2310; GFX10-NEXT: s_waitcnt vmcnt(0) 2311; GFX10-NEXT: v_min_u16 v1, v1, v2 2312; GFX10-NEXT: global_store_byte v0, v1, s[0:1] 2313; GFX10-NEXT: s_endpgm 2314; 2315; GFX11-LABEL: v_test_umin_ult_i8: 2316; GFX11: ; %bb.0: 2317; GFX11-NEXT: s_clause 0x1 2318; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2319; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 2320; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2321; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2322; GFX11-NEXT: s_clause 0x1 2323; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] 2324; GFX11-NEXT: global_load_u8 v2, v0, s[4:5] 2325; GFX11-NEXT: s_waitcnt vmcnt(0) 2326; GFX11-NEXT: v_min_u16 v1, v1, v2 2327; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] 2328; GFX11-NEXT: s_endpgm 2329 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2330 %a.gep = getelementptr inbounds i8, ptr addrspace(1) %a.ptr, i32 %tid 2331 %b.gep = getelementptr inbounds i8, ptr addrspace(1) %b.ptr, i32 %tid 2332 %out.gep = getelementptr inbounds i8, ptr addrspace(1) %out, i32 %tid 2333 2334 %a = load i8, ptr addrspace(1) %a.gep, align 1 2335 %b = load i8, ptr addrspace(1) %b.gep, align 1 2336 %cmp = icmp ult i8 %a, %b 2337 %val = select i1 %cmp, i8 %a, i8 %b 2338 store i8 %val, ptr addrspace(1) %out.gep, align 1 2339 ret void 2340} 2341 2342define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { 2343; EG-LABEL: s_test_umin_ult_i32: 2344; EG: ; %bb.0: 2345; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 2346; EG-NEXT: MEM_RAT_CACHELESS 
STORE_RAW T1.X, T0.X, 1 2347; EG-NEXT: CF_END 2348; EG-NEXT: PAD 2349; EG-NEXT: ALU clause starting at 4: 2350; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 2351; EG-NEXT: MIN_UINT * T1.X, KC0[2].Z, KC0[2].W, 2352; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2353; 2354; CI-LABEL: s_test_umin_ult_i32: 2355; CI: ; %bb.0: 2356; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2357; CI-NEXT: s_waitcnt lgkmcnt(0) 2358; CI-NEXT: s_min_u32 s2, s2, s3 2359; CI-NEXT: v_mov_b32_e32 v0, s0 2360; CI-NEXT: v_mov_b32_e32 v1, s1 2361; CI-NEXT: v_mov_b32_e32 v2, s2 2362; CI-NEXT: flat_store_dword v[0:1], v2 2363; CI-NEXT: s_endpgm 2364; 2365; VI-LABEL: s_test_umin_ult_i32: 2366; VI: ; %bb.0: 2367; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2368; VI-NEXT: s_waitcnt lgkmcnt(0) 2369; VI-NEXT: s_min_u32 s2, s2, s3 2370; VI-NEXT: v_mov_b32_e32 v0, s0 2371; VI-NEXT: v_mov_b32_e32 v1, s1 2372; VI-NEXT: v_mov_b32_e32 v2, s2 2373; VI-NEXT: flat_store_dword v[0:1], v2 2374; VI-NEXT: s_endpgm 2375; 2376; GFX9-LABEL: s_test_umin_ult_i32: 2377; GFX9: ; %bb.0: 2378; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2379; GFX9-NEXT: v_mov_b32_e32 v0, 0 2380; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2381; GFX9-NEXT: s_min_u32 s2, s2, s3 2382; GFX9-NEXT: v_mov_b32_e32 v1, s2 2383; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2384; GFX9-NEXT: s_endpgm 2385; 2386; GFX10-LABEL: s_test_umin_ult_i32: 2387; GFX10: ; %bb.0: 2388; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2389; GFX10-NEXT: v_mov_b32_e32 v0, 0 2390; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2391; GFX10-NEXT: s_min_u32 s2, s2, s3 2392; GFX10-NEXT: v_mov_b32_e32 v1, s2 2393; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 2394; GFX10-NEXT: s_endpgm 2395; 2396; GFX11-LABEL: s_test_umin_ult_i32: 2397; GFX11: ; %bb.0: 2398; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2399; GFX11-NEXT: v_mov_b32_e32 v0, 0 2400; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2401; GFX11-NEXT: s_min_u32 s2, s2, s3 2402; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2403; GFX11-NEXT: v_mov_b32_e32 v1, s2 
2404; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2405; GFX11-NEXT: s_endpgm 2406 %cmp = icmp ult i32 %a, %b 2407 %val = select i1 %cmp, i32 %a, i32 %b 2408 store i32 %val, ptr addrspace(1) %out, align 4 2409 ret void 2410} 2411 2412define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 { 2413; EG-LABEL: v_test_umin_ult_i32_multi_use: 2414; EG: ; %bb.0: 2415; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] 2416; EG-NEXT: TEX 1 @6 2417; EG-NEXT: ALU 16, @12, KC0[CB0:0-32], KC1[] 2418; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 0 2419; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X 2420; EG-NEXT: CF_END 2421; EG-NEXT: Fetch clause starting at 6: 2422; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 2423; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 2424; EG-NEXT: ALU clause starting at 10: 2425; EG-NEXT: MOV T0.X, KC0[2].W, 2426; EG-NEXT: MOV * T1.X, KC0[3].X, 2427; EG-NEXT: ALU clause starting at 12: 2428; EG-NEXT: AND_INT T0.W, KC0[2].Z, literal.x, 2429; EG-NEXT: SETGT_UINT * T1.W, T1.X, T0.X, 2430; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2431; EG-NEXT: AND_INT T1.W, PS, 1, 2432; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2433; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2434; EG-NEXT: LSHL T2.X, PV.W, PS, 2435; EG-NEXT: LSHL * T2.W, literal.x, PS, 2436; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2437; EG-NEXT: MOV T2.Y, 0.0, 2438; EG-NEXT: MOV * T2.Z, 0.0, 2439; EG-NEXT: LSHR T3.X, KC0[2].Z, literal.x, 2440; EG-NEXT: SETGE_UINT * T0.W, T0.X, T1.X, 2441; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2442; EG-NEXT: CNDE_INT T0.X, PV.W, T0.X, T1.X, 2443; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2444; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2445; 2446; CI-LABEL: v_test_umin_ult_i32_multi_use: 2447; CI: ; %bb.0: 2448; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 2449; CI-NEXT: s_waitcnt lgkmcnt(0) 2450; CI-NEXT: s_load_dword s4, s[4:5], 0x0 2451; CI-NEXT: s_load_dword s5, s[6:7], 0x0 2452; CI-NEXT: 
v_mov_b32_e32 v0, s0 2453; CI-NEXT: v_mov_b32_e32 v1, s1 2454; CI-NEXT: v_mov_b32_e32 v2, s2 2455; CI-NEXT: v_mov_b32_e32 v3, s3 2456; CI-NEXT: s_waitcnt lgkmcnt(0) 2457; CI-NEXT: s_cmp_lt_u32 s4, s5 2458; CI-NEXT: s_cselect_b64 s[0:1], -1, 0 2459; CI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] 2460; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec 2461; CI-NEXT: s_cselect_b32 s0, s4, s5 2462; CI-NEXT: v_mov_b32_e32 v5, s0 2463; CI-NEXT: flat_store_dword v[0:1], v5 2464; CI-NEXT: flat_store_byte v[2:3], v4 2465; CI-NEXT: s_endpgm 2466; 2467; VI-LABEL: v_test_umin_ult_i32_multi_use: 2468; VI: ; %bb.0: 2469; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 2470; VI-NEXT: s_waitcnt lgkmcnt(0) 2471; VI-NEXT: s_load_dword s4, s[4:5], 0x0 2472; VI-NEXT: s_load_dword s5, s[6:7], 0x0 2473; VI-NEXT: v_mov_b32_e32 v0, s0 2474; VI-NEXT: v_mov_b32_e32 v1, s1 2475; VI-NEXT: v_mov_b32_e32 v2, s2 2476; VI-NEXT: v_mov_b32_e32 v3, s3 2477; VI-NEXT: s_waitcnt lgkmcnt(0) 2478; VI-NEXT: s_cmp_lt_u32 s4, s5 2479; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 2480; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] 2481; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec 2482; VI-NEXT: s_cselect_b32 s0, s4, s5 2483; VI-NEXT: v_mov_b32_e32 v5, s0 2484; VI-NEXT: flat_store_dword v[0:1], v5 2485; VI-NEXT: flat_store_byte v[2:3], v4 2486; VI-NEXT: s_endpgm 2487; 2488; GFX9-LABEL: v_test_umin_ult_i32_multi_use: 2489; GFX9: ; %bb.0: 2490; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 2491; GFX9-NEXT: v_mov_b32_e32 v0, 0 2492; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2493; GFX9-NEXT: s_load_dword s8, s[4:5], 0x0 2494; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0 2495; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2496; GFX9-NEXT: s_cmp_lt_u32 s8, s9 2497; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 2498; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] 2499; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec 2500; GFX9-NEXT: s_cselect_b32 s4, s8, s9 2501; GFX9-NEXT: v_mov_b32_e32 v2, s4 2502; GFX9-NEXT: global_store_dword v0, v2, s[0:1] 2503; GFX9-NEXT: global_store_byte v0, v1, 
s[2:3] 2504; GFX9-NEXT: s_endpgm 2505; 2506; GFX10-LABEL: v_test_umin_ult_i32_multi_use: 2507; GFX10: ; %bb.0: 2508; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 2509; GFX10-NEXT: v_mov_b32_e32 v1, 0 2510; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2511; GFX10-NEXT: s_load_dword s8, s[4:5], 0x0 2512; GFX10-NEXT: s_load_dword s9, s[6:7], 0x0 2513; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2514; GFX10-NEXT: s_cmp_lt_u32 s8, s9 2515; GFX10-NEXT: s_cselect_b32 s4, -1, 0 2516; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 2517; GFX10-NEXT: s_and_b32 s4, s4, exec_lo 2518; GFX10-NEXT: s_cselect_b32 s4, s8, s9 2519; GFX10-NEXT: v_mov_b32_e32 v2, s4 2520; GFX10-NEXT: global_store_dword v1, v2, s[0:1] 2521; GFX10-NEXT: global_store_byte v1, v0, s[2:3] 2522; GFX10-NEXT: s_endpgm 2523; 2524; GFX11-LABEL: v_test_umin_ult_i32_multi_use: 2525; GFX11: ; %bb.0: 2526; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x0 2527; GFX11-NEXT: v_mov_b32_e32 v1, 0 2528; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2529; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 2530; GFX11-NEXT: s_load_b32 s5, s[6:7], 0x0 2531; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2532; GFX11-NEXT: s_cmp_lt_u32 s4, s5 2533; GFX11-NEXT: s_cselect_b32 s6, -1, 0 2534; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) 2535; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6 2536; GFX11-NEXT: s_and_b32 s6, s6, exec_lo 2537; GFX11-NEXT: s_cselect_b32 s4, s4, s5 2538; GFX11-NEXT: v_mov_b32_e32 v2, s4 2539; GFX11-NEXT: s_clause 0x1 2540; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] 2541; GFX11-NEXT: global_store_b8 v1, v0, s[2:3] 2542; GFX11-NEXT: s_endpgm 2543 %a = load i32, ptr addrspace(1) %aptr, align 4 2544 %b = load i32, ptr addrspace(1) %bptr, align 4 2545 %cmp = icmp ult i32 %a, %b 2546 %val = select i1 %cmp, i32 %a, i32 %b 2547 store i32 %val, ptr addrspace(1) %out0, align 4 2548 store i1 %cmp, ptr addrspace(1) %out1 2549 ret void 2550} 2551 2552define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ptr 
addrspace(1) %out1, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 { 2553; EG-LABEL: v_test_umin_ult_i16_multi_use: 2554; EG: ; %bb.0: 2555; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] 2556; EG-NEXT: TEX 1 @6 2557; EG-NEXT: ALU 24, @12, KC0[CB0:0-32], KC1[] 2558; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X 2559; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 2560; EG-NEXT: CF_END 2561; EG-NEXT: Fetch clause starting at 6: 2562; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1 2563; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 2564; EG-NEXT: ALU clause starting at 10: 2565; EG-NEXT: MOV T0.X, KC0[2].W, 2566; EG-NEXT: MOV * T1.X, KC0[3].X, 2567; EG-NEXT: ALU clause starting at 12: 2568; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, 2569; EG-NEXT: SETGE_UINT * T1.W, T0.X, T1.X, 2570; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2571; EG-NEXT: CNDE_INT T1.W, PS, T0.X, T1.X, 2572; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2573; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2574; EG-NEXT: LSHL T2.X, PV.W, PS, 2575; EG-NEXT: LSHL * T2.W, literal.x, PS, 2576; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2577; EG-NEXT: MOV T2.Y, 0.0, 2578; EG-NEXT: AND_INT T0.W, KC0[2].Z, literal.x, 2579; EG-NEXT: SETGT_UINT * T1.W, T1.X, T0.X, 2580; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2581; EG-NEXT: AND_INT T1.W, PS, 1, 2582; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 2583; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2584; EG-NEXT: LSHL T0.X, PV.W, PS, 2585; EG-NEXT: LSHL * T0.W, literal.x, PS, 2586; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2587; EG-NEXT: MOV T0.Y, 0.0, 2588; EG-NEXT: MOV T2.Z, 0.0, 2589; EG-NEXT: MOV * T0.Z, 0.0, 2590; EG-NEXT: LSHR T1.X, KC0[2].Z, literal.x, 2591; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 2592; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2593; 2594; CI-LABEL: v_test_umin_ult_i16_multi_use: 2595; CI: ; %bb.0: 2596; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 2597; CI-NEXT: s_waitcnt lgkmcnt(0) 2598; CI-NEXT: v_mov_b32_e32 v0, s4 2599; CI-NEXT: v_mov_b32_e32 v1, s5 2600; CI-NEXT: v_mov_b32_e32 v2, 
s6 2601; CI-NEXT: v_mov_b32_e32 v3, s7 2602; CI-NEXT: flat_load_ushort v4, v[0:1] 2603; CI-NEXT: flat_load_ushort v5, v[2:3] 2604; CI-NEXT: v_mov_b32_e32 v0, s0 2605; CI-NEXT: v_mov_b32_e32 v1, s1 2606; CI-NEXT: v_mov_b32_e32 v2, s2 2607; CI-NEXT: v_mov_b32_e32 v3, s3 2608; CI-NEXT: s_waitcnt vmcnt(0) 2609; CI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v5 2610; CI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc 2611; CI-NEXT: flat_store_short v[0:1], v4 2612; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 2613; CI-NEXT: flat_store_byte v[2:3], v0 2614; CI-NEXT: s_endpgm 2615; 2616; VI-LABEL: v_test_umin_ult_i16_multi_use: 2617; VI: ; %bb.0: 2618; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 2619; VI-NEXT: s_waitcnt lgkmcnt(0) 2620; VI-NEXT: v_mov_b32_e32 v0, s4 2621; VI-NEXT: v_mov_b32_e32 v1, s5 2622; VI-NEXT: v_mov_b32_e32 v2, s6 2623; VI-NEXT: v_mov_b32_e32 v3, s7 2624; VI-NEXT: flat_load_ushort v4, v[0:1] 2625; VI-NEXT: flat_load_ushort v5, v[2:3] 2626; VI-NEXT: v_mov_b32_e32 v0, s0 2627; VI-NEXT: v_mov_b32_e32 v1, s1 2628; VI-NEXT: v_mov_b32_e32 v2, s2 2629; VI-NEXT: v_mov_b32_e32 v3, s3 2630; VI-NEXT: s_waitcnt vmcnt(0) 2631; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v5 2632; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc 2633; VI-NEXT: flat_store_short v[0:1], v4 2634; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 2635; VI-NEXT: flat_store_byte v[2:3], v0 2636; VI-NEXT: s_endpgm 2637; 2638; GFX9-LABEL: v_test_umin_ult_i16_multi_use: 2639; GFX9: ; %bb.0: 2640; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 2641; GFX9-NEXT: v_mov_b32_e32 v0, 0 2642; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2643; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] 2644; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] 2645; GFX9-NEXT: s_waitcnt vmcnt(0) 2646; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 2647; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 2648; GFX9-NEXT: global_store_short v0, v1, s[0:1] 2649; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 2650; GFX9-NEXT: global_store_byte v0, v1, s[2:3] 2651; GFX9-NEXT: s_endpgm 2652; 2653; 
GFX10-LABEL: v_test_umin_ult_i16_multi_use: 2654; GFX10: ; %bb.0: 2655; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0 2656; GFX10-NEXT: v_mov_b32_e32 v0, 0 2657; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2658; GFX10-NEXT: s_clause 0x1 2659; GFX10-NEXT: global_load_ushort v1, v0, s[4:5] 2660; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] 2661; GFX10-NEXT: s_waitcnt vmcnt(0) 2662; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2 2663; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo 2664; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo 2665; GFX10-NEXT: global_store_short v0, v1, s[0:1] 2666; GFX10-NEXT: global_store_byte v0, v2, s[2:3] 2667; GFX10-NEXT: s_endpgm 2668; 2669; GFX11-LABEL: v_test_umin_ult_i16_multi_use: 2670; GFX11: ; %bb.0: 2671; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x0 2672; GFX11-NEXT: v_mov_b32_e32 v0, 0 2673; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2674; GFX11-NEXT: s_clause 0x1 2675; GFX11-NEXT: global_load_u16 v1, v0, s[4:5] 2676; GFX11-NEXT: global_load_u16 v2, v0, s[6:7] 2677; GFX11-NEXT: s_waitcnt vmcnt(0) 2678; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2 2679; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo 2680; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo 2681; GFX11-NEXT: s_clause 0x1 2682; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 2683; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] 2684; GFX11-NEXT: s_endpgm 2685 %a = load i16, ptr addrspace(1) %aptr, align 2 2686 %b = load i16, ptr addrspace(1) %bptr, align 2 2687 %cmp = icmp ult i16 %a, %b 2688 %val = select i1 %cmp, i16 %a, i16 %b 2689 store i16 %val, ptr addrspace(1) %out0, align 2 2690 store i1 %cmp, ptr addrspace(1) %out1 2691 ret void 2692} 2693 2694define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) #0 { 2695; EG-LABEL: s_test_umin_ult_v1i32: 2696; EG: ; %bb.0: 2697; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 2698; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 2699; EG-NEXT: CF_END 2700; EG-NEXT: PAD 2701; EG-NEXT: ALU clause starting at 4: 
2702; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 2703; EG-NEXT: MIN_UINT * T1.X, KC0[2].Z, KC0[2].W, 2704; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2705; 2706; CI-LABEL: s_test_umin_ult_v1i32: 2707; CI: ; %bb.0: 2708; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2709; CI-NEXT: s_waitcnt lgkmcnt(0) 2710; CI-NEXT: s_min_u32 s2, s2, s3 2711; CI-NEXT: v_mov_b32_e32 v0, s0 2712; CI-NEXT: v_mov_b32_e32 v1, s1 2713; CI-NEXT: v_mov_b32_e32 v2, s2 2714; CI-NEXT: flat_store_dword v[0:1], v2 2715; CI-NEXT: s_endpgm 2716; 2717; VI-LABEL: s_test_umin_ult_v1i32: 2718; VI: ; %bb.0: 2719; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2720; VI-NEXT: s_waitcnt lgkmcnt(0) 2721; VI-NEXT: s_min_u32 s2, s2, s3 2722; VI-NEXT: v_mov_b32_e32 v0, s0 2723; VI-NEXT: v_mov_b32_e32 v1, s1 2724; VI-NEXT: v_mov_b32_e32 v2, s2 2725; VI-NEXT: flat_store_dword v[0:1], v2 2726; VI-NEXT: s_endpgm 2727; 2728; GFX9-LABEL: s_test_umin_ult_v1i32: 2729; GFX9: ; %bb.0: 2730; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2731; GFX9-NEXT: v_mov_b32_e32 v0, 0 2732; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2733; GFX9-NEXT: s_min_u32 s2, s2, s3 2734; GFX9-NEXT: v_mov_b32_e32 v1, s2 2735; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2736; GFX9-NEXT: s_endpgm 2737; 2738; GFX10-LABEL: s_test_umin_ult_v1i32: 2739; GFX10: ; %bb.0: 2740; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2741; GFX10-NEXT: v_mov_b32_e32 v0, 0 2742; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2743; GFX10-NEXT: s_min_u32 s2, s2, s3 2744; GFX10-NEXT: v_mov_b32_e32 v1, s2 2745; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 2746; GFX10-NEXT: s_endpgm 2747; 2748; GFX11-LABEL: s_test_umin_ult_v1i32: 2749; GFX11: ; %bb.0: 2750; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2751; GFX11-NEXT: v_mov_b32_e32 v0, 0 2752; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2753; GFX11-NEXT: s_min_u32 s2, s2, s3 2754; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2755; GFX11-NEXT: v_mov_b32_e32 v1, s2 2756; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2757; GFX11-NEXT: s_endpgm 2758 %cmp = icmp ult 
<1 x i32> %a, %b 2759 %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b 2760 store <1 x i32> %val, ptr addrspace(1) %out 2761 ret void 2762} 2763 2764define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) #0 { 2765; EG-LABEL: s_test_umin_ult_v8i32: 2766; EG: ; %bb.0: 2767; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 2768; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0 2769; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 2770; EG-NEXT: CF_END 2771; EG-NEXT: ALU clause starting at 4: 2772; EG-NEXT: MIN_UINT * T0.W, KC0[5].X, KC0[7].X, 2773; EG-NEXT: MIN_UINT * T0.Z, KC0[4].W, KC0[6].W, 2774; EG-NEXT: MIN_UINT * T0.Y, KC0[4].Z, KC0[6].Z, 2775; EG-NEXT: MIN_UINT * T0.X, KC0[4].Y, KC0[6].Y, 2776; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2777; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2778; EG-NEXT: MIN_UINT * T2.W, KC0[6].X, KC0[8].X, 2779; EG-NEXT: MIN_UINT * T2.Z, KC0[5].W, KC0[7].W, 2780; EG-NEXT: MIN_UINT * T2.Y, KC0[5].Z, KC0[7].Z, 2781; EG-NEXT: MIN_UINT * T2.X, KC0[5].Y, KC0[7].Y, 2782; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, 2783; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2784; EG-NEXT: LSHR * T3.X, PV.W, literal.x, 2785; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2786; 2787; CI-LABEL: s_test_umin_ult_v8i32: 2788; CI: ; %bb.0: 2789; CI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8 2790; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 2791; CI-NEXT: s_waitcnt lgkmcnt(0) 2792; CI-NEXT: s_min_u32 s4, s15, s23 2793; CI-NEXT: s_min_u32 s5, s14, s22 2794; CI-NEXT: s_min_u32 s6, s13, s21 2795; CI-NEXT: s_min_u32 s7, s12, s20 2796; CI-NEXT: s_min_u32 s2, s19, s27 2797; CI-NEXT: s_min_u32 s3, s18, s26 2798; CI-NEXT: s_min_u32 s8, s17, s25 2799; CI-NEXT: s_min_u32 s9, s16, s24 2800; CI-NEXT: v_mov_b32_e32 v3, s2 2801; CI-NEXT: s_add_u32 s2, s0, 16 2802; CI-NEXT: v_mov_b32_e32 v2, s3 2803; CI-NEXT: s_addc_u32 s3, s1, 0 2804; CI-NEXT: v_mov_b32_e32 v5, s3 2805; CI-NEXT: v_mov_b32_e32 v0, s9 2806; CI-NEXT: 
v_mov_b32_e32 v1, s8 2807; CI-NEXT: v_mov_b32_e32 v4, s2 2808; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2809; CI-NEXT: v_mov_b32_e32 v5, s1 2810; CI-NEXT: v_mov_b32_e32 v0, s7 2811; CI-NEXT: v_mov_b32_e32 v1, s6 2812; CI-NEXT: v_mov_b32_e32 v2, s5 2813; CI-NEXT: v_mov_b32_e32 v3, s4 2814; CI-NEXT: v_mov_b32_e32 v4, s0 2815; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2816; CI-NEXT: s_endpgm 2817; 2818; VI-LABEL: s_test_umin_ult_v8i32: 2819; VI: ; %bb.0: 2820; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20 2821; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 2822; VI-NEXT: s_waitcnt lgkmcnt(0) 2823; VI-NEXT: s_min_u32 s4, s15, s23 2824; VI-NEXT: s_min_u32 s5, s14, s22 2825; VI-NEXT: s_min_u32 s6, s13, s21 2826; VI-NEXT: s_min_u32 s7, s12, s20 2827; VI-NEXT: s_min_u32 s2, s19, s27 2828; VI-NEXT: s_min_u32 s3, s18, s26 2829; VI-NEXT: s_min_u32 s8, s17, s25 2830; VI-NEXT: s_min_u32 s9, s16, s24 2831; VI-NEXT: v_mov_b32_e32 v3, s2 2832; VI-NEXT: s_add_u32 s2, s0, 16 2833; VI-NEXT: v_mov_b32_e32 v2, s3 2834; VI-NEXT: s_addc_u32 s3, s1, 0 2835; VI-NEXT: v_mov_b32_e32 v5, s3 2836; VI-NEXT: v_mov_b32_e32 v0, s9 2837; VI-NEXT: v_mov_b32_e32 v1, s8 2838; VI-NEXT: v_mov_b32_e32 v4, s2 2839; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2840; VI-NEXT: v_mov_b32_e32 v5, s1 2841; VI-NEXT: v_mov_b32_e32 v0, s7 2842; VI-NEXT: v_mov_b32_e32 v1, s6 2843; VI-NEXT: v_mov_b32_e32 v2, s5 2844; VI-NEXT: v_mov_b32_e32 v3, s4 2845; VI-NEXT: v_mov_b32_e32 v4, s0 2846; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2847; VI-NEXT: s_endpgm 2848; 2849; GFX9-LABEL: s_test_umin_ult_v8i32: 2850; GFX9: ; %bb.0: 2851; GFX9-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20 2852; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 2853; GFX9-NEXT: v_mov_b32_e32 v4, 0 2854; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2855; GFX9-NEXT: s_min_u32 s6, s19, s27 2856; GFX9-NEXT: s_min_u32 s7, s18, s26 2857; GFX9-NEXT: s_min_u32 s8, s17, s25 2858; GFX9-NEXT: s_min_u32 s9, s16, s24 2859; GFX9-NEXT: s_min_u32 s2, s15, s23 2860; GFX9-NEXT: 
s_min_u32 s3, s14, s22 2861; GFX9-NEXT: s_min_u32 s4, s13, s21 2862; GFX9-NEXT: s_min_u32 s5, s12, s20 2863; GFX9-NEXT: v_mov_b32_e32 v0, s9 2864; GFX9-NEXT: v_mov_b32_e32 v1, s8 2865; GFX9-NEXT: v_mov_b32_e32 v2, s7 2866; GFX9-NEXT: v_mov_b32_e32 v3, s6 2867; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 2868; GFX9-NEXT: s_nop 0 2869; GFX9-NEXT: v_mov_b32_e32 v0, s5 2870; GFX9-NEXT: v_mov_b32_e32 v1, s4 2871; GFX9-NEXT: v_mov_b32_e32 v2, s3 2872; GFX9-NEXT: v_mov_b32_e32 v3, s2 2873; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2874; GFX9-NEXT: s_endpgm 2875; 2876; GFX10-LABEL: s_test_umin_ult_v8i32: 2877; GFX10: ; %bb.0: 2878; GFX10-NEXT: s_clause 0x1 2879; GFX10-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20 2880; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 2881; GFX10-NEXT: v_mov_b32_e32 v8, 0 2882; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2883; GFX10-NEXT: s_min_u32 s6, s19, s27 2884; GFX10-NEXT: s_min_u32 s7, s18, s26 2885; GFX10-NEXT: s_min_u32 s8, s16, s24 2886; GFX10-NEXT: s_min_u32 s9, s17, s25 2887; GFX10-NEXT: s_min_u32 s2, s15, s23 2888; GFX10-NEXT: s_min_u32 s3, s14, s22 2889; GFX10-NEXT: s_min_u32 s4, s13, s21 2890; GFX10-NEXT: s_min_u32 s5, s12, s20 2891; GFX10-NEXT: v_mov_b32_e32 v0, s8 2892; GFX10-NEXT: v_mov_b32_e32 v1, s9 2893; GFX10-NEXT: v_mov_b32_e32 v2, s7 2894; GFX10-NEXT: v_mov_b32_e32 v3, s6 2895; GFX10-NEXT: v_mov_b32_e32 v4, s5 2896; GFX10-NEXT: v_mov_b32_e32 v5, s4 2897; GFX10-NEXT: v_mov_b32_e32 v6, s3 2898; GFX10-NEXT: v_mov_b32_e32 v7, s2 2899; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 2900; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] 2901; GFX10-NEXT: s_endpgm 2902; 2903; GFX11-LABEL: s_test_umin_ult_v8i32: 2904; GFX11: ; %bb.0: 2905; GFX11-NEXT: s_clause 0x1 2906; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x20 2907; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 2908; GFX11-NEXT: v_mov_b32_e32 v8, 0 2909; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2910; GFX11-NEXT: s_min_u32 s4, s9, s17 2911; GFX11-NEXT: 
s_min_u32 s5, s8, s16 2912; GFX11-NEXT: s_min_u32 s6, s15, s23 2913; GFX11-NEXT: s_min_u32 s7, s14, s22 2914; GFX11-NEXT: s_min_u32 s8, s12, s20 2915; GFX11-NEXT: s_min_u32 s9, s13, s21 2916; GFX11-NEXT: s_min_u32 s2, s11, s19 2917; GFX11-NEXT: s_min_u32 s3, s10, s18 2918; GFX11-NEXT: v_mov_b32_e32 v0, s8 2919; GFX11-NEXT: v_mov_b32_e32 v1, s9 2920; GFX11-NEXT: v_mov_b32_e32 v2, s7 2921; GFX11-NEXT: v_mov_b32_e32 v3, s6 2922; GFX11-NEXT: v_mov_b32_e32 v4, s5 2923; GFX11-NEXT: v_mov_b32_e32 v5, s4 2924; GFX11-NEXT: v_mov_b32_e32 v6, s3 2925; GFX11-NEXT: v_mov_b32_e32 v7, s2 2926; GFX11-NEXT: s_clause 0x1 2927; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 2928; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] 2929; GFX11-NEXT: s_endpgm 2930 %cmp = icmp ult <8 x i32> %a, %b 2931 %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b 2932 store <8 x i32> %val, ptr addrspace(1) %out 2933 ret void 2934} 2935 2936define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16> %a, <8 x i16> %b) #0 { 2937; EG-LABEL: s_test_umin_ult_v8i16: 2938; EG: ; %bb.0: 2939; EG-NEXT: ALU 1, @52, KC0[], KC1[] 2940; EG-NEXT: TEX 1 @20 2941; EG-NEXT: ALU 9, @54, KC0[], KC1[] 2942; EG-NEXT: TEX 1 @24 2943; EG-NEXT: ALU 8, @64, KC0[], KC1[] 2944; EG-NEXT: TEX 1 @28 2945; EG-NEXT: ALU 10, @73, KC0[], KC1[] 2946; EG-NEXT: TEX 1 @32 2947; EG-NEXT: ALU 8, @84, KC0[], KC1[] 2948; EG-NEXT: TEX 1 @36 2949; EG-NEXT: ALU 10, @93, KC0[], KC1[] 2950; EG-NEXT: TEX 1 @40 2951; EG-NEXT: ALU 8, @104, KC0[], KC1[] 2952; EG-NEXT: TEX 1 @44 2953; EG-NEXT: ALU 10, @113, KC0[], KC1[] 2954; EG-NEXT: TEX 1 @48 2955; EG-NEXT: ALU 10, @124, KC0[CB0:0-32], KC1[] 2956; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1 2957; EG-NEXT: CF_END 2958; EG-NEXT: PAD 2959; EG-NEXT: Fetch clause starting at 20: 2960; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3 2961; EG-NEXT: VTX_READ_16 T9.X, T7.X, 82, #3 2962; EG-NEXT: Fetch clause starting at 24: 2963; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3 
2964; EG-NEXT: VTX_READ_16 T9.X, T7.X, 80, #3 2965; EG-NEXT: Fetch clause starting at 28: 2966; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3 2967; EG-NEXT: VTX_READ_16 T9.X, T7.X, 78, #3 2968; EG-NEXT: Fetch clause starting at 32: 2969; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3 2970; EG-NEXT: VTX_READ_16 T9.X, T7.X, 76, #3 2971; EG-NEXT: Fetch clause starting at 36: 2972; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3 2973; EG-NEXT: VTX_READ_16 T9.X, T7.X, 74, #3 2974; EG-NEXT: Fetch clause starting at 40: 2975; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3 2976; EG-NEXT: VTX_READ_16 T9.X, T7.X, 72, #3 2977; EG-NEXT: Fetch clause starting at 44: 2978; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3 2979; EG-NEXT: VTX_READ_16 T9.X, T7.X, 70, #3 2980; EG-NEXT: Fetch clause starting at 48: 2981; EG-NEXT: VTX_READ_16 T8.X, T7.X, 52, #3 2982; EG-NEXT: VTX_READ_16 T7.X, T7.X, 68, #3 2983; EG-NEXT: ALU clause starting at 52: 2984; EG-NEXT: MOV * T0.Y, T3.X, 2985; EG-NEXT: MOV * T7.X, 0.0, 2986; EG-NEXT: ALU clause starting at 54: 2987; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 2988; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, 2989; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2990; EG-NEXT: MIN_UINT * T0.W, PV.W, PS, 2991; EG-NEXT: LSHL T0.W, PV.W, literal.x, 2992; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 2993; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 2994; EG-NEXT: OR_INT * T0.W, PS, PV.W, 2995; EG-NEXT: MOV * T3.X, PV.W, 2996; EG-NEXT: MOV * T0.Y, PV.X, 2997; EG-NEXT: ALU clause starting at 64: 2998; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 2999; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, 3000; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3001; EG-NEXT: AND_INT T2.W, T0.Y, literal.x, 3002; EG-NEXT: MIN_UINT * T0.W, PV.W, PS, 3003; EG-NEXT: -65536(nan), 0(0.000000e+00) 3004; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3005; EG-NEXT: MOV T3.X, PV.W, 3006; EG-NEXT: MOV * T0.Y, T2.X, 3007; EG-NEXT: ALU clause starting at 73: 3008; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3009; EG-NEXT: AND_INT * T1.W, T9.X, 
literal.x, 3010; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3011; EG-NEXT: MIN_UINT T0.W, PV.W, PS, 3012; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, 3013; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3014; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3015; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3016; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3017; EG-NEXT: MOV * T2.X, PV.W, 3018; EG-NEXT: MOV * T0.Y, PV.X, 3019; EG-NEXT: ALU clause starting at 84: 3020; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3021; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, 3022; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3023; EG-NEXT: AND_INT T2.W, T0.Y, literal.x, 3024; EG-NEXT: MIN_UINT * T0.W, PV.W, PS, 3025; EG-NEXT: -65536(nan), 0(0.000000e+00) 3026; EG-NEXT: OR_INT * T7.Z, PV.W, PS, 3027; EG-NEXT: MOV T2.X, PV.Z, 3028; EG-NEXT: MOV * T0.Y, T5.X, 3029; EG-NEXT: ALU clause starting at 93: 3030; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3031; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, 3032; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3033; EG-NEXT: MIN_UINT T0.W, PV.W, PS, 3034; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, 3035; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3036; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3037; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3038; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3039; EG-NEXT: MOV * T5.X, PV.W, 3040; EG-NEXT: MOV * T0.Y, PV.X, 3041; EG-NEXT: ALU clause starting at 104: 3042; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3043; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, 3044; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3045; EG-NEXT: AND_INT T2.W, T0.Y, literal.x, 3046; EG-NEXT: MIN_UINT * T0.W, PV.W, PS, 3047; EG-NEXT: -65536(nan), 0(0.000000e+00) 3048; EG-NEXT: OR_INT * T0.W, PV.W, PS, 3049; EG-NEXT: MOV T5.X, PV.W, 3050; EG-NEXT: MOV * T0.Y, T4.X, 3051; EG-NEXT: ALU clause starting at 113: 3052; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3053; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, 3054; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3055; EG-NEXT: MIN_UINT 
T0.W, PV.W, PS, 3056; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, 3057; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3058; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 3059; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3060; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 3061; EG-NEXT: MOV * T4.X, PV.W, 3062; EG-NEXT: MOV * T0.Y, PV.X, 3063; EG-NEXT: ALU clause starting at 124: 3064; EG-NEXT: AND_INT T0.W, T8.X, literal.x, 3065; EG-NEXT: AND_INT * T1.W, T7.X, literal.x, 3066; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 3067; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x, 3068; EG-NEXT: AND_INT T2.W, T0.Y, literal.y, 3069; EG-NEXT: MIN_UINT * T0.W, PV.W, PS, 3070; EG-NEXT: 2(2.802597e-45), -65536(nan) 3071; EG-NEXT: OR_INT * T7.X, PV.W, PS, 3072; EG-NEXT: MOV T4.X, PV.X, 3073; EG-NEXT: MOV * T7.W, T3.X, 3074; EG-NEXT: MOV * T7.Y, T5.X, 3075; 3076; CI-LABEL: s_test_umin_ult_v8i16: 3077; CI: ; %bb.0: 3078; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4 3079; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 3080; CI-NEXT: s_waitcnt lgkmcnt(0) 3081; CI-NEXT: s_lshr_b32 s10, s0, 16 3082; CI-NEXT: s_and_b32 s0, s0, 0xffff 3083; CI-NEXT: s_lshr_b32 s11, s1, 16 3084; CI-NEXT: s_and_b32 s1, s1, 0xffff 3085; CI-NEXT: s_lshr_b32 s12, s2, 16 3086; CI-NEXT: s_and_b32 s2, s2, 0xffff 3087; CI-NEXT: s_lshr_b32 s13, s3, 16 3088; CI-NEXT: s_and_b32 s3, s3, 0xffff 3089; CI-NEXT: s_lshr_b32 s14, s4, 16 3090; CI-NEXT: s_and_b32 s4, s4, 0xffff 3091; CI-NEXT: s_lshr_b32 s15, s5, 16 3092; CI-NEXT: s_and_b32 s5, s5, 0xffff 3093; CI-NEXT: s_lshr_b32 s16, s6, 16 3094; CI-NEXT: s_and_b32 s6, s6, 0xffff 3095; CI-NEXT: s_lshr_b32 s17, s7, 16 3096; CI-NEXT: s_and_b32 s7, s7, 0xffff 3097; CI-NEXT: s_min_u32 s3, s3, s7 3098; CI-NEXT: s_min_u32 s7, s13, s17 3099; CI-NEXT: s_min_u32 s2, s2, s6 3100; CI-NEXT: s_min_u32 s6, s12, s16 3101; CI-NEXT: s_min_u32 s1, s1, s5 3102; CI-NEXT: s_min_u32 s5, s11, s15 3103; CI-NEXT: s_min_u32 s0, s0, s4 3104; CI-NEXT: s_min_u32 s4, s10, s14 3105; CI-NEXT: s_lshl_b32 s7, s7, 16 3106; CI-NEXT: 
s_lshl_b32 s6, s6, 16 3107; CI-NEXT: s_lshl_b32 s5, s5, 16 3108; CI-NEXT: s_lshl_b32 s4, s4, 16 3109; CI-NEXT: s_or_b32 s3, s3, s7 3110; CI-NEXT: s_or_b32 s2, s2, s6 3111; CI-NEXT: s_or_b32 s1, s1, s5 3112; CI-NEXT: s_or_b32 s0, s0, s4 3113; CI-NEXT: v_mov_b32_e32 v4, s8 3114; CI-NEXT: v_mov_b32_e32 v0, s0 3115; CI-NEXT: v_mov_b32_e32 v1, s1 3116; CI-NEXT: v_mov_b32_e32 v2, s2 3117; CI-NEXT: v_mov_b32_e32 v3, s3 3118; CI-NEXT: v_mov_b32_e32 v5, s9 3119; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3120; CI-NEXT: s_endpgm 3121; 3122; VI-LABEL: s_test_umin_ult_v8i16: 3123; VI: ; %bb.0: 3124; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 3125; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 3126; VI-NEXT: s_waitcnt lgkmcnt(0) 3127; VI-NEXT: s_lshr_b32 s10, s3, 16 3128; VI-NEXT: s_and_b32 s3, s3, 0xffff 3129; VI-NEXT: s_lshr_b32 s11, s2, 16 3130; VI-NEXT: s_and_b32 s2, s2, 0xffff 3131; VI-NEXT: s_lshr_b32 s12, s1, 16 3132; VI-NEXT: s_and_b32 s1, s1, 0xffff 3133; VI-NEXT: s_lshr_b32 s13, s0, 16 3134; VI-NEXT: s_and_b32 s0, s0, 0xffff 3135; VI-NEXT: s_lshr_b32 s14, s7, 16 3136; VI-NEXT: s_and_b32 s7, s7, 0xffff 3137; VI-NEXT: s_lshr_b32 s15, s6, 16 3138; VI-NEXT: s_and_b32 s6, s6, 0xffff 3139; VI-NEXT: s_lshr_b32 s16, s5, 16 3140; VI-NEXT: s_and_b32 s5, s5, 0xffff 3141; VI-NEXT: s_lshr_b32 s17, s4, 16 3142; VI-NEXT: s_and_b32 s4, s4, 0xffff 3143; VI-NEXT: s_min_u32 s0, s0, s4 3144; VI-NEXT: s_min_u32 s4, s13, s17 3145; VI-NEXT: s_min_u32 s1, s1, s5 3146; VI-NEXT: s_min_u32 s5, s12, s16 3147; VI-NEXT: s_min_u32 s2, s2, s6 3148; VI-NEXT: s_min_u32 s6, s11, s15 3149; VI-NEXT: s_min_u32 s3, s3, s7 3150; VI-NEXT: s_min_u32 s7, s10, s14 3151; VI-NEXT: s_lshl_b32 s7, s7, 16 3152; VI-NEXT: s_lshl_b32 s6, s6, 16 3153; VI-NEXT: s_lshl_b32 s5, s5, 16 3154; VI-NEXT: s_lshl_b32 s4, s4, 16 3155; VI-NEXT: s_or_b32 s3, s3, s7 3156; VI-NEXT: s_or_b32 s2, s2, s6 3157; VI-NEXT: s_or_b32 s1, s1, s5 3158; VI-NEXT: s_or_b32 s0, s0, s4 3159; VI-NEXT: v_mov_b32_e32 v4, s8 3160; VI-NEXT: v_mov_b32_e32 
v0, s0 3161; VI-NEXT: v_mov_b32_e32 v1, s1 3162; VI-NEXT: v_mov_b32_e32 v2, s2 3163; VI-NEXT: v_mov_b32_e32 v3, s3 3164; VI-NEXT: v_mov_b32_e32 v5, s9 3165; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 3166; VI-NEXT: s_endpgm 3167; 3168; GFX9-LABEL: s_test_umin_ult_v8i16: 3169; GFX9: ; %bb.0: 3170; GFX9-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 3171; GFX9-NEXT: v_mov_b32_e32 v4, 0 3172; GFX9-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 3173; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3174; GFX9-NEXT: v_mov_b32_e32 v0, s7 3175; GFX9-NEXT: v_mov_b32_e32 v1, s6 3176; GFX9-NEXT: v_pk_min_u16 v3, s3, v0 3177; GFX9-NEXT: v_mov_b32_e32 v0, s5 3178; GFX9-NEXT: v_pk_min_u16 v2, s2, v1 3179; GFX9-NEXT: v_pk_min_u16 v1, s1, v0 3180; GFX9-NEXT: v_mov_b32_e32 v0, s4 3181; GFX9-NEXT: v_pk_min_u16 v0, s0, v0 3182; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] 3183; GFX9-NEXT: s_endpgm 3184; 3185; GFX10-LABEL: s_test_umin_ult_v8i16: 3186; GFX10: ; %bb.0: 3187; GFX10-NEXT: s_clause 0x1 3188; GFX10-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10 3189; GFX10-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 3190; GFX10-NEXT: v_mov_b32_e32 v4, 0 3191; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3192; GFX10-NEXT: v_pk_min_u16 v3, s3, s7 3193; GFX10-NEXT: v_pk_min_u16 v2, s2, s6 3194; GFX10-NEXT: v_pk_min_u16 v1, s1, s5 3195; GFX10-NEXT: v_pk_min_u16 v0, s0, s4 3196; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] 3197; GFX10-NEXT: s_endpgm 3198; 3199; GFX11-LABEL: s_test_umin_ult_v8i16: 3200; GFX11: ; %bb.0: 3201; GFX11-NEXT: s_clause 0x1 3202; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x10 3203; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3204; GFX11-NEXT: v_mov_b32_e32 v4, 0 3205; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3206; GFX11-NEXT: v_pk_min_u16 v3, s11, s15 3207; GFX11-NEXT: v_pk_min_u16 v2, s10, s14 3208; GFX11-NEXT: v_pk_min_u16 v1, s9, s13 3209; GFX11-NEXT: v_pk_min_u16 v0, s8, s12 3210; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 3211; GFX11-NEXT: s_endpgm 3212 %cmp = icmp ult <8 x i16> %a, %b 3213 %val = 
select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b 3214 store <8 x i16> %val, ptr addrspace(1) %out 3215 ret void 3216} 3217 3218; Make sure the redundant 'and' mask is removed. 3219 3220define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspace(1) %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 { 3221; EG-LABEL: simplify_demanded_bits_test_umin_ult_i16: 3222; EG: ; %bb.0: 3223; EG-NEXT: ALU 0, @10, KC0[], KC1[] 3224; EG-NEXT: TEX 1 @6 3225; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 3226; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 3227; EG-NEXT: CF_END 3228; EG-NEXT: PAD 3229; EG-NEXT: Fetch clause starting at 6: 3230; EG-NEXT: VTX_READ_16 T1.X, T0.X, 72, #3 3231; EG-NEXT: VTX_READ_16 T0.X, T0.X, 108, #3 3232; EG-NEXT: ALU clause starting at 10: 3233; EG-NEXT: MOV * T0.X, 0.0, 3234; EG-NEXT: ALU clause starting at 11: 3235; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x, 3236; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212 3237; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3238; EG-NEXT: MIN_UINT T0.X, PV.Z, PV.W, 3239; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 3240; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3241; 3242; CI-LABEL: simplify_demanded_bits_test_umin_ult_i16: 3243; CI: ; %bb.0: 3244; CI-NEXT: s_load_dword s2, s[8:9], 0xa 3245; CI-NEXT: s_load_dword s3, s[8:9], 0x13 3246; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 3247; CI-NEXT: s_waitcnt lgkmcnt(0) 3248; CI-NEXT: s_and_b32 s2, s2, 0xffff 3249; CI-NEXT: s_and_b32 s3, s3, 0xffff 3250; CI-NEXT: s_min_u32 s2, s2, s3 3251; CI-NEXT: v_mov_b32_e32 v0, s0 3252; CI-NEXT: v_mov_b32_e32 v1, s1 3253; CI-NEXT: v_mov_b32_e32 v2, s2 3254; CI-NEXT: flat_store_dword v[0:1], v2 3255; CI-NEXT: s_endpgm 3256; 3257; VI-LABEL: simplify_demanded_bits_test_umin_ult_i16: 3258; VI: ; %bb.0: 3259; VI-NEXT: s_load_dword s2, s[8:9], 0x28 3260; VI-NEXT: s_load_dword s3, s[8:9], 0x4c 3261; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 3262; VI-NEXT: s_waitcnt lgkmcnt(0) 3263; VI-NEXT: 
s_and_b32 s2, s2, 0xffff 3264; VI-NEXT: s_and_b32 s3, s3, 0xffff 3265; VI-NEXT: s_min_u32 s2, s2, s3 3266; VI-NEXT: v_mov_b32_e32 v0, s0 3267; VI-NEXT: v_mov_b32_e32 v1, s1 3268; VI-NEXT: v_mov_b32_e32 v2, s2 3269; VI-NEXT: flat_store_dword v[0:1], v2 3270; VI-NEXT: s_endpgm 3271; 3272; GFX9-LABEL: simplify_demanded_bits_test_umin_ult_i16: 3273; GFX9: ; %bb.0: 3274; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28 3275; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c 3276; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 3277; GFX9-NEXT: v_mov_b32_e32 v0, 0 3278; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3279; GFX9-NEXT: s_and_b32 s2, s2, 0xffff 3280; GFX9-NEXT: s_and_b32 s3, s3, 0xffff 3281; GFX9-NEXT: s_min_u32 s2, s2, s3 3282; GFX9-NEXT: v_mov_b32_e32 v1, s2 3283; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 3284; GFX9-NEXT: s_endpgm 3285; 3286; GFX10-LABEL: simplify_demanded_bits_test_umin_ult_i16: 3287; GFX10: ; %bb.0: 3288; GFX10-NEXT: s_clause 0x2 3289; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28 3290; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c 3291; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 3292; GFX10-NEXT: v_mov_b32_e32 v0, 0 3293; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3294; GFX10-NEXT: s_and_b32 s2, s2, 0xffff 3295; GFX10-NEXT: s_and_b32 s3, s3, 0xffff 3296; GFX10-NEXT: s_min_u32 s2, s2, s3 3297; GFX10-NEXT: v_mov_b32_e32 v1, s2 3298; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 3299; GFX10-NEXT: s_endpgm 3300; 3301; GFX11-LABEL: simplify_demanded_bits_test_umin_ult_i16: 3302; GFX11: ; %bb.0: 3303; GFX11-NEXT: s_clause 0x2 3304; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28 3305; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c 3306; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3307; GFX11-NEXT: v_mov_b32_e32 v0, 0 3308; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3309; GFX11-NEXT: s_and_b32 s2, s2, 0xffff 3310; GFX11-NEXT: s_and_b32 s3, s3, 0xffff 3311; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 3312; GFX11-NEXT: s_min_u32 s2, s2, s3 3313; GFX11-NEXT: v_mov_b32_e32 
v1, s2 3314; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 3315; GFX11-NEXT: s_endpgm 3316 %a.ext = zext i16 %a to i32 3317 %b.ext = zext i16 %b to i32 3318 %cmp = icmp ult i32 %a.ext, %b.ext 3319 %val = select i1 %cmp, i32 %a.ext, i32 %b.ext 3320 %mask = and i32 %val, 65535 3321 store i32 %mask, ptr addrspace(1) %out 3322 ret void 3323} 3324 3325; Make sure redundant sign_extend_inreg removed. 3326 3327define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace(1) %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) #0 { 3328; EG-LABEL: simplify_demanded_bits_test_min_slt_i16: 3329; EG: ; %bb.0: 3330; EG-NEXT: ALU 0, @10, KC0[], KC1[] 3331; EG-NEXT: TEX 1 @6 3332; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 3333; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 3334; EG-NEXT: CF_END 3335; EG-NEXT: PAD 3336; EG-NEXT: Fetch clause starting at 6: 3337; EG-NEXT: VTX_READ_16 T1.X, T0.X, 72, #3 3338; EG-NEXT: VTX_READ_16 T0.X, T0.X, 108, #3 3339; EG-NEXT: ALU clause starting at 10: 3340; EG-NEXT: MOV * T0.X, 0.0, 3341; EG-NEXT: ALU clause starting at 11: 3342; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x, 3343; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212 3344; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 3345; EG-NEXT: MIN_INT T0.X, PV.Z, PV.W, 3346; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 3347; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 3348; 3349; CI-LABEL: simplify_demanded_bits_test_min_slt_i16: 3350; CI: ; %bb.0: 3351; CI-NEXT: s_load_dword s2, s[8:9], 0xa 3352; CI-NEXT: s_load_dword s3, s[8:9], 0x13 3353; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 3354; CI-NEXT: s_waitcnt lgkmcnt(0) 3355; CI-NEXT: s_sext_i32_i16 s2, s2 3356; CI-NEXT: s_sext_i32_i16 s3, s3 3357; CI-NEXT: s_min_i32 s2, s2, s3 3358; CI-NEXT: v_mov_b32_e32 v0, s0 3359; CI-NEXT: v_mov_b32_e32 v1, s1 3360; CI-NEXT: v_mov_b32_e32 v2, s2 3361; CI-NEXT: flat_store_dword v[0:1], v2 3362; CI-NEXT: s_endpgm 3363; 3364; VI-LABEL: 
simplify_demanded_bits_test_min_slt_i16: 3365; VI: ; %bb.0: 3366; VI-NEXT: s_load_dword s2, s[8:9], 0x28 3367; VI-NEXT: s_load_dword s3, s[8:9], 0x4c 3368; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 3369; VI-NEXT: s_waitcnt lgkmcnt(0) 3370; VI-NEXT: s_sext_i32_i16 s2, s2 3371; VI-NEXT: s_sext_i32_i16 s3, s3 3372; VI-NEXT: s_min_i32 s2, s2, s3 3373; VI-NEXT: v_mov_b32_e32 v0, s0 3374; VI-NEXT: v_mov_b32_e32 v1, s1 3375; VI-NEXT: v_mov_b32_e32 v2, s2 3376; VI-NEXT: flat_store_dword v[0:1], v2 3377; VI-NEXT: s_endpgm 3378; 3379; GFX9-LABEL: simplify_demanded_bits_test_min_slt_i16: 3380; GFX9: ; %bb.0: 3381; GFX9-NEXT: s_load_dword s2, s[8:9], 0x28 3382; GFX9-NEXT: s_load_dword s3, s[8:9], 0x4c 3383; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 3384; GFX9-NEXT: v_mov_b32_e32 v0, 0 3385; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3386; GFX9-NEXT: s_sext_i32_i16 s2, s2 3387; GFX9-NEXT: s_sext_i32_i16 s3, s3 3388; GFX9-NEXT: s_min_i32 s2, s2, s3 3389; GFX9-NEXT: v_mov_b32_e32 v1, s2 3390; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 3391; GFX9-NEXT: s_endpgm 3392; 3393; GFX10-LABEL: simplify_demanded_bits_test_min_slt_i16: 3394; GFX10: ; %bb.0: 3395; GFX10-NEXT: s_clause 0x2 3396; GFX10-NEXT: s_load_dword s2, s[8:9], 0x28 3397; GFX10-NEXT: s_load_dword s3, s[8:9], 0x4c 3398; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 3399; GFX10-NEXT: v_mov_b32_e32 v0, 0 3400; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3401; GFX10-NEXT: s_sext_i32_i16 s2, s2 3402; GFX10-NEXT: s_sext_i32_i16 s3, s3 3403; GFX10-NEXT: s_min_i32 s2, s2, s3 3404; GFX10-NEXT: v_mov_b32_e32 v1, s2 3405; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 3406; GFX10-NEXT: s_endpgm 3407; 3408; GFX11-LABEL: simplify_demanded_bits_test_min_slt_i16: 3409; GFX11: ; %bb.0: 3410; GFX11-NEXT: s_clause 0x2 3411; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x28 3412; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x4c 3413; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 3414; GFX11-NEXT: v_mov_b32_e32 v0, 0 3415; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3416; GFX11-NEXT: 
s_sext_i32_i16 s2, s2
; GFX11-NEXT:    s_sext_i32_i16 s3, s3
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_min_i32 s2, s2, s3
; GFX11-NEXT:    v_mov_b32_e32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %a.ext = sext i16 %a to i32
  %b.ext = sext i16 %b to i32
  %cmp = icmp slt i32 %a.ext, %b.ext
  %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
  %shl = shl i32 %val, 16
  %sextinreg = ashr i32 %shl, 16
  store i32 %sextinreg, ptr addrspace(1) %out
  ret void
}

; Scalar signed i16 min: the two i16 kernel arguments are compared with
; icmp sle and the smaller one is selected and stored to %out as an i16.
define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i16 %b) #0 {
; EG-LABEL: s_test_imin_sle_i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 14, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 40, #3
; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 42, #3
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, 0.0,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     BFE_INT T0.Z, T1.X, 0.0, literal.x,
; EG-NEXT:     BFE_INT T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT:    16(2.242078e-44), 3(4.203895e-45)
; EG-NEXT:     MIN_INT * T0.W, PV.Z, PV.W,
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:     LSHL T0.X, PV.W, PS,
; EG-NEXT:     LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     MOV T0.Y, 0.0,
; EG-NEXT:     MOV * T0.Z, 0.0,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: s_test_imin_sle_i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_sext_i32_i16 s3, s2
; CI-NEXT:    s_ashr_i32 s2, s2, 16
; CI-NEXT:    s_min_i32 s2, s3, s2
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v2, s2
; CI-NEXT:    flat_store_short v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: s_test_imin_sle_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_sext_i32_i16 s3, s2
; VI-NEXT:    s_ashr_i32 s2, s2, 16
; VI-NEXT:    s_min_i32 s2, s3, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_test_imin_sle_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_sext_i32_i16 s3, s2
; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
; GFX9-NEXT:    s_min_i32 s2, s3, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: s_test_imin_sle_i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s2, s[8:9], 0x8
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_sext_i32_i16 s3, s2
; GFX10-NEXT:    s_ashr_i32 s2, s2, 16
; GFX10-NEXT:    s_min_i32 s2, s3, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_test_imin_sle_i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_sext_i32_i16 s3, s2
; GFX11-NEXT:    s_ashr_i32 s2, s2, 16
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT:    s_min_i32 s2, s3, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s2
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %cmp = icmp sle i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, ptr addrspace(1) %out
  ret void
}

; 64-bit tests: min of two i64 kernel arguments, written as icmp + select.

; Unsigned i64 min using icmp ult; the selected value is stored to %out.
define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; EG-LABEL: test_umin_ult_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
; EG-NEXT:     SETGT_UINT * T0.W, KC0[3].Z, KC0[3].X,
; EG-NEXT:     SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
; EG-NEXT:     CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
; EG-NEXT:     CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: test_umin_ult_i64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s4
; CI-NEXT:    v_mov_b32_e32 v2, s5
; CI-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    s_and_b64 s[0:1], vcc, exec
; CI-NEXT:    s_cselect_b32 s0, s3, s5
; CI-NEXT:    s_cselect_b32 s1, s2, s4
; CI-NEXT:    v_mov_b32_e32 v2, s1
; CI-NEXT:    v_mov_b32_e32 v3, s0
; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: test_umin_ult_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    v_mov_b32_e32 v2, s5
; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
; VI-NEXT:    s_cselect_b32 s0, s3, s5
; VI-NEXT:    s_cselect_b32 s1, s2, s4
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v3, s0
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_umin_ult_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT:    s_cselect_b32 s3, s3, s5
; GFX9-NEXT:    s_cselect_b32 s2, s2, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: test_umin_ult_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_cmp_lt_u64_e64 s6, s[2:3], s[4:5]
; GFX10-NEXT:    s_and_b32 s6, s6, exec_lo
; GFX10-NEXT:    s_cselect_b32 s2, s2, s4
; GFX10-NEXT:    s_cselect_b32 s3, s3, s5
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_umin_ult_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_cmp_lt_u64_e64 s6, s[2:3], s[4:5]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
; GFX11-NEXT:    s_cselect_b32 s2, s2, s4
; GFX11-NEXT:    s_cselect_b32 s3, s3, s5
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tmp = icmp ult i64 %a, %b
  %val = select i1 %tmp, i64 %a, i64 %b
  store i64 %val, ptr addrspace(1) %out, align 8
  ret void
}

; Unsigned i64 min using icmp ule; the selected value is stored to %out.
define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; EG-LABEL: test_umin_ule_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
; EG-NEXT:     SETGT_UINT * T0.W, KC0[3].Z, KC0[3].X,
; EG-NEXT:     SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
; EG-NEXT:     CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
; EG-NEXT:     CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: test_umin_ule_i64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s4
; CI-NEXT:    v_mov_b32_e32 v2, s5
; CI-NEXT:    v_cmp_le_u64_e32 vcc, s[2:3], v[1:2]
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    s_and_b64 s[0:1], vcc, exec
; CI-NEXT:    s_cselect_b32 s0, s3, s5
; CI-NEXT:    s_cselect_b32 s1, s2, s4
; CI-NEXT:    v_mov_b32_e32 v2, s1
; CI-NEXT:    v_mov_b32_e32 v3, s0
; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: test_umin_ule_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    v_mov_b32_e32 v2, s5
; VI-NEXT:    v_cmp_le_u64_e32 vcc, s[2:3], v[1:2]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
; VI-NEXT:    s_cselect_b32 s0, s3, s5
; VI-NEXT:    s_cselect_b32 s1, s2, s4
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v3, s0
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_umin_ule_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT:    s_cselect_b32 s3, s3, s5
; GFX9-NEXT:    s_cselect_b32 s2, s2, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: test_umin_ule_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_cmp_le_u64_e64 s6, s[2:3], s[4:5]
; GFX10-NEXT:    s_and_b32 s6, s6, exec_lo
; GFX10-NEXT:    s_cselect_b32 s2, s2, s4
; GFX10-NEXT:    s_cselect_b32 s3, s3, s5
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_umin_ule_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_cmp_le_u64_e64 s6, s[2:3], s[4:5]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
; GFX11-NEXT:    s_cselect_b32 s2, s2, s4
; GFX11-NEXT:    s_cselect_b32 s3, s3, s5
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tmp = icmp ule i64 %a, %b
  %val = select i1 %tmp, i64 %a, i64 %b
  store i64 %val, ptr addrspace(1) %out, align 8
  ret void
}

; Signed i64 min using icmp slt; the selected value is stored to %out.
define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; EG-LABEL: test_imin_slt_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
; EG-NEXT:     SETGT_INT * T0.W, KC0[3].Z, KC0[3].X,
; EG-NEXT:     SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
; EG-NEXT:     CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
; EG-NEXT:     CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: test_imin_slt_i64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s4
; CI-NEXT:    v_mov_b32_e32 v2, s5
; CI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    s_and_b64 s[0:1], vcc, exec
; CI-NEXT:    s_cselect_b32 s0, s3, s5
; CI-NEXT:    s_cselect_b32 s1, s2, s4
; CI-NEXT:    v_mov_b32_e32 v2, s1
; CI-NEXT:    v_mov_b32_e32 v3, s0
; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: test_imin_slt_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    v_mov_b32_e32 v2, s5
; VI-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
; VI-NEXT:    s_cselect_b32 s0, s3, s5
; VI-NEXT:    s_cselect_b32 s1, s2, s4
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v3, s0
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_imin_slt_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT:    s_cselect_b32 s3, s3, s5
; GFX9-NEXT:    s_cselect_b32 s2, s2, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: test_imin_slt_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_cmp_lt_i64_e64 s6, s[2:3], s[4:5]
; GFX10-NEXT:    s_and_b32 s6, s6, exec_lo
; GFX10-NEXT:    s_cselect_b32 s2, s2, s4
; GFX10-NEXT:    s_cselect_b32 s3, s3, s5
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_imin_slt_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_cmp_lt_i64_e64 s6, s[2:3], s[4:5]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
; GFX11-NEXT:    s_cselect_b32 s2, s2, s4
; GFX11-NEXT:    s_cselect_b32 s3, s3, s5
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tmp = icmp slt i64 %a, %b
  %val = select i1 %tmp, i64 %a, i64 %b
  store i64 %val, ptr addrspace(1) %out, align 8
  ret void
}

; Signed i64 min using icmp sle; the selected value is stored to %out.
define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; EG-LABEL: test_imin_sle_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
; EG-NEXT:     SETGT_INT * T0.W, KC0[3].Z, KC0[3].X,
; EG-NEXT:     SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
; EG-NEXT:     CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
; EG-NEXT:     CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
; EG-NEXT:     CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: test_imin_sle_i64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s4
; CI-NEXT:    v_mov_b32_e32 v2, s5
; CI-NEXT:    v_cmp_le_i64_e32 vcc, s[2:3], v[1:2]
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    s_and_b64 s[0:1], vcc, exec
; CI-NEXT:    s_cselect_b32 s0, s3, s5
; CI-NEXT:    s_cselect_b32 s1, s2, s4
; CI-NEXT:    v_mov_b32_e32 v2, s1
; CI-NEXT:    v_mov_b32_e32 v3, s0
; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: test_imin_sle_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    v_mov_b32_e32 v2, s5
; VI-NEXT:    v_cmp_le_i64_e32 vcc, s[2:3], v[1:2]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_and_b64 s[0:1], vcc, exec
; VI-NEXT:    s_cselect_b32 s0, s3, s5
; VI-NEXT:    s_cselect_b32 s1, s2, s4
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v3, s0
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_imin_sle_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, s[2:3], v[0:1]
; GFX9-NEXT:    s_and_b64 s[6:7], vcc, exec
; GFX9-NEXT:    s_cselect_b32 s3, s3, s5
; GFX9-NEXT:    s_cselect_b32 s2, s2, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: test_imin_sle_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_cmp_le_i64_e64 s6, s[2:3], s[4:5]
; GFX10-NEXT:    s_and_b32 s6, s6, exec_lo
; GFX10-NEXT:    s_cselect_b32 s2, s2, s4
; GFX10-NEXT:    s_cselect_b32 s3, s3, s5
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_imin_sle_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_cmp_le_i64_e64 s6, s[2:3], s[4:5]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
; GFX11-NEXT:    s_cselect_b32 s2, s2, s4
; GFX11-NEXT:    s_cselect_b32 s3, s3, s5
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_mov_b32_e32 v1, s3
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tmp = icmp sle i64 %a, %b
  %val = select i1 %tmp, i64 %a, i64 %b
  store i64 %val, ptr addrspace(1) %out, align 8
  ret void
}

; Per-workitem signed <2 x i16> min: loads one <2 x i16> from %a.ptr and one
; from %b.ptr at index workitem.id.x, selects element-wise with icmp sle, and
; stores the result to %out at the same index.
define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; EG-LABEL: v_test_imin_sle_v2i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @8
; EG-NEXT:    ALU 0, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @10
; EG-NEXT:    ALU 16, @16, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 8:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    Fetch clause starting at 10:
; EG-NEXT:     VTX_READ_32 T7.X, T7.X, 0, #1
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:     ADD_INT * T7.X, KC0[2].W, T0.W,
; EG-NEXT:    ALU clause starting at 16:
; EG-NEXT:     LSHR T1.W, T0.X, literal.x,
; EG-NEXT:     LSHR * T2.W, T7.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     BFE_INT T8.X, PS, 0.0, literal.x,
; EG-NEXT:     BFE_INT T0.Y, PV.W, 0.0, literal.x,
; EG-NEXT:     BFE_INT T0.Z, T7.X, 0.0, literal.x,
; EG-NEXT:     BFE_INT * T1.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     MIN_INT T1.W, PV.W, PV.Z,
; EG-NEXT:     MIN_INT * T2.W, PV.Y, PV.X,
; EG-NEXT:     LSHL T2.W, PS, literal.x,
; EG-NEXT:     AND_INT * T1.W, PV.W, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:     OR_INT T0.X, PS, PV.W,
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_imin_sle_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    v_mov_b32_e32 v3, s5
; CI-NEXT:    flat_load_dword v4, v[0:1]
; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; CI-NEXT:    flat_load_dword v3, v[0:1]
; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_bfe_i32 v2, v4, 0, 16
; CI-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_bfe_i32 v5, v3, 0, 16
; CI-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
; CI-NEXT:    v_min_i32_e32 v3, v4, v3
; CI-NEXT:    v_min_i32_e32 v2, v2, v5
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: v_test_imin_sle_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v5, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_min_i16_e32 v3, v5, v2
; VI-NEXT:    v_min_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v2, v3, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_imin_sle_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_min_i16 v1, v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_imin_sle_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_min_i16 v1, v1, v2
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_test_imin_sle_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_min_i16 v1, v1, v2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
  %a = load <2 x i16>, ptr addrspace(1) %a.gep
  %b = load <2 x i16>, ptr addrspace(1) %b.gep
  %cmp = icmp sle <2 x i16> %a, %b
  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %val, ptr addrspace(1) %out.gep
  ret void
}

; FIXME: i16 min
; NOTE(review): despite the "imin" in its name, the kernel below compares with
; icmp ule, i.e. it is an unsigned <2 x i16> min. The name is left unchanged
; because the generated label directives are keyed on it.
; Same load / select / store shape as v_test_imin_sle_v2i16, indexed by
; workitem.id.x.
define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; EG-LABEL: v_test_imin_ule_v2i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @8
; EG-NEXT:    ALU 0, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @10
; EG-NEXT:    ALU 13, @16, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 8:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    Fetch clause starting at 10:
; EG-NEXT:     VTX_READ_32 T7.X, T7.X, 0, #1
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].W, PV.W,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:     ADD_INT * T7.X, KC0[2].Z, T0.W,
; EG-NEXT:    ALU clause starting at 16:
; EG-NEXT:     LSHR T1.W, T0.X, literal.x,
; EG-NEXT:     LSHR * T2.W, T7.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     AND_INT T0.Z, T0.X, literal.x,
; EG-NEXT:     AND_INT T3.W, T7.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:     MIN_UINT * T1.W, PS, PV.W,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     LSHL T1.W, PS, literal.x,
; EG-NEXT:     MIN_UINT * T2.W, PV.W, PV.Z,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     OR_INT T0.X, PS, PV.W,
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; CI-LABEL: v_test_imin_ule_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    v_mov_b32_e32 v3, s5
; CI-NEXT:    flat_load_dword v4, v[0:1]
; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; CI-NEXT:    flat_load_dword v3, v[0:1]
; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
; CI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; CI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; CI-NEXT:    v_min_u32_e32 v2, v2, v5
; CI-NEXT:    v_min_u32_e32 v3, v4, v3
; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_or_b32_e32 v2, v3, v2
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: v_test_imin_ule_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v5, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_min_u16_e32 v3, v5, v2
; VI-NEXT:    v_min_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v2, v3, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_imin_ule_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_min_u16 v1, v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_test_imin_ule_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    global_load_dword v2, v0, s[4:5]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_min_u16 v1, v1, v2
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_test_imin_ule_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_min_u16 v1, v1, v2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
  %b.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
  %a = load <2 x i16>, ptr addrspace(1) %a.gep
  %b = load <2 x i16>, ptr addrspace(1) %b.gep
  %cmp = icmp ule <2 x i16> %a, %b
  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %val, ptr addrspace(1) %out.gep
  ret void
}

; Intrinsic called by the v_test_* kernels to obtain the index used for their
; per-workitem getelementptr addressing.
declare i32 @llvm.amdgcn.workitem.id.x() #1

; Attribute group #0: referenced by the amdgpu_kernel definitions in this file.
attributes #0 = { nounwind }
; Attribute group #1: referenced by the llvm.amdgcn.workitem.id.x declaration.
attributes #1 = { nounwind readnone }