; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s --check-prefix=VI
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX9

; Codegen test for 16-bit integer max patterns. Every kernel loads two values
; indexed by the workitem id, keeps the larger one with an icmp + select pair,
; and stores the result. The assertions expect that pair to be emitted as
; v_max_* / v_pk_max_* instructions. They are generated output: regenerate
; them with the script named in the NOTE line rather than editing by hand.

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Scalar i16 signed max written as icmp sge + select. Both runs are expected
; to emit a single v_max_i16_e32.
define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; VI-LABEL: v_test_imax_sge_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_i16_e32 v2, v5, v2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sge_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_i16_e32 v1, v1, v2
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  ; %tid indexes all three buffers, so each workitem touches its own element.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
  %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
  %a = load i16, ptr addrspace(1) %gep0, align 4
  %b = load i16, ptr addrspace(1) %gep1, align 4
  ; a >= b ? a : b (signed) is the max pattern under test.
  %cmp = icmp sge i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, ptr addrspace(1) %outgep, align 4
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; <2 x i16> signed max (icmp sge + select). The fiji run is expected to use
; v_max_i16_e32 for the low half and an SDWA v_max_i16 for the high half,
; combined with v_or_b32; the gfx900 run is expected to use one v_pk_max_i16.
define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; VI-LABEL: v_test_imax_sge_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_i16_e32 v3, v5, v2
; VI-NEXT: v_max_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sge_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_i16 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep0 = getelementptr <2 x i16>, ptr addrspace(1) %aptr, i32 %tid
  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %bptr, i32 %tid
  %outgep = getelementptr <2 x i16>, ptr addrspace(1) %out, i32 %tid
  %a = load <2 x i16>, ptr addrspace(1) %gep0, align 4
  %b = load <2 x i16>, ptr addrspace(1) %gep1, align 4
  %cmp = icmp sge <2 x i16> %a, %b
  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %val, ptr addrspace(1) %outgep, align 4
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; <3 x i16> signed max (icmp sge + select). In the expected output each
; vector is accessed as a dword plus a separate 16-bit access at byte offset 4,
; and the max is computed separately for those two parts.
define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; VI-LABEL: v_test_imax_sge_v3i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v4, v[4:5]
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v7, v[2:3]
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v8, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_max_i16_e32 v6, v5, v7
; VI-NEXT: v_max_i16_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v5, v6, v5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_i16_e32 v4, v4, v8
; VI-NEXT: flat_store_short v[2:3], v4
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sge_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_short_d16 v2, v0, s[2:3] offset:4
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_dword v4, v0, s[2:3]
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: global_load_short_d16 v1, v0, s[6:7] offset:4
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_pk_max_i16 v3, v4, v3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_i16 v1, v2, v1
; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:4
; GFX9-NEXT: global_store_dword v0, v3, s[0:1]
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep0 = getelementptr <3 x i16>, ptr addrspace(1) %aptr, i32 %tid
  %gep1 = getelementptr <3 x i16>, ptr addrspace(1) %bptr, i32 %tid
  %outgep = getelementptr <3 x i16>, ptr addrspace(1) %out, i32 %tid
  %a = load <3 x i16>, ptr addrspace(1) %gep0, align 4
  %b = load <3 x i16>, ptr addrspace(1) %gep1, align 4
  %cmp = icmp sge <3 x i16> %a, %b
  %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
  store <3 x i16> %val, ptr addrspace(1) %outgep, align 4
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; <4 x i16> signed max written as icmp sge + select, with every buffer indexed
; by the workitem id. The fiji run is expected to use two v_max_i16_e32 plus
; two SDWA v_max_i16 (one pair per dword); the gfx900 run is expected to use
; two v_pk_max_i16.
define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; VI-LABEL: v_test_imax_sge_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_i16_e32 v6, v1, v3
; VI-NEXT: v_max_i16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_i16_e32 v3, v0, v2
; VI-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v1, v6, v1
; VI-NEXT: v_or_b32_e32 v0, v3, v0
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sge_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_i16 v1, v1, v3
; GFX9-NEXT: v_pk_max_i16 v0, v0, v2
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
  ; %tid indexes all three buffers, so each workitem touches its own element.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep0 = getelementptr <4 x i16>, ptr addrspace(1) %aptr, i32 %tid
  %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %bptr, i32 %tid
  %outgep = getelementptr <4 x i16>, ptr addrspace(1) %out, i32 %tid
  %a = load <4 x i16>, ptr addrspace(1) %gep0, align 4
  %b = load <4 x i16>, ptr addrspace(1) %gep1, align 4
  ; Lane-wise a >= b ? a : b (signed) is the max pattern under test.
  %cmp = icmp sge <4 x i16> %a, %b
  %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
  store <4 x i16> %val, ptr addrspace(1) %outgep, align 4
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Scalar i16 signed max using the strict predicate (icmp sgt + select). Both
; runs are expected to emit a single v_max_i16_e32.
define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; VI-LABEL: v_test_imax_sgt_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_i16_e32 v2, v5, v2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sgt_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_i16_e32 v1, v1, v2
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
  %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
  %a = load i16, ptr addrspace(1) %gep0, align 4
  %b = load i16, ptr addrspace(1) %gep1, align 4
  %cmp = icmp sgt i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, ptr addrspace(1) %outgep, align 4
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Scalar i16 unsigned max (icmp uge + select). Both runs are expected to emit
; a single v_max_u16_e32.
define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; VI-LABEL: v_test_umax_uge_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_u16_e32 v2, v5, v2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umax_uge_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_u16_e32 v1, v1, v2
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
  %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
  %a = load i16, ptr addrspace(1) %gep0, align 4
  %b = load i16, ptr addrspace(1) %gep1, align 4
  %cmp = icmp uge i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, ptr addrspace(1) %outgep, align 4
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Scalar i16 unsigned max using the strict predicate (icmp ugt + select). Both
; runs are expected to emit a single v_max_u16_e32.
define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; VI-LABEL: v_test_umax_ugt_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_u16_e32 v2, v5, v2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umax_ugt_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_u16_e32 v1, v1, v2
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid
  %gep1 = getelementptr i16, ptr addrspace(1) %bptr, i32 %tid
  %outgep = getelementptr i16, ptr addrspace(1) %out, i32 %tid
  %a = load i16, ptr addrspace(1) %gep0, align 4
  %b = load i16, ptr addrspace(1) %gep1, align 4
  %cmp = icmp ugt i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, ptr addrspace(1) %outgep, align 4
  ret void
}

; <2 x i16> unsigned max (icmp ugt + select). The fiji run is expected to use
; v_max_u16_e32 for the low half and an SDWA v_max_u16 for the high half,
; combined with v_or_b32; the gfx900 run is expected to use one v_pk_max_u16.
define amdgpu_kernel void @v_test_umax_ugt_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; VI-LABEL: v_test_umax_ugt_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_u16_e32 v3, v5, v2
; VI-NEXT: v_max_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umax_ugt_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_u16 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep0 = getelementptr <2 x i16>, ptr addrspace(1) %aptr, i32 %tid
  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %bptr, i32 %tid
  %outgep = getelementptr <2 x i16>, ptr addrspace(1) %out, i32 %tid
  %a = load <2 x i16>, ptr addrspace(1) %gep0, align 4
  %b = load <2 x i16>, ptr addrspace(1) %gep1, align 4
  %cmp = icmp ugt <2 x i16> %a, %b
  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
  store <2 x i16> %val, ptr addrspace(1) %outgep, align 4
  ret void
}

; Intrinsic called by the kernels in this file to produce %tid, the index
; used for all of their buffer accesses.
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone