1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=SI %s 3; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s 4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s 5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s 6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s 7 8declare half @llvm.maxnum.f16(half %a, half %b) 9declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) 10declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b) 11declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b) 12 13define amdgpu_kernel void @maxnum_f16( 14; SI-LABEL: maxnum_f16: 15; SI: ; %bb.0: ; %entry 16; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 17; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 18; SI-NEXT: s_mov_b32 s7, 0xf000 19; SI-NEXT: s_mov_b32 s6, -1 20; SI-NEXT: s_mov_b32 s14, s6 21; SI-NEXT: s_waitcnt lgkmcnt(0) 22; SI-NEXT: s_mov_b32 s12, s2 23; SI-NEXT: s_mov_b32 s13, s3 24; SI-NEXT: s_mov_b32 s15, s7 25; SI-NEXT: s_mov_b32 s10, s6 26; SI-NEXT: s_mov_b32 s11, s7 27; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 28; SI-NEXT: s_waitcnt vmcnt(0) 29; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 30; SI-NEXT: s_waitcnt vmcnt(0) 31; SI-NEXT: s_mov_b32 s4, s0 32; SI-NEXT: s_mov_b32 s5, s1 33; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 34; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 35; SI-NEXT: v_max_f32_e32 v0, v0, v1 36; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 37; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 38; SI-NEXT: s_endpgm 39; 40; 
VI-LABEL: maxnum_f16: 41; VI: ; %bb.0: ; %entry 42; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 43; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 44; VI-NEXT: s_mov_b32 s7, 0xf000 45; VI-NEXT: s_mov_b32 s6, -1 46; VI-NEXT: s_mov_b32 s14, s6 47; VI-NEXT: s_waitcnt lgkmcnt(0) 48; VI-NEXT: s_mov_b32 s12, s2 49; VI-NEXT: s_mov_b32 s13, s3 50; VI-NEXT: s_mov_b32 s15, s7 51; VI-NEXT: s_mov_b32 s10, s6 52; VI-NEXT: s_mov_b32 s11, s7 53; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 54; VI-NEXT: s_waitcnt vmcnt(0) 55; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 56; VI-NEXT: s_waitcnt vmcnt(0) 57; VI-NEXT: s_mov_b32 s4, s0 58; VI-NEXT: s_mov_b32 s5, s1 59; VI-NEXT: v_max_f16_e32 v0, v0, v0 60; VI-NEXT: v_max_f16_e32 v1, v1, v1 61; VI-NEXT: v_max_f16_e32 v0, v0, v1 62; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 63; VI-NEXT: s_endpgm 64; 65; GFX9-LABEL: maxnum_f16: 66; GFX9: ; %bb.0: ; %entry 67; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 68; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 69; GFX9-NEXT: s_mov_b32 s7, 0xf000 70; GFX9-NEXT: s_mov_b32 s6, -1 71; GFX9-NEXT: s_mov_b32 s14, s6 72; GFX9-NEXT: s_waitcnt lgkmcnt(0) 73; GFX9-NEXT: s_mov_b32 s12, s2 74; GFX9-NEXT: s_mov_b32 s13, s3 75; GFX9-NEXT: s_mov_b32 s15, s7 76; GFX9-NEXT: s_mov_b32 s10, s6 77; GFX9-NEXT: s_mov_b32 s11, s7 78; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 79; GFX9-NEXT: s_waitcnt vmcnt(0) 80; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 81; GFX9-NEXT: s_waitcnt vmcnt(0) 82; GFX9-NEXT: s_mov_b32 s4, s0 83; GFX9-NEXT: s_mov_b32 s5, s1 84; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 85; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 86; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 87; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 88; GFX9-NEXT: s_endpgm 89; 90; GFX10-LABEL: maxnum_f16: 91; GFX10: ; %bb.0: ; %entry 92; GFX10-NEXT: s_clause 0x1 93; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 94; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 95; GFX10-NEXT: s_mov_b32 s6, -1 96; 
GFX10-NEXT: s_mov_b32 s7, 0x31016000 97; GFX10-NEXT: s_mov_b32 s14, s6 98; GFX10-NEXT: s_mov_b32 s15, s7 99; GFX10-NEXT: s_mov_b32 s10, s6 100; GFX10-NEXT: s_mov_b32 s11, s7 101; GFX10-NEXT: s_waitcnt lgkmcnt(0) 102; GFX10-NEXT: s_mov_b32 s12, s2 103; GFX10-NEXT: s_mov_b32 s13, s3 104; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc 105; GFX10-NEXT: s_waitcnt vmcnt(0) 106; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc 107; GFX10-NEXT: s_waitcnt vmcnt(0) 108; GFX10-NEXT: s_mov_b32 s4, s0 109; GFX10-NEXT: s_mov_b32 s5, s1 110; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 111; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 112; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 113; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 114; GFX10-NEXT: s_endpgm 115; 116; GFX11-LABEL: maxnum_f16: 117; GFX11: ; %bb.0: ; %entry 118; GFX11-NEXT: s_clause 0x1 119; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 120; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 121; GFX11-NEXT: s_mov_b32 s10, -1 122; GFX11-NEXT: s_mov_b32 s11, 0x31016000 123; GFX11-NEXT: s_mov_b32 s14, s10 124; GFX11-NEXT: s_mov_b32 s15, s11 125; GFX11-NEXT: s_mov_b32 s6, s10 126; GFX11-NEXT: s_mov_b32 s7, s11 127; GFX11-NEXT: s_waitcnt lgkmcnt(0) 128; GFX11-NEXT: s_mov_b32 s12, s2 129; GFX11-NEXT: s_mov_b32 s13, s3 130; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 131; GFX11-NEXT: s_waitcnt vmcnt(0) 132; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 133; GFX11-NEXT: s_waitcnt vmcnt(0) 134; GFX11-NEXT: s_mov_b32 s8, s0 135; GFX11-NEXT: s_mov_b32 s9, s1 136; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 137; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 138; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 139; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 140; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 141; GFX11-NEXT: s_endpgm 142 ptr addrspace(1) %r, 143 ptr addrspace(1) %a, 144 ptr addrspace(1) %b) #0 { 145entry: 146 %a.val = load volatile half, ptr addrspace(1) %a 147 %b.val = load volatile half, ptr addrspace(1) %b 148 
%r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val) 149 store half %r.val, ptr addrspace(1) %r 150 ret void 151} 152 153define amdgpu_kernel void @maxnum_f16_imm_a( 154; SI-LABEL: maxnum_f16_imm_a: 155; SI: ; %bb.0: ; %entry 156; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 157; SI-NEXT: s_mov_b32 s7, 0xf000 158; SI-NEXT: s_mov_b32 s6, -1 159; SI-NEXT: s_mov_b32 s10, s6 160; SI-NEXT: s_mov_b32 s11, s7 161; SI-NEXT: s_waitcnt lgkmcnt(0) 162; SI-NEXT: s_mov_b32 s8, s2 163; SI-NEXT: s_mov_b32 s9, s3 164; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 165; SI-NEXT: s_mov_b32 s4, s0 166; SI-NEXT: s_mov_b32 s5, s1 167; SI-NEXT: s_waitcnt vmcnt(0) 168; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 169; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 170; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 171; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 172; SI-NEXT: s_endpgm 173; 174; VI-LABEL: maxnum_f16_imm_a: 175; VI: ; %bb.0: ; %entry 176; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 177; VI-NEXT: s_mov_b32 s7, 0xf000 178; VI-NEXT: s_mov_b32 s6, -1 179; VI-NEXT: s_mov_b32 s10, s6 180; VI-NEXT: s_mov_b32 s11, s7 181; VI-NEXT: s_waitcnt lgkmcnt(0) 182; VI-NEXT: s_mov_b32 s8, s2 183; VI-NEXT: s_mov_b32 s9, s3 184; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 185; VI-NEXT: s_mov_b32 s4, s0 186; VI-NEXT: s_mov_b32 s5, s1 187; VI-NEXT: s_waitcnt vmcnt(0) 188; VI-NEXT: v_max_f16_e32 v0, v0, v0 189; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0 190; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 191; VI-NEXT: s_endpgm 192; 193; GFX9-LABEL: maxnum_f16_imm_a: 194; GFX9: ; %bb.0: ; %entry 195; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 196; GFX9-NEXT: s_mov_b32 s7, 0xf000 197; GFX9-NEXT: s_mov_b32 s6, -1 198; GFX9-NEXT: s_mov_b32 s10, s6 199; GFX9-NEXT: s_mov_b32 s11, s7 200; GFX9-NEXT: s_waitcnt lgkmcnt(0) 201; GFX9-NEXT: s_mov_b32 s8, s2 202; GFX9-NEXT: s_mov_b32 s9, s3 203; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 204; GFX9-NEXT: s_mov_b32 s4, s0 205; GFX9-NEXT: s_mov_b32 s5, s1 206; 
GFX9-NEXT: s_waitcnt vmcnt(0) 207; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 208; GFX9-NEXT: v_max_f16_e32 v0, 0x4200, v0 209; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 210; GFX9-NEXT: s_endpgm 211; 212; GFX10-LABEL: maxnum_f16_imm_a: 213; GFX10: ; %bb.0: ; %entry 214; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 215; GFX10-NEXT: s_mov_b32 s6, -1 216; GFX10-NEXT: s_mov_b32 s7, 0x31016000 217; GFX10-NEXT: s_mov_b32 s10, s6 218; GFX10-NEXT: s_mov_b32 s11, s7 219; GFX10-NEXT: s_waitcnt lgkmcnt(0) 220; GFX10-NEXT: s_mov_b32 s8, s2 221; GFX10-NEXT: s_mov_b32 s9, s3 222; GFX10-NEXT: s_mov_b32 s4, s0 223; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 224; GFX10-NEXT: s_mov_b32 s5, s1 225; GFX10-NEXT: s_waitcnt vmcnt(0) 226; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 227; GFX10-NEXT: v_max_f16_e32 v0, 0x4200, v0 228; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 229; GFX10-NEXT: s_endpgm 230; 231; GFX11-LABEL: maxnum_f16_imm_a: 232; GFX11: ; %bb.0: ; %entry 233; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 234; GFX11-NEXT: s_mov_b32 s6, -1 235; GFX11-NEXT: s_mov_b32 s7, 0x31016000 236; GFX11-NEXT: s_mov_b32 s10, s6 237; GFX11-NEXT: s_mov_b32 s11, s7 238; GFX11-NEXT: s_waitcnt lgkmcnt(0) 239; GFX11-NEXT: s_mov_b32 s8, s2 240; GFX11-NEXT: s_mov_b32 s9, s3 241; GFX11-NEXT: s_mov_b32 s4, s0 242; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 243; GFX11-NEXT: s_mov_b32 s5, s1 244; GFX11-NEXT: s_waitcnt vmcnt(0) 245; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 246; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 247; GFX11-NEXT: v_max_f16_e32 v0, 0x4200, v0 248; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 249; GFX11-NEXT: s_endpgm 250 ptr addrspace(1) %r, 251 ptr addrspace(1) %b) #0 { 252entry: 253 %b.val = load half, ptr addrspace(1) %b 254 %r.val = call half @llvm.maxnum.f16(half 3.0, half %b.val) 255 store half %r.val, ptr addrspace(1) %r 256 ret void 257} 258 259define amdgpu_kernel void @maxnum_f16_imm_b( 260; SI-LABEL: maxnum_f16_imm_b: 261; SI: ; %bb.0: ; %entry 262; 
SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 263; SI-NEXT: s_mov_b32 s7, 0xf000 264; SI-NEXT: s_mov_b32 s6, -1 265; SI-NEXT: s_mov_b32 s10, s6 266; SI-NEXT: s_mov_b32 s11, s7 267; SI-NEXT: s_waitcnt lgkmcnt(0) 268; SI-NEXT: s_mov_b32 s8, s2 269; SI-NEXT: s_mov_b32 s9, s3 270; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 271; SI-NEXT: s_mov_b32 s4, s0 272; SI-NEXT: s_mov_b32 s5, s1 273; SI-NEXT: s_waitcnt vmcnt(0) 274; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 275; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 276; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 277; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 278; SI-NEXT: s_endpgm 279; 280; VI-LABEL: maxnum_f16_imm_b: 281; VI: ; %bb.0: ; %entry 282; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 283; VI-NEXT: s_mov_b32 s7, 0xf000 284; VI-NEXT: s_mov_b32 s6, -1 285; VI-NEXT: s_mov_b32 s10, s6 286; VI-NEXT: s_mov_b32 s11, s7 287; VI-NEXT: s_waitcnt lgkmcnt(0) 288; VI-NEXT: s_mov_b32 s8, s2 289; VI-NEXT: s_mov_b32 s9, s3 290; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 291; VI-NEXT: s_mov_b32 s4, s0 292; VI-NEXT: s_mov_b32 s5, s1 293; VI-NEXT: s_waitcnt vmcnt(0) 294; VI-NEXT: v_max_f16_e32 v0, v0, v0 295; VI-NEXT: v_max_f16_e32 v0, 4.0, v0 296; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 297; VI-NEXT: s_endpgm 298; 299; GFX9-LABEL: maxnum_f16_imm_b: 300; GFX9: ; %bb.0: ; %entry 301; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 302; GFX9-NEXT: s_mov_b32 s7, 0xf000 303; GFX9-NEXT: s_mov_b32 s6, -1 304; GFX9-NEXT: s_mov_b32 s10, s6 305; GFX9-NEXT: s_mov_b32 s11, s7 306; GFX9-NEXT: s_waitcnt lgkmcnt(0) 307; GFX9-NEXT: s_mov_b32 s8, s2 308; GFX9-NEXT: s_mov_b32 s9, s3 309; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 310; GFX9-NEXT: s_mov_b32 s4, s0 311; GFX9-NEXT: s_mov_b32 s5, s1 312; GFX9-NEXT: s_waitcnt vmcnt(0) 313; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 314; GFX9-NEXT: v_max_f16_e32 v0, 4.0, v0 315; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 316; GFX9-NEXT: s_endpgm 317; 318; GFX10-LABEL: maxnum_f16_imm_b: 319; GFX10: ; %bb.0: ; 
%entry 320; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 321; GFX10-NEXT: s_mov_b32 s6, -1 322; GFX10-NEXT: s_mov_b32 s7, 0x31016000 323; GFX10-NEXT: s_mov_b32 s10, s6 324; GFX10-NEXT: s_mov_b32 s11, s7 325; GFX10-NEXT: s_waitcnt lgkmcnt(0) 326; GFX10-NEXT: s_mov_b32 s8, s2 327; GFX10-NEXT: s_mov_b32 s9, s3 328; GFX10-NEXT: s_mov_b32 s4, s0 329; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 330; GFX10-NEXT: s_mov_b32 s5, s1 331; GFX10-NEXT: s_waitcnt vmcnt(0) 332; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 333; GFX10-NEXT: v_max_f16_e32 v0, 4.0, v0 334; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 335; GFX10-NEXT: s_endpgm 336; 337; GFX11-LABEL: maxnum_f16_imm_b: 338; GFX11: ; %bb.0: ; %entry 339; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 340; GFX11-NEXT: s_mov_b32 s6, -1 341; GFX11-NEXT: s_mov_b32 s7, 0x31016000 342; GFX11-NEXT: s_mov_b32 s10, s6 343; GFX11-NEXT: s_mov_b32 s11, s7 344; GFX11-NEXT: s_waitcnt lgkmcnt(0) 345; GFX11-NEXT: s_mov_b32 s8, s2 346; GFX11-NEXT: s_mov_b32 s9, s3 347; GFX11-NEXT: s_mov_b32 s4, s0 348; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 349; GFX11-NEXT: s_mov_b32 s5, s1 350; GFX11-NEXT: s_waitcnt vmcnt(0) 351; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 352; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 353; GFX11-NEXT: v_max_f16_e32 v0, 4.0, v0 354; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 355; GFX11-NEXT: s_endpgm 356 ptr addrspace(1) %r, 357 ptr addrspace(1) %a) #0 { 358entry: 359 %a.val = load half, ptr addrspace(1) %a 360 %r.val = call half @llvm.maxnum.f16(half %a.val, half 4.0) 361 store half %r.val, ptr addrspace(1) %r 362 ret void 363} 364 365define amdgpu_kernel void @maxnum_v2f16( 366; SI-LABEL: maxnum_v2f16: 367; SI: ; %bb.0: ; %entry 368; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 369; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 370; SI-NEXT: s_waitcnt lgkmcnt(0) 371; SI-NEXT: s_load_dword s2, s[2:3], 0x0 372; SI-NEXT: s_load_dword s4, s[4:5], 0x0 373; SI-NEXT: s_mov_b32 s3, 0xf000 374; SI-NEXT: s_waitcnt 
lgkmcnt(0) 375; SI-NEXT: s_lshr_b32 s5, s2, 16 376; SI-NEXT: s_lshr_b32 s6, s4, 16 377; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 378; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 379; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 380; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 381; SI-NEXT: s_mov_b32 s2, -1 382; SI-NEXT: v_max_f32_e32 v0, v0, v1 383; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 384; SI-NEXT: v_max_f32_e32 v1, v2, v3 385; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 386; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 387; SI-NEXT: v_or_b32_e32 v0, v1, v0 388; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 389; SI-NEXT: s_endpgm 390; 391; VI-LABEL: maxnum_v2f16: 392; VI: ; %bb.0: ; %entry 393; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 394; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 395; VI-NEXT: s_mov_b32 s7, 0xf000 396; VI-NEXT: s_mov_b32 s6, -1 397; VI-NEXT: s_waitcnt lgkmcnt(0) 398; VI-NEXT: s_load_dword s8, s[8:9], 0x0 399; VI-NEXT: s_load_dword s2, s[2:3], 0x0 400; VI-NEXT: s_mov_b32 s4, s0 401; VI-NEXT: s_mov_b32 s5, s1 402; VI-NEXT: s_waitcnt lgkmcnt(0) 403; VI-NEXT: v_max_f16_e64 v0, s8, s8 404; VI-NEXT: v_max_f16_e64 v1, s2, s2 405; VI-NEXT: s_lshr_b32 s0, s8, 16 406; VI-NEXT: v_max_f16_e32 v0, v1, v0 407; VI-NEXT: v_max_f16_e64 v1, s0, s0 408; VI-NEXT: s_lshr_b32 s0, s2, 16 409; VI-NEXT: v_max_f16_e64 v2, s0, s0 410; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 411; VI-NEXT: v_or_b32_e32 v0, v0, v1 412; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 413; VI-NEXT: s_endpgm 414; 415; GFX9-LABEL: maxnum_v2f16: 416; GFX9: ; %bb.0: ; %entry 417; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 418; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 419; GFX9-NEXT: s_mov_b32 s7, 0xf000 420; GFX9-NEXT: s_mov_b32 s6, -1 421; GFX9-NEXT: s_waitcnt lgkmcnt(0) 422; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 423; GFX9-NEXT: s_load_dword s11, s[2:3], 0x0 424; GFX9-NEXT: s_mov_b32 s4, s0 425; GFX9-NEXT: s_mov_b32 s5, s1 426; GFX9-NEXT: s_waitcnt lgkmcnt(0) 427; 
GFX9-NEXT: v_pk_max_f16 v0, s10, s10 428; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 429; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 430; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 431; GFX9-NEXT: s_endpgm 432; 433; GFX10-LABEL: maxnum_v2f16: 434; GFX10: ; %bb.0: ; %entry 435; GFX10-NEXT: s_clause 0x1 436; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 437; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 438; GFX10-NEXT: s_waitcnt lgkmcnt(0) 439; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 440; GFX10-NEXT: s_load_dword s5, s[2:3], 0x0 441; GFX10-NEXT: s_mov_b32 s3, 0x31016000 442; GFX10-NEXT: s_mov_b32 s2, -1 443; GFX10-NEXT: s_waitcnt lgkmcnt(0) 444; GFX10-NEXT: v_pk_max_f16 v0, s4, s4 445; GFX10-NEXT: v_pk_max_f16 v1, s5, s5 446; GFX10-NEXT: v_pk_max_f16 v0, v1, v0 447; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 448; GFX10-NEXT: s_endpgm 449; 450; GFX11-LABEL: maxnum_v2f16: 451; GFX11: ; %bb.0: ; %entry 452; GFX11-NEXT: s_clause 0x1 453; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 454; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 455; GFX11-NEXT: s_waitcnt lgkmcnt(0) 456; GFX11-NEXT: s_load_b32 s4, s[6:7], 0x0 457; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 458; GFX11-NEXT: s_mov_b32 s3, 0x31016000 459; GFX11-NEXT: s_waitcnt lgkmcnt(0) 460; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 461; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 462; GFX11-NEXT: s_mov_b32 s2, -1 463; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 464; GFX11-NEXT: v_pk_max_f16 v0, v1, v0 465; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 466; GFX11-NEXT: s_endpgm 467 ptr addrspace(1) %r, 468 ptr addrspace(1) %a, 469 ptr addrspace(1) %b) #0 { 470entry: 471 %a.val = load <2 x half>, ptr addrspace(1) %a 472 %b.val = load <2 x half>, ptr addrspace(1) %b 473 %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> %b.val) 474 store <2 x half> %r.val, ptr addrspace(1) %r 475 ret void 476} 477 478define amdgpu_kernel void @maxnum_v2f16_imm_a( 479; SI-LABEL: maxnum_v2f16_imm_a: 480; SI: ; %bb.0: ; %entry 481; 
SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 482; SI-NEXT: s_waitcnt lgkmcnt(0) 483; SI-NEXT: s_load_dword s2, s[2:3], 0x0 484; SI-NEXT: s_waitcnt lgkmcnt(0) 485; SI-NEXT: s_lshr_b32 s3, s2, 16 486; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 487; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 488; SI-NEXT: s_mov_b32 s3, 0xf000 489; SI-NEXT: s_mov_b32 s2, -1 490; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 491; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 492; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1 493; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 494; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 495; SI-NEXT: v_or_b32_e32 v0, v1, v0 496; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 497; SI-NEXT: s_endpgm 498; 499; VI-LABEL: maxnum_v2f16_imm_a: 500; VI: ; %bb.0: ; %entry 501; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 502; VI-NEXT: v_mov_b32_e32 v2, 0x4400 503; VI-NEXT: s_waitcnt lgkmcnt(0) 504; VI-NEXT: s_load_dword s4, s[2:3], 0x0 505; VI-NEXT: s_mov_b32 s3, 0xf000 506; VI-NEXT: s_mov_b32 s2, -1 507; VI-NEXT: s_waitcnt lgkmcnt(0) 508; VI-NEXT: v_max_f16_e64 v0, s4, s4 509; VI-NEXT: s_lshr_b32 s4, s4, 16 510; VI-NEXT: v_max_f16_e64 v1, s4, s4 511; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0 512; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 513; VI-NEXT: v_or_b32_e32 v0, v0, v1 514; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 515; VI-NEXT: s_endpgm 516; 517; GFX9-LABEL: maxnum_v2f16_imm_a: 518; GFX9: ; %bb.0: ; %entry 519; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 520; GFX9-NEXT: s_waitcnt lgkmcnt(0) 521; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 522; GFX9-NEXT: s_mov_b32 s3, 0xf000 523; GFX9-NEXT: s_mov_b32 s2, -1 524; GFX9-NEXT: s_waitcnt lgkmcnt(0) 525; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 526; GFX9-NEXT: s_mov_b32 s4, 0x44004200 527; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 528; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 529; GFX9-NEXT: s_endpgm 530; 531; GFX10-LABEL: maxnum_v2f16_imm_a: 532; GFX10: ; %bb.0: ; %entry 533; GFX10-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x24 534; GFX10-NEXT: s_waitcnt lgkmcnt(0) 535; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 536; GFX10-NEXT: s_mov_b32 s3, 0x31016000 537; GFX10-NEXT: s_waitcnt lgkmcnt(0) 538; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 539; GFX10-NEXT: s_mov_b32 s2, -1 540; GFX10-NEXT: v_pk_max_f16 v0, 0x44004200, v0 541; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 542; GFX10-NEXT: s_endpgm 543; 544; GFX11-LABEL: maxnum_v2f16_imm_a: 545; GFX11: ; %bb.0: ; %entry 546; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 547; GFX11-NEXT: s_waitcnt lgkmcnt(0) 548; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 549; GFX11-NEXT: s_mov_b32 s3, 0x31016000 550; GFX11-NEXT: s_waitcnt lgkmcnt(0) 551; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 552; GFX11-NEXT: s_mov_b32 s2, -1 553; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 554; GFX11-NEXT: v_pk_max_f16 v0, 0x44004200, v0 555; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 556; GFX11-NEXT: s_endpgm 557 ptr addrspace(1) %r, 558 ptr addrspace(1) %b) #0 { 559entry: 560 %b.val = load <2 x half>, ptr addrspace(1) %b 561 %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val) 562 store <2 x half> %r.val, ptr addrspace(1) %r 563 ret void 564} 565 566define amdgpu_kernel void @maxnum_v2f16_imm_b( 567; SI-LABEL: maxnum_v2f16_imm_b: 568; SI: ; %bb.0: ; %entry 569; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 570; SI-NEXT: s_waitcnt lgkmcnt(0) 571; SI-NEXT: s_load_dword s2, s[2:3], 0x0 572; SI-NEXT: s_waitcnt lgkmcnt(0) 573; SI-NEXT: s_lshr_b32 s3, s2, 16 574; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 575; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 576; SI-NEXT: s_mov_b32 s3, 0xf000 577; SI-NEXT: s_mov_b32 s2, -1 578; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 579; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 580; SI-NEXT: v_max_f32_e32 v1, 4.0, v1 581; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 582; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 583; SI-NEXT: v_or_b32_e32 v0, v1, v0 584; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 585; SI-NEXT: s_endpgm 586; 587; VI-LABEL: 
maxnum_v2f16_imm_b: 588; VI: ; %bb.0: ; %entry 589; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 590; VI-NEXT: v_mov_b32_e32 v2, 0x4200 591; VI-NEXT: s_waitcnt lgkmcnt(0) 592; VI-NEXT: s_load_dword s4, s[2:3], 0x0 593; VI-NEXT: s_mov_b32 s3, 0xf000 594; VI-NEXT: s_mov_b32 s2, -1 595; VI-NEXT: s_waitcnt lgkmcnt(0) 596; VI-NEXT: v_max_f16_e64 v0, s4, s4 597; VI-NEXT: s_lshr_b32 s4, s4, 16 598; VI-NEXT: v_max_f16_e64 v1, s4, s4 599; VI-NEXT: v_max_f16_e32 v0, 4.0, v0 600; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 601; VI-NEXT: v_or_b32_e32 v0, v0, v1 602; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 603; VI-NEXT: s_endpgm 604; 605; GFX9-LABEL: maxnum_v2f16_imm_b: 606; GFX9: ; %bb.0: ; %entry 607; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 608; GFX9-NEXT: s_waitcnt lgkmcnt(0) 609; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 610; GFX9-NEXT: s_mov_b32 s3, 0xf000 611; GFX9-NEXT: s_mov_b32 s2, -1 612; GFX9-NEXT: s_waitcnt lgkmcnt(0) 613; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 614; GFX9-NEXT: s_mov_b32 s4, 0x42004400 615; GFX9-NEXT: v_pk_max_f16 v0, v0, s4 616; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 617; GFX9-NEXT: s_endpgm 618; 619; GFX10-LABEL: maxnum_v2f16_imm_b: 620; GFX10: ; %bb.0: ; %entry 621; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 622; GFX10-NEXT: s_waitcnt lgkmcnt(0) 623; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 624; GFX10-NEXT: s_mov_b32 s3, 0x31016000 625; GFX10-NEXT: s_waitcnt lgkmcnt(0) 626; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 627; GFX10-NEXT: s_mov_b32 s2, -1 628; GFX10-NEXT: v_pk_max_f16 v0, 0x42004400, v0 629; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 630; GFX10-NEXT: s_endpgm 631; 632; GFX11-LABEL: maxnum_v2f16_imm_b: 633; GFX11: ; %bb.0: ; %entry 634; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 635; GFX11-NEXT: s_waitcnt lgkmcnt(0) 636; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 637; GFX11-NEXT: s_mov_b32 s3, 0x31016000 638; GFX11-NEXT: s_waitcnt lgkmcnt(0) 639; GFX11-NEXT: 
v_pk_max_f16 v0, s2, s2 640; GFX11-NEXT: s_mov_b32 s2, -1 641; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 642; GFX11-NEXT: v_pk_max_f16 v0, 0x42004400, v0 643; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 644; GFX11-NEXT: s_endpgm 645 ptr addrspace(1) %r, 646 ptr addrspace(1) %a) #0 { 647entry: 648 %a.val = load <2 x half>, ptr addrspace(1) %a 649 %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>) 650 store <2 x half> %r.val, ptr addrspace(1) %r 651 ret void 652} 653 654; FIXME: Scalarize with undef half 655define amdgpu_kernel void @maxnum_v3f16( 656; SI-LABEL: maxnum_v3f16: 657; SI: ; %bb.0: ; %entry 658; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 659; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 660; SI-NEXT: s_waitcnt lgkmcnt(0) 661; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 662; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 663; SI-NEXT: s_mov_b32 s3, 0xf000 664; SI-NEXT: s_mov_b32 s2, -1 665; SI-NEXT: s_waitcnt lgkmcnt(0) 666; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 667; SI-NEXT: s_lshr_b32 s7, s6, 16 668; SI-NEXT: s_lshr_b32 s8, s4, 16 669; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 670; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 671; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 672; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 673; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 674; SI-NEXT: v_max_f32_e32 v1, v1, v2 675; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 676; SI-NEXT: v_max_f32_e32 v2, v3, v4 677; SI-NEXT: v_max_f32_e32 v0, v0, v5 678; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 679; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 680; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 681; SI-NEXT: v_or_b32_e32 v1, v2, v1 682; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 683; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 684; SI-NEXT: s_endpgm 685; 686; VI-LABEL: maxnum_v3f16: 687; VI: ; %bb.0: ; %entry 688; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 689; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 690; VI-NEXT: s_mov_b32 s7, 0xf000 691; VI-NEXT: s_mov_b32 s6, -1 692; VI-NEXT: s_waitcnt 
lgkmcnt(0) 693; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 694; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 695; VI-NEXT: s_mov_b32 s4, s0 696; VI-NEXT: s_mov_b32 s5, s1 697; VI-NEXT: s_waitcnt lgkmcnt(0) 698; VI-NEXT: v_max_f16_e64 v0, s8, s8 699; VI-NEXT: v_max_f16_e64 v1, s2, s2 700; VI-NEXT: s_lshr_b32 s0, s8, 16 701; VI-NEXT: v_max_f16_e32 v0, v1, v0 702; VI-NEXT: v_max_f16_e64 v1, s0, s0 703; VI-NEXT: s_lshr_b32 s0, s2, 16 704; VI-NEXT: v_max_f16_e64 v2, s0, s0 705; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 706; VI-NEXT: v_or_b32_e32 v0, v0, v1 707; VI-NEXT: v_max_f16_e64 v1, s9, s9 708; VI-NEXT: v_max_f16_e64 v2, s3, s3 709; VI-NEXT: v_max_f16_e32 v1, v2, v1 710; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 711; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 712; VI-NEXT: s_endpgm 713; 714; GFX9-LABEL: maxnum_v3f16: 715; GFX9: ; %bb.0: ; %entry 716; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 717; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 718; GFX9-NEXT: s_mov_b32 s7, 0xf000 719; GFX9-NEXT: s_mov_b32 s6, -1 720; GFX9-NEXT: s_waitcnt lgkmcnt(0) 721; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 722; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 723; GFX9-NEXT: s_mov_b32 s4, s0 724; GFX9-NEXT: s_mov_b32 s5, s1 725; GFX9-NEXT: s_waitcnt lgkmcnt(0) 726; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 727; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 728; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 729; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 730; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 731; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 732; GFX9-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 733; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 734; GFX9-NEXT: s_endpgm 735; 736; GFX10-LABEL: maxnum_v3f16: 737; GFX10: ; %bb.0: ; %entry 738; GFX10-NEXT: s_clause 0x1 739; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 740; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 741; GFX10-NEXT: s_waitcnt lgkmcnt(0) 742; 
GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 743; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 744; GFX10-NEXT: s_mov_b32 s3, 0x31016000 745; GFX10-NEXT: s_mov_b32 s2, -1 746; GFX10-NEXT: s_waitcnt lgkmcnt(0) 747; GFX10-NEXT: v_pk_max_f16 v1, s5, s5 748; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 749; GFX10-NEXT: v_pk_max_f16 v0, s4, s4 750; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 751; GFX10-NEXT: v_pk_max_f16 v1, v2, v1 752; GFX10-NEXT: v_pk_max_f16 v0, v3, v0 753; GFX10-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 754; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 755; GFX10-NEXT: s_endpgm 756; 757; GFX11-LABEL: maxnum_v3f16: 758; GFX11: ; %bb.0: ; %entry 759; GFX11-NEXT: s_clause 0x1 760; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 761; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 762; GFX11-NEXT: s_waitcnt lgkmcnt(0) 763; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 764; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 765; GFX11-NEXT: s_waitcnt lgkmcnt(0) 766; GFX11-NEXT: v_pk_max_f16 v1, s5, s5 767; GFX11-NEXT: v_pk_max_f16 v2, s3, s3 768; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 769; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 770; GFX11-NEXT: s_mov_b32 s3, 0x31016000 771; GFX11-NEXT: s_mov_b32 s2, -1 772; GFX11-NEXT: v_pk_max_f16 v1, v2, v1 773; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 774; GFX11-NEXT: v_pk_max_f16 v0, v3, v0 775; GFX11-NEXT: s_clause 0x1 776; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4 777; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 778; GFX11-NEXT: s_endpgm 779 ptr addrspace(1) %r, 780 ptr addrspace(1) %a, 781 ptr addrspace(1) %b) #0 { 782entry: 783 %a.val = load <3 x half>, ptr addrspace(1) %a 784 %b.val = load <3 x half>, ptr addrspace(1) %b 785 %r.val = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %a.val, <3 x half> %b.val) 786 store <3 x half> %r.val, ptr addrspace(1) %r 787 ret void 788} 789 790define amdgpu_kernel void @maxnum_v4f16( 791; SI-LABEL: maxnum_v4f16: 792; SI: ; %bb.0: ; %entry 793; SI-NEXT: s_load_dwordx4 s[8:11], 
s[4:5], 0x9 794; SI-NEXT: s_mov_b32 s3, 0xf000 795; SI-NEXT: s_mov_b32 s2, -1 796; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 797; SI-NEXT: s_waitcnt lgkmcnt(0) 798; SI-NEXT: s_load_dwordx2 s[6:7], s[10:11], 0x0 799; SI-NEXT: s_mov_b32 s0, s8 800; SI-NEXT: s_mov_b32 s1, s9 801; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 802; SI-NEXT: s_waitcnt lgkmcnt(0) 803; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 804; SI-NEXT: s_lshr_b32 s6, s6, 16 805; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 806; SI-NEXT: s_lshr_b32 s6, s7, 16 807; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 808; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 809; SI-NEXT: s_lshr_b32 s6, s5, 16 810; SI-NEXT: s_lshr_b32 s4, s4, 16 811; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 812; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 813; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 814; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 815; SI-NEXT: v_max_f32_e32 v3, v3, v5 816; SI-NEXT: v_max_f32_e32 v2, v2, v7 817; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 818; SI-NEXT: v_max_f32_e32 v1, v1, v6 819; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 820; SI-NEXT: v_max_f32_e32 v0, v0, v4 821; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 822; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 823; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 824; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 825; SI-NEXT: v_or_b32_e32 v1, v1, v3 826; SI-NEXT: v_or_b32_e32 v0, v0, v2 827; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 828; SI-NEXT: s_endpgm 829; 830; VI-LABEL: maxnum_v4f16: 831; VI: ; %bb.0: ; %entry 832; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 833; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 834; VI-NEXT: s_mov_b32 s7, 0xf000 835; VI-NEXT: s_mov_b32 s6, -1 836; VI-NEXT: s_waitcnt lgkmcnt(0) 837; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 838; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 839; VI-NEXT: s_mov_b32 s4, s0 840; VI-NEXT: s_mov_b32 s5, s1 841; VI-NEXT: s_waitcnt lgkmcnt(0) 842; VI-NEXT: v_max_f16_e64 v0, s9, s9 843; VI-NEXT: v_max_f16_e64 v1, s3, s3 844; VI-NEXT: s_lshr_b32 s0, s9, 16 845; VI-NEXT: v_max_f16_e32 v0, v1, v0 846; VI-NEXT: 
v_max_f16_e64 v1, s0, s0 847; VI-NEXT: s_lshr_b32 s0, s3, 16 848; VI-NEXT: v_max_f16_e64 v2, s0, s0 849; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 850; VI-NEXT: v_or_b32_e32 v1, v0, v1 851; VI-NEXT: v_max_f16_e64 v0, s8, s8 852; VI-NEXT: v_max_f16_e64 v2, s2, s2 853; VI-NEXT: s_lshr_b32 s0, s8, 16 854; VI-NEXT: v_max_f16_e32 v0, v2, v0 855; VI-NEXT: v_max_f16_e64 v2, s0, s0 856; VI-NEXT: s_lshr_b32 s0, s2, 16 857; VI-NEXT: v_max_f16_e64 v3, s0, s0 858; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 859; VI-NEXT: v_or_b32_e32 v0, v0, v2 860; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 861; VI-NEXT: s_endpgm 862; 863; GFX9-LABEL: maxnum_v4f16: 864; GFX9: ; %bb.0: ; %entry 865; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 866; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 867; GFX9-NEXT: s_mov_b32 s7, 0xf000 868; GFX9-NEXT: s_mov_b32 s6, -1 869; GFX9-NEXT: s_waitcnt lgkmcnt(0) 870; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 871; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 872; GFX9-NEXT: s_mov_b32 s4, s0 873; GFX9-NEXT: s_mov_b32 s5, s1 874; GFX9-NEXT: s_waitcnt lgkmcnt(0) 875; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 876; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 877; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 878; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 879; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 880; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 881; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 882; GFX9-NEXT: s_endpgm 883; 884; GFX10-LABEL: maxnum_v4f16: 885; GFX10: ; %bb.0: ; %entry 886; GFX10-NEXT: s_clause 0x1 887; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 888; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 889; GFX10-NEXT: s_waitcnt lgkmcnt(0) 890; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 891; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 892; GFX10-NEXT: s_mov_b32 s3, 0x31016000 893; GFX10-NEXT: s_mov_b32 s2, -1 894; GFX10-NEXT: 
s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_max_f16 v0, s5, s5
; GFX10-NEXT: v_pk_max_f16 v1, s9, s9
; GFX10-NEXT: v_pk_max_f16 v2, s4, s4
; GFX10-NEXT: v_pk_max_f16 v3, s8, s8
; GFX10-NEXT: v_pk_max_f16 v1, v1, v0
; GFX10-NEXT: v_pk_max_f16 v0, v3, v2
; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: maxnum_v4f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v0, s5, s5
; GFX11-NEXT: v_pk_max_f16 v1, s3, s3
; GFX11-NEXT: v_pk_max_f16 v2, s4, s4
; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_pk_max_f16 v1, v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v0, v3, v2
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) #0 {
entry:
  %a.val = load <4 x half>, ptr addrspace(1) %a
  %b.val = load <4 x half>, ptr addrspace(1) %b
  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
  store <4 x half> %r.val, ptr addrspace(1) %r
  ret void
}

; fmax_v4f16_imm_a: llvm.maxnum.v4f16 with a constant first operand.
; The kernel loads a <4 x half> from %b, takes the element-wise maxnum against
; the constant <8.0, 2.0, 3.0, 4.0>, and stores the result to %r.
;
; How the constant shows up in the expected output for each RUN line:
;   * tahiti  - each half is converted to f32 and v_max_f32 is used, so the
;               constants appear as f32 values (0x41000000 = 8.0,
;               0x40400000 = 3.0, plus the inline 2.0 and 4.0).
;   * fiji    - one v_max_f16 per element with f16 literals (0x4800 = 8.0,
;               0x4000 = 2.0, 0x4200 = 3.0, 0x4400 = 4.0); the two high-half
;               constants are first moved into VGPRs for the SDWA forms.
;   * gfx900, gfx1010, gfx1100 - v_pk_max_f16 with two constants packed per
;               dword: 0x40004800 covers elements 0-1 and 0x44004200 covers
;               elements 2-3.
;
; The check lines between the define line and the parameter list are
; autogenerated (see the NOTE on the first line of this file); regenerate them
; with the update script rather than editing them by hand.
define amdgpu_kernel void @fmax_v4f16_imm_a(
; SI-LABEL: fmax_v4f16_imm_a:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
; SI-NEXT: s_lshr_b32 s5, s5, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
; SI-NEXT: s_lshr_b32 s4, s4, 16
; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1
; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0
; SI-NEXT: v_max_f32_e32 v2, 4.0, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_max_f32_e32 v3, 2.0, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fmax_v4f16_imm_a:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x4400
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s0, s3, 16
; VI-NEXT: v_max_f16_e64 v1, s3, s3
; VI-NEXT: v_max_f16_e64 v3, s0, s0
; VI-NEXT: v_max_f16_e64 v2, s2, s2
; VI-NEXT: v_max_f16_e32 v1, 0x4200, v1
; VI-NEXT: v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_lshr_b32 s0, s2, 16
; VI-NEXT: v_or_b32_e32 v1, v1, v0
; VI-NEXT: v_max_f16_e32 v0, 0x4800, v2
; VI-NEXT: v_max_f16_e64 v2, s0, s0
; VI-NEXT: v_mov_b32_e32 v3, 0x4000
; VI-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmax_v4f16_imm_a:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_mov_b32 s8, 0x44004200
; GFX9-NEXT: s_mov_b32 s9, 0x40004800
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v0, s3, s3
; GFX9-NEXT: v_pk_max_f16 v2, s2, s2
; GFX9-NEXT: v_pk_max_f16 v1, v0, s8
; GFX9-NEXT: v_pk_max_f16 v0, v2, s9
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: fmax_v4f16_imm_a:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_max_f16 v0, s3, s3
; GFX10-NEXT: v_pk_max_f16 v2, s2, s2
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: v_pk_max_f16 v1, 0x44004200, v0
; GFX10-NEXT: v_pk_max_f16 v0, 0x40004800, v2
; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fmax_v4f16_imm_a:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v0, s3, s3
; GFX11-NEXT: v_pk_max_f16 v2, s2, s2
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v1, 0x44004200, v0
; GFX11-NEXT: v_pk_max_f16 v0, 0x40004800, v2
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %b) #0 {
entry:
  %b.val = load <4 x half>, ptr addrspace(1) %b
  ; The constant vector is the first maxnum operand; the loaded value is the second.
  %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
  store <4 x half> %r.val, ptr addrspace(1) %r
  ret void
}

; Attribute group referenced as #0 by the kernels above: sets the f32 denormal
; mode string to "preserve-sign,preserve-sign".
attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }