1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s 3; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s 4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s 6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11 %s 7 8declare half @llvm.minnum.f16(half %a, half %b) 9declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) 10declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b) 11declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b) 12 13define amdgpu_kernel void @minnum_f16_ieee( 14; SI-LABEL: minnum_f16_ieee: 15; SI: ; %bb.0: ; %entry 16; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 17; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 18; SI-NEXT: s_mov_b32 s7, 0xf000 19; SI-NEXT: s_mov_b32 s6, -1 20; SI-NEXT: s_mov_b32 s14, s6 21; SI-NEXT: s_waitcnt lgkmcnt(0) 22; SI-NEXT: s_mov_b32 s12, s2 23; SI-NEXT: s_mov_b32 s13, s3 24; SI-NEXT: s_mov_b32 s15, s7 25; SI-NEXT: s_mov_b32 s10, s6 26; SI-NEXT: s_mov_b32 s11, s7 27; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 28; SI-NEXT: s_waitcnt vmcnt(0) 29; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 30; SI-NEXT: s_waitcnt vmcnt(0) 31; SI-NEXT: s_mov_b32 s4, s0 32; SI-NEXT: s_mov_b32 s5, s1 33; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 34; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 35; SI-NEXT: v_min_f32_e32 v0, v0, v1 36; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 37; SI-NEXT: 
buffer_store_short v0, off, s[4:7], 0 38; SI-NEXT: s_endpgm 39; 40; VI-LABEL: minnum_f16_ieee: 41; VI: ; %bb.0: ; %entry 42; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 43; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 44; VI-NEXT: s_mov_b32 s7, 0xf000 45; VI-NEXT: s_mov_b32 s6, -1 46; VI-NEXT: s_mov_b32 s14, s6 47; VI-NEXT: s_waitcnt lgkmcnt(0) 48; VI-NEXT: s_mov_b32 s12, s2 49; VI-NEXT: s_mov_b32 s13, s3 50; VI-NEXT: s_mov_b32 s15, s7 51; VI-NEXT: s_mov_b32 s10, s6 52; VI-NEXT: s_mov_b32 s11, s7 53; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 54; VI-NEXT: s_waitcnt vmcnt(0) 55; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 56; VI-NEXT: s_waitcnt vmcnt(0) 57; VI-NEXT: s_mov_b32 s4, s0 58; VI-NEXT: s_mov_b32 s5, s1 59; VI-NEXT: v_max_f16_e32 v0, v0, v0 60; VI-NEXT: v_max_f16_e32 v1, v1, v1 61; VI-NEXT: v_min_f16_e32 v0, v0, v1 62; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 63; VI-NEXT: s_endpgm 64; 65; GFX9-LABEL: minnum_f16_ieee: 66; GFX9: ; %bb.0: ; %entry 67; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 68; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 69; GFX9-NEXT: s_mov_b32 s7, 0xf000 70; GFX9-NEXT: s_mov_b32 s6, -1 71; GFX9-NEXT: s_mov_b32 s14, s6 72; GFX9-NEXT: s_waitcnt lgkmcnt(0) 73; GFX9-NEXT: s_mov_b32 s12, s2 74; GFX9-NEXT: s_mov_b32 s13, s3 75; GFX9-NEXT: s_mov_b32 s15, s7 76; GFX9-NEXT: s_mov_b32 s10, s6 77; GFX9-NEXT: s_mov_b32 s11, s7 78; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 79; GFX9-NEXT: s_waitcnt vmcnt(0) 80; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 81; GFX9-NEXT: s_waitcnt vmcnt(0) 82; GFX9-NEXT: s_mov_b32 s4, s0 83; GFX9-NEXT: s_mov_b32 s5, s1 84; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 85; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 86; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 87; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 88; GFX9-NEXT: s_endpgm 89; 90; GFX10-LABEL: minnum_f16_ieee: 91; GFX10: ; %bb.0: ; %entry 92; GFX10-NEXT: s_clause 0x1 93; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 94; 
GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 95; GFX10-NEXT: s_mov_b32 s6, -1 96; GFX10-NEXT: s_mov_b32 s7, 0x31016000 97; GFX10-NEXT: s_mov_b32 s14, s6 98; GFX10-NEXT: s_mov_b32 s15, s7 99; GFX10-NEXT: s_mov_b32 s10, s6 100; GFX10-NEXT: s_mov_b32 s11, s7 101; GFX10-NEXT: s_waitcnt lgkmcnt(0) 102; GFX10-NEXT: s_mov_b32 s12, s2 103; GFX10-NEXT: s_mov_b32 s13, s3 104; GFX10-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc dlc 105; GFX10-NEXT: s_waitcnt vmcnt(0) 106; GFX10-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc dlc 107; GFX10-NEXT: s_waitcnt vmcnt(0) 108; GFX10-NEXT: s_mov_b32 s4, s0 109; GFX10-NEXT: s_mov_b32 s5, s1 110; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 111; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 112; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 113; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 114; GFX10-NEXT: s_endpgm 115; 116; GFX11-LABEL: minnum_f16_ieee: 117; GFX11: ; %bb.0: ; %entry 118; GFX11-NEXT: s_clause 0x1 119; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 120; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 121; GFX11-NEXT: s_mov_b32 s10, -1 122; GFX11-NEXT: s_mov_b32 s11, 0x31016000 123; GFX11-NEXT: s_mov_b32 s14, s10 124; GFX11-NEXT: s_mov_b32 s15, s11 125; GFX11-NEXT: s_mov_b32 s6, s10 126; GFX11-NEXT: s_mov_b32 s7, s11 127; GFX11-NEXT: s_waitcnt lgkmcnt(0) 128; GFX11-NEXT: s_mov_b32 s12, s2 129; GFX11-NEXT: s_mov_b32 s13, s3 130; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 131; GFX11-NEXT: s_waitcnt vmcnt(0) 132; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 133; GFX11-NEXT: s_waitcnt vmcnt(0) 134; GFX11-NEXT: s_mov_b32 s8, s0 135; GFX11-NEXT: s_mov_b32 s9, s1 136; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 137; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 138; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 139; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 140; GFX11-NEXT: s_endpgm 141 ptr addrspace(1) %r, 142 ptr addrspace(1) %a, 143 ptr addrspace(1) %b) #0 { 144entry: 145 %a.val = load volatile half, ptr addrspace(1) %a 146 %b.val = load 
volatile half, ptr addrspace(1) %b 147 %r.val = call half @llvm.minnum.f16(half %a.val, half %b.val) 148 store half %r.val, ptr addrspace(1) %r 149 ret void 150} 151 152define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 { 153; SI-LABEL: minnum_f16_no_ieee: 154; SI: ; %bb.0: 155; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 156; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 157; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 158; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 159; SI-NEXT: v_min_f32_e32 v0, v0, v1 160; SI-NEXT: ; return to shader part epilog 161; 162; VI-LABEL: minnum_f16_no_ieee: 163; VI: ; %bb.0: 164; VI-NEXT: v_min_f16_e32 v0, v0, v1 165; VI-NEXT: ; return to shader part epilog 166; 167; GFX9-LABEL: minnum_f16_no_ieee: 168; GFX9: ; %bb.0: 169; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 170; GFX9-NEXT: ; return to shader part epilog 171; 172; GFX10PLUS-LABEL: minnum_f16_no_ieee: 173; GFX10PLUS: ; %bb.0: 174; GFX10PLUS-NEXT: v_min_f16_e32 v0, v0, v1 175; GFX10PLUS-NEXT: ; return to shader part epilog 176 %r.val = call half @llvm.minnum.f16(half %a, half %b) 177 ret half %r.val 178} 179 180define amdgpu_kernel void @minnum_f16_imm_a( 181; SI-LABEL: minnum_f16_imm_a: 182; SI: ; %bb.0: ; %entry 183; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 184; SI-NEXT: s_mov_b32 s7, 0xf000 185; SI-NEXT: s_mov_b32 s6, -1 186; SI-NEXT: s_mov_b32 s10, s6 187; SI-NEXT: s_mov_b32 s11, s7 188; SI-NEXT: s_waitcnt lgkmcnt(0) 189; SI-NEXT: s_mov_b32 s8, s2 190; SI-NEXT: s_mov_b32 s9, s3 191; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 192; SI-NEXT: s_mov_b32 s4, s0 193; SI-NEXT: s_mov_b32 s5, s1 194; SI-NEXT: s_waitcnt vmcnt(0) 195; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 196; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0 197; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 198; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 199; SI-NEXT: s_endpgm 200; 201; VI-LABEL: minnum_f16_imm_a: 202; VI: ; %bb.0: ; %entry 203; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 204; VI-NEXT: s_mov_b32 s7, 0xf000 205; VI-NEXT: s_mov_b32 s6, -1 206; 
VI-NEXT: s_mov_b32 s10, s6 207; VI-NEXT: s_mov_b32 s11, s7 208; VI-NEXT: s_waitcnt lgkmcnt(0) 209; VI-NEXT: s_mov_b32 s8, s2 210; VI-NEXT: s_mov_b32 s9, s3 211; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 212; VI-NEXT: s_mov_b32 s4, s0 213; VI-NEXT: s_mov_b32 s5, s1 214; VI-NEXT: s_waitcnt vmcnt(0) 215; VI-NEXT: v_max_f16_e32 v0, v0, v0 216; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0 217; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 218; VI-NEXT: s_endpgm 219; 220; GFX9-LABEL: minnum_f16_imm_a: 221; GFX9: ; %bb.0: ; %entry 222; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 223; GFX9-NEXT: s_mov_b32 s7, 0xf000 224; GFX9-NEXT: s_mov_b32 s6, -1 225; GFX9-NEXT: s_mov_b32 s10, s6 226; GFX9-NEXT: s_mov_b32 s11, s7 227; GFX9-NEXT: s_waitcnt lgkmcnt(0) 228; GFX9-NEXT: s_mov_b32 s8, s2 229; GFX9-NEXT: s_mov_b32 s9, s3 230; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 231; GFX9-NEXT: s_mov_b32 s4, s0 232; GFX9-NEXT: s_mov_b32 s5, s1 233; GFX9-NEXT: s_waitcnt vmcnt(0) 234; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 235; GFX9-NEXT: v_min_f16_e32 v0, 0x4200, v0 236; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 237; GFX9-NEXT: s_endpgm 238; 239; GFX10-LABEL: minnum_f16_imm_a: 240; GFX10: ; %bb.0: ; %entry 241; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 242; GFX10-NEXT: s_mov_b32 s6, -1 243; GFX10-NEXT: s_mov_b32 s7, 0x31016000 244; GFX10-NEXT: s_mov_b32 s10, s6 245; GFX10-NEXT: s_mov_b32 s11, s7 246; GFX10-NEXT: s_waitcnt lgkmcnt(0) 247; GFX10-NEXT: s_mov_b32 s8, s2 248; GFX10-NEXT: s_mov_b32 s9, s3 249; GFX10-NEXT: s_mov_b32 s4, s0 250; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 251; GFX10-NEXT: s_mov_b32 s5, s1 252; GFX10-NEXT: s_waitcnt vmcnt(0) 253; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 254; GFX10-NEXT: v_min_f16_e32 v0, 0x4200, v0 255; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 256; GFX10-NEXT: s_endpgm 257; 258; GFX11-LABEL: minnum_f16_imm_a: 259; GFX11: ; %bb.0: ; %entry 260; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 261; GFX11-NEXT: s_mov_b32 
s6, -1 262; GFX11-NEXT: s_mov_b32 s7, 0x31016000 263; GFX11-NEXT: s_mov_b32 s10, s6 264; GFX11-NEXT: s_mov_b32 s11, s7 265; GFX11-NEXT: s_waitcnt lgkmcnt(0) 266; GFX11-NEXT: s_mov_b32 s8, s2 267; GFX11-NEXT: s_mov_b32 s9, s3 268; GFX11-NEXT: s_mov_b32 s4, s0 269; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 270; GFX11-NEXT: s_mov_b32 s5, s1 271; GFX11-NEXT: s_waitcnt vmcnt(0) 272; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 273; GFX11-NEXT: v_min_f16_e32 v0, 0x4200, v0 274; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 275; GFX11-NEXT: s_endpgm 276 ptr addrspace(1) %r, 277 ptr addrspace(1) %b) #0 { 278entry: 279 %b.val = load half, ptr addrspace(1) %b 280 %r.val = call half @llvm.minnum.f16(half 3.0, half %b.val) 281 store half %r.val, ptr addrspace(1) %r 282 ret void 283} 284 285define amdgpu_kernel void @minnum_f16_imm_b( 286; SI-LABEL: minnum_f16_imm_b: 287; SI: ; %bb.0: ; %entry 288; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 289; SI-NEXT: s_mov_b32 s7, 0xf000 290; SI-NEXT: s_mov_b32 s6, -1 291; SI-NEXT: s_mov_b32 s10, s6 292; SI-NEXT: s_mov_b32 s11, s7 293; SI-NEXT: s_waitcnt lgkmcnt(0) 294; SI-NEXT: s_mov_b32 s8, s2 295; SI-NEXT: s_mov_b32 s9, s3 296; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 297; SI-NEXT: s_mov_b32 s4, s0 298; SI-NEXT: s_mov_b32 s5, s1 299; SI-NEXT: s_waitcnt vmcnt(0) 300; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 301; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 302; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 303; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 304; SI-NEXT: s_endpgm 305; 306; VI-LABEL: minnum_f16_imm_b: 307; VI: ; %bb.0: ; %entry 308; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 309; VI-NEXT: s_mov_b32 s7, 0xf000 310; VI-NEXT: s_mov_b32 s6, -1 311; VI-NEXT: s_mov_b32 s10, s6 312; VI-NEXT: s_mov_b32 s11, s7 313; VI-NEXT: s_waitcnt lgkmcnt(0) 314; VI-NEXT: s_mov_b32 s8, s2 315; VI-NEXT: s_mov_b32 s9, s3 316; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 317; VI-NEXT: s_mov_b32 s4, s0 318; VI-NEXT: s_mov_b32 s5, s1 319; VI-NEXT: s_waitcnt 
vmcnt(0) 320; VI-NEXT: v_max_f16_e32 v0, v0, v0 321; VI-NEXT: v_min_f16_e32 v0, 4.0, v0 322; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 323; VI-NEXT: s_endpgm 324; 325; GFX9-LABEL: minnum_f16_imm_b: 326; GFX9: ; %bb.0: ; %entry 327; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 328; GFX9-NEXT: s_mov_b32 s7, 0xf000 329; GFX9-NEXT: s_mov_b32 s6, -1 330; GFX9-NEXT: s_mov_b32 s10, s6 331; GFX9-NEXT: s_mov_b32 s11, s7 332; GFX9-NEXT: s_waitcnt lgkmcnt(0) 333; GFX9-NEXT: s_mov_b32 s8, s2 334; GFX9-NEXT: s_mov_b32 s9, s3 335; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0 336; GFX9-NEXT: s_mov_b32 s4, s0 337; GFX9-NEXT: s_mov_b32 s5, s1 338; GFX9-NEXT: s_waitcnt vmcnt(0) 339; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 340; GFX9-NEXT: v_min_f16_e32 v0, 4.0, v0 341; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 342; GFX9-NEXT: s_endpgm 343; 344; GFX10-LABEL: minnum_f16_imm_b: 345; GFX10: ; %bb.0: ; %entry 346; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 347; GFX10-NEXT: s_mov_b32 s6, -1 348; GFX10-NEXT: s_mov_b32 s7, 0x31016000 349; GFX10-NEXT: s_mov_b32 s10, s6 350; GFX10-NEXT: s_mov_b32 s11, s7 351; GFX10-NEXT: s_waitcnt lgkmcnt(0) 352; GFX10-NEXT: s_mov_b32 s8, s2 353; GFX10-NEXT: s_mov_b32 s9, s3 354; GFX10-NEXT: s_mov_b32 s4, s0 355; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 356; GFX10-NEXT: s_mov_b32 s5, s1 357; GFX10-NEXT: s_waitcnt vmcnt(0) 358; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 359; GFX10-NEXT: v_min_f16_e32 v0, 4.0, v0 360; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 361; GFX10-NEXT: s_endpgm 362; 363; GFX11-LABEL: minnum_f16_imm_b: 364; GFX11: ; %bb.0: ; %entry 365; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 366; GFX11-NEXT: s_mov_b32 s6, -1 367; GFX11-NEXT: s_mov_b32 s7, 0x31016000 368; GFX11-NEXT: s_mov_b32 s10, s6 369; GFX11-NEXT: s_mov_b32 s11, s7 370; GFX11-NEXT: s_waitcnt lgkmcnt(0) 371; GFX11-NEXT: s_mov_b32 s8, s2 372; GFX11-NEXT: s_mov_b32 s9, s3 373; GFX11-NEXT: s_mov_b32 s4, s0 374; GFX11-NEXT: buffer_load_u16 v0, off, 
s[8:11], 0 375; GFX11-NEXT: s_mov_b32 s5, s1 376; GFX11-NEXT: s_waitcnt vmcnt(0) 377; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 378; GFX11-NEXT: v_min_f16_e32 v0, 4.0, v0 379; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 380; GFX11-NEXT: s_endpgm 381 ptr addrspace(1) %r, 382 ptr addrspace(1) %a) #0 { 383entry: 384 %a.val = load half, ptr addrspace(1) %a 385 %r.val = call half @llvm.minnum.f16(half %a.val, half 4.0) 386 store half %r.val, ptr addrspace(1) %r 387 ret void 388} 389 390define amdgpu_kernel void @minnum_v2f16_ieee( 391; SI-LABEL: minnum_v2f16_ieee: 392; SI: ; %bb.0: ; %entry 393; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 394; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 395; SI-NEXT: s_waitcnt lgkmcnt(0) 396; SI-NEXT: s_load_dword s2, s[2:3], 0x0 397; SI-NEXT: s_load_dword s4, s[4:5], 0x0 398; SI-NEXT: s_mov_b32 s3, 0xf000 399; SI-NEXT: s_waitcnt lgkmcnt(0) 400; SI-NEXT: s_lshr_b32 s5, s2, 16 401; SI-NEXT: s_lshr_b32 s6, s4, 16 402; SI-NEXT: v_cvt_f32_f16_e32 v0, s5 403; SI-NEXT: v_cvt_f32_f16_e32 v1, s6 404; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 405; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 406; SI-NEXT: s_mov_b32 s2, -1 407; SI-NEXT: v_min_f32_e32 v0, v0, v1 408; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 409; SI-NEXT: v_min_f32_e32 v1, v2, v3 410; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 411; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 412; SI-NEXT: v_or_b32_e32 v0, v1, v0 413; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 414; SI-NEXT: s_endpgm 415; 416; VI-LABEL: minnum_v2f16_ieee: 417; VI: ; %bb.0: ; %entry 418; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 419; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 420; VI-NEXT: s_mov_b32 s7, 0xf000 421; VI-NEXT: s_mov_b32 s6, -1 422; VI-NEXT: s_waitcnt lgkmcnt(0) 423; VI-NEXT: s_load_dword s8, s[8:9], 0x0 424; VI-NEXT: s_load_dword s2, s[2:3], 0x0 425; VI-NEXT: s_mov_b32 s4, s0 426; VI-NEXT: s_mov_b32 s5, s1 427; VI-NEXT: s_waitcnt lgkmcnt(0) 428; VI-NEXT: v_max_f16_e64 v0, s8, s8 429; VI-NEXT: v_max_f16_e64 v1, s2, s2 430; VI-NEXT: 
s_lshr_b32 s0, s8, 16 431; VI-NEXT: v_min_f16_e32 v0, v1, v0 432; VI-NEXT: v_max_f16_e64 v1, s0, s0 433; VI-NEXT: s_lshr_b32 s0, s2, 16 434; VI-NEXT: v_max_f16_e64 v2, s0, s0 435; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 436; VI-NEXT: v_or_b32_e32 v0, v0, v1 437; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 438; VI-NEXT: s_endpgm 439; 440; GFX9-LABEL: minnum_v2f16_ieee: 441; GFX9: ; %bb.0: ; %entry 442; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 443; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 444; GFX9-NEXT: s_mov_b32 s7, 0xf000 445; GFX9-NEXT: s_mov_b32 s6, -1 446; GFX9-NEXT: s_waitcnt lgkmcnt(0) 447; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 448; GFX9-NEXT: s_load_dword s11, s[2:3], 0x0 449; GFX9-NEXT: s_mov_b32 s4, s0 450; GFX9-NEXT: s_mov_b32 s5, s1 451; GFX9-NEXT: s_waitcnt lgkmcnt(0) 452; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 453; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 454; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 455; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 456; GFX9-NEXT: s_endpgm 457; 458; GFX10-LABEL: minnum_v2f16_ieee: 459; GFX10: ; %bb.0: ; %entry 460; GFX10-NEXT: s_clause 0x1 461; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 462; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 463; GFX10-NEXT: s_waitcnt lgkmcnt(0) 464; GFX10-NEXT: s_load_dword s4, s[6:7], 0x0 465; GFX10-NEXT: s_load_dword s5, s[2:3], 0x0 466; GFX10-NEXT: s_mov_b32 s3, 0x31016000 467; GFX10-NEXT: s_mov_b32 s2, -1 468; GFX10-NEXT: s_waitcnt lgkmcnt(0) 469; GFX10-NEXT: v_pk_max_f16 v0, s4, s4 470; GFX10-NEXT: v_pk_max_f16 v1, s5, s5 471; GFX10-NEXT: v_pk_min_f16 v0, v1, v0 472; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 473; GFX10-NEXT: s_endpgm 474; 475; GFX11-LABEL: minnum_v2f16_ieee: 476; GFX11: ; %bb.0: ; %entry 477; GFX11-NEXT: s_clause 0x1 478; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 479; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 480; GFX11-NEXT: s_waitcnt lgkmcnt(0) 481; GFX11-NEXT: s_load_b32 
s4, s[6:7], 0x0 482; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 483; GFX11-NEXT: s_mov_b32 s3, 0x31016000 484; GFX11-NEXT: s_waitcnt lgkmcnt(0) 485; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 486; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 487; GFX11-NEXT: s_mov_b32 s2, -1 488; GFX11-NEXT: v_pk_min_f16 v0, v1, v0 489; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 490; GFX11-NEXT: s_endpgm 491 ptr addrspace(1) %r, 492 ptr addrspace(1) %a, 493 ptr addrspace(1) %b) #0 { 494entry: 495 %a.val = load <2 x half>, ptr addrspace(1) %a 496 %b.val = load <2 x half>, ptr addrspace(1) %b 497 %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> %b.val) 498 store <2 x half> %r.val, ptr addrspace(1) %r 499 ret void 500} 501 502define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) #0 { 503; SI-LABEL: minnum_v2f16_no_ieee: 504; SI: ; %bb.0: 505; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 506; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 507; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 508; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 509; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 510; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 511; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 512; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 513; SI-NEXT: v_min_f32_e32 v0, v0, v2 514; SI-NEXT: v_min_f32_e32 v1, v1, v3 515; SI-NEXT: ; return to shader part epilog 516; 517; VI-LABEL: minnum_v2f16_no_ieee: 518; VI: ; %bb.0: 519; VI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 520; VI-NEXT: v_min_f16_e32 v0, v0, v1 521; VI-NEXT: v_or_b32_e32 v0, v0, v2 522; VI-NEXT: ; return to shader part epilog 523; 524; GFX9-LABEL: minnum_v2f16_no_ieee: 525; GFX9: ; %bb.0: 526; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 527; GFX9-NEXT: ; return to shader part epilog 528; 529; GFX10PLUS-LABEL: minnum_v2f16_no_ieee: 530; GFX10PLUS: ; %bb.0: 531; GFX10PLUS-NEXT: v_pk_min_f16 v0, v0, v1 532; GFX10PLUS-NEXT: ; return to shader part epilog 533 %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) 534 ret 
<2 x half> %r.val 535} 536 537define amdgpu_kernel void @minnum_v2f16_imm_a( 538; SI-LABEL: minnum_v2f16_imm_a: 539; SI: ; %bb.0: ; %entry 540; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 541; SI-NEXT: s_waitcnt lgkmcnt(0) 542; SI-NEXT: s_load_dword s2, s[2:3], 0x0 543; SI-NEXT: s_waitcnt lgkmcnt(0) 544; SI-NEXT: s_lshr_b32 s3, s2, 16 545; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 546; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 547; SI-NEXT: s_mov_b32 s3, 0xf000 548; SI-NEXT: s_mov_b32 s2, -1 549; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 550; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 551; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1 552; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 553; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 554; SI-NEXT: v_or_b32_e32 v0, v1, v0 555; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 556; SI-NEXT: s_endpgm 557; 558; VI-LABEL: minnum_v2f16_imm_a: 559; VI: ; %bb.0: ; %entry 560; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 561; VI-NEXT: v_mov_b32_e32 v2, 0x4400 562; VI-NEXT: s_waitcnt lgkmcnt(0) 563; VI-NEXT: s_load_dword s4, s[2:3], 0x0 564; VI-NEXT: s_mov_b32 s3, 0xf000 565; VI-NEXT: s_mov_b32 s2, -1 566; VI-NEXT: s_waitcnt lgkmcnt(0) 567; VI-NEXT: v_max_f16_e64 v0, s4, s4 568; VI-NEXT: s_lshr_b32 s4, s4, 16 569; VI-NEXT: v_max_f16_e64 v1, s4, s4 570; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0 571; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 572; VI-NEXT: v_or_b32_e32 v0, v0, v1 573; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 574; VI-NEXT: s_endpgm 575; 576; GFX9-LABEL: minnum_v2f16_imm_a: 577; GFX9: ; %bb.0: ; %entry 578; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 579; GFX9-NEXT: s_waitcnt lgkmcnt(0) 580; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 581; GFX9-NEXT: s_mov_b32 s3, 0xf000 582; GFX9-NEXT: s_mov_b32 s2, -1 583; GFX9-NEXT: s_waitcnt lgkmcnt(0) 584; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 585; GFX9-NEXT: s_mov_b32 s4, 0x44004200 586; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 587; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 
0 588; GFX9-NEXT: s_endpgm 589; 590; GFX10-LABEL: minnum_v2f16_imm_a: 591; GFX10: ; %bb.0: ; %entry 592; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 593; GFX10-NEXT: s_waitcnt lgkmcnt(0) 594; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 595; GFX10-NEXT: s_mov_b32 s3, 0x31016000 596; GFX10-NEXT: s_waitcnt lgkmcnt(0) 597; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 598; GFX10-NEXT: s_mov_b32 s2, -1 599; GFX10-NEXT: v_pk_min_f16 v0, 0x44004200, v0 600; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 601; GFX10-NEXT: s_endpgm 602; 603; GFX11-LABEL: minnum_v2f16_imm_a: 604; GFX11: ; %bb.0: ; %entry 605; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 606; GFX11-NEXT: s_waitcnt lgkmcnt(0) 607; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 608; GFX11-NEXT: s_mov_b32 s3, 0x31016000 609; GFX11-NEXT: s_waitcnt lgkmcnt(0) 610; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 611; GFX11-NEXT: s_mov_b32 s2, -1 612; GFX11-NEXT: v_pk_min_f16 v0, 0x44004200, v0 613; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 614; GFX11-NEXT: s_endpgm 615 ptr addrspace(1) %r, 616 ptr addrspace(1) %b) #0 { 617entry: 618 %b.val = load <2 x half>, ptr addrspace(1) %b 619 %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> <half 3.0, half 4.0>, <2 x half> %b.val) 620 store <2 x half> %r.val, ptr addrspace(1) %r 621 ret void 622} 623 624define amdgpu_kernel void @minnum_v2f16_imm_b( 625; SI-LABEL: minnum_v2f16_imm_b: 626; SI: ; %bb.0: ; %entry 627; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 628; SI-NEXT: s_waitcnt lgkmcnt(0) 629; SI-NEXT: s_load_dword s2, s[2:3], 0x0 630; SI-NEXT: s_waitcnt lgkmcnt(0) 631; SI-NEXT: s_lshr_b32 s3, s2, 16 632; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 633; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 634; SI-NEXT: s_mov_b32 s3, 0xf000 635; SI-NEXT: s_mov_b32 s2, -1 636; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0 637; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 638; SI-NEXT: v_min_f32_e32 v1, 4.0, v1 639; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 640; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 641; SI-NEXT: v_or_b32_e32 v0, v1, v0 
642; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 643; SI-NEXT: s_endpgm 644; 645; VI-LABEL: minnum_v2f16_imm_b: 646; VI: ; %bb.0: ; %entry 647; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 648; VI-NEXT: v_mov_b32_e32 v2, 0x4200 649; VI-NEXT: s_waitcnt lgkmcnt(0) 650; VI-NEXT: s_load_dword s4, s[2:3], 0x0 651; VI-NEXT: s_mov_b32 s3, 0xf000 652; VI-NEXT: s_mov_b32 s2, -1 653; VI-NEXT: s_waitcnt lgkmcnt(0) 654; VI-NEXT: v_max_f16_e64 v0, s4, s4 655; VI-NEXT: s_lshr_b32 s4, s4, 16 656; VI-NEXT: v_max_f16_e64 v1, s4, s4 657; VI-NEXT: v_min_f16_e32 v0, 4.0, v0 658; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 659; VI-NEXT: v_or_b32_e32 v0, v0, v1 660; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 661; VI-NEXT: s_endpgm 662; 663; GFX9-LABEL: minnum_v2f16_imm_b: 664; GFX9: ; %bb.0: ; %entry 665; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 666; GFX9-NEXT: s_waitcnt lgkmcnt(0) 667; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0 668; GFX9-NEXT: s_mov_b32 s3, 0xf000 669; GFX9-NEXT: s_mov_b32 s2, -1 670; GFX9-NEXT: s_waitcnt lgkmcnt(0) 671; GFX9-NEXT: v_pk_max_f16 v0, s4, s4 672; GFX9-NEXT: s_mov_b32 s4, 0x42004400 673; GFX9-NEXT: v_pk_min_f16 v0, v0, s4 674; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 675; GFX9-NEXT: s_endpgm 676; 677; GFX10-LABEL: minnum_v2f16_imm_b: 678; GFX10: ; %bb.0: ; %entry 679; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 680; GFX10-NEXT: s_waitcnt lgkmcnt(0) 681; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0 682; GFX10-NEXT: s_mov_b32 s3, 0x31016000 683; GFX10-NEXT: s_waitcnt lgkmcnt(0) 684; GFX10-NEXT: v_pk_max_f16 v0, s2, s2 685; GFX10-NEXT: s_mov_b32 s2, -1 686; GFX10-NEXT: v_pk_min_f16 v0, 0x42004400, v0 687; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 688; GFX10-NEXT: s_endpgm 689; 690; GFX11-LABEL: minnum_v2f16_imm_b: 691; GFX11: ; %bb.0: ; %entry 692; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 693; GFX11-NEXT: s_waitcnt lgkmcnt(0) 694; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 
695; GFX11-NEXT: s_mov_b32 s3, 0x31016000 696; GFX11-NEXT: s_waitcnt lgkmcnt(0) 697; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 698; GFX11-NEXT: s_mov_b32 s2, -1 699; GFX11-NEXT: v_pk_min_f16 v0, 0x42004400, v0 700; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 701; GFX11-NEXT: s_endpgm 702 ptr addrspace(1) %r, 703 ptr addrspace(1) %a) #0 { 704entry: 705 %a.val = load <2 x half>, ptr addrspace(1) %a 706 %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> <half 4.0, half 3.0>) 707 store <2 x half> %r.val, ptr addrspace(1) %r 708 ret void 709} 710 711; FIXME: Scalarize with undef half. The third element of the <3 x half> operands is presumably widened to a packed pair whose upper half is undef (see the v_pk_max_f16/v_pk_min_f16 on the second dword in the GFX9+ checks); it should be handled as a scalar instead. 712define amdgpu_kernel void @minnum_v3f16( 713; SI-LABEL: minnum_v3f16: 714; SI: ; %bb.0: ; %entry 715; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 716; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 717; SI-NEXT: s_waitcnt lgkmcnt(0) 718; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 719; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 720; SI-NEXT: s_mov_b32 s3, 0xf000 721; SI-NEXT: s_mov_b32 s2, -1 722; SI-NEXT: s_waitcnt lgkmcnt(0) 723; SI-NEXT: v_cvt_f32_f16_e32 v0, s7 724; SI-NEXT: s_lshr_b32 s7, s6, 16 725; SI-NEXT: s_lshr_b32 s8, s4, 16 726; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 727; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 728; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 729; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 730; SI-NEXT: v_cvt_f32_f16_e32 v5, s5 731; SI-NEXT: v_min_f32_e32 v1, v1, v2 732; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 733; SI-NEXT: v_min_f32_e32 v2, v3, v4 734; SI-NEXT: v_min_f32_e32 v0, v0, v5 735; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 736; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 737; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 738; SI-NEXT: v_or_b32_e32 v1, v2, v1 739; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 740; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 741; SI-NEXT: s_endpgm 742; 743; VI-LABEL: minnum_v3f16: 744; VI: ; %bb.0: ; %entry 745; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 746; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 747; VI-NEXT: s_mov_b32 s7, 0xf000 748; 
VI-NEXT: s_mov_b32 s6, -1 749; VI-NEXT: s_waitcnt lgkmcnt(0) 750; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0 751; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 752; VI-NEXT: s_mov_b32 s4, s0 753; VI-NEXT: s_mov_b32 s5, s1 754; VI-NEXT: s_waitcnt lgkmcnt(0) 755; VI-NEXT: v_max_f16_e64 v0, s8, s8 756; VI-NEXT: v_max_f16_e64 v1, s2, s2 757; VI-NEXT: s_lshr_b32 s0, s8, 16 758; VI-NEXT: v_min_f16_e32 v0, v1, v0 759; VI-NEXT: v_max_f16_e64 v1, s0, s0 760; VI-NEXT: s_lshr_b32 s0, s2, 16 761; VI-NEXT: v_max_f16_e64 v2, s0, s0 762; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 763; VI-NEXT: v_or_b32_e32 v0, v0, v1 764; VI-NEXT: v_max_f16_e64 v1, s9, s9 765; VI-NEXT: v_max_f16_e64 v2, s3, s3 766; VI-NEXT: v_min_f16_e32 v1, v2, v1 767; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 768; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 769; VI-NEXT: s_endpgm 770; 771; GFX9-LABEL: minnum_v3f16: 772; GFX9: ; %bb.0: ; %entry 773; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 774; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 775; GFX9-NEXT: s_mov_b32 s7, 0xf000 776; GFX9-NEXT: s_mov_b32 s6, -1 777; GFX9-NEXT: s_waitcnt lgkmcnt(0) 778; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 779; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0 780; GFX9-NEXT: s_mov_b32 s4, s0 781; GFX9-NEXT: s_mov_b32 s5, s1 782; GFX9-NEXT: s_waitcnt lgkmcnt(0) 783; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 784; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 785; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 786; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 787; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 788; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 789; GFX9-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 790; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 791; GFX9-NEXT: s_endpgm 792; 793; GFX10-LABEL: minnum_v3f16: 794; GFX10: ; %bb.0: ; %entry 795; GFX10-NEXT: s_clause 0x1 796; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 797; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_max_f16 v1, s5, s5
; GFX10-NEXT: v_pk_max_f16 v2, s9, s9
; GFX10-NEXT: v_pk_max_f16 v0, s4, s4
; GFX10-NEXT: v_pk_max_f16 v3, s8, s8
; GFX10-NEXT: v_pk_min_f16 v1, v2, v1
; GFX10-NEXT: v_pk_min_f16 v0, v3, v0
; GFX10-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: minnum_v3f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, s5, s5
; GFX11-NEXT: v_pk_max_f16 v2, s3, s3
; GFX11-NEXT: v_pk_max_f16 v0, s4, s4
; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_pk_min_f16 v1, v2, v1
; GFX11-NEXT: v_pk_min_f16 v0, v3, v0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) #0 {
entry:
  %a.val = load <3 x half>, ptr addrspace(1) %a
  %b.val = load <3 x half>, ptr addrspace(1) %b
  %r.val = call <3 x half> @llvm.minnum.v3f16(<3 x half> %a.val, <3 x half> %b.val)
  store <3 x half> %r.val, ptr addrspace(1) %r
  ret void
}

; minnum_v4f16 loads two <4 x half> vectors from %a and %b, applies
; llvm.minnum.v4f16 to them and stores the result to %r.
; Lowering expected by the autogenerated checks below:
;  - tahiti run: every half lane is widened with v_cvt_f32_f16, combined with
;    v_min_f32, narrowed with v_cvt_f16_f32, and the lanes are repacked with
;    v_lshlrev_b32 / v_or_b32 before a single dwordx2 store.
;  - fiji run: every lane goes through v_max_f16 x, x before v_min_f16; the
;    high lanes use the SDWA form writing WORD_1 and are or'ed with the low
;    lanes.
;  - gfx900, gfx1010 and gfx1100 runs: each 32-bit pair goes through
;    v_pk_max_f16 x, x followed by v_pk_min_f16.
; NOTE(review): the self-max instructions presumably canonicalize the inputs
; ahead of the min - confirm against the AMDGPU backend before relying on it.
; The check lines are regenerated by utils/update_llc_test_checks.py and should
; not be edited by hand.
define amdgpu_kernel void @minnum_v4f16(
; SI-LABEL: minnum_v4f16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx2 s[6:7], s[10:11], 0x0
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s6
; SI-NEXT: s_lshr_b32 s6, s6, 16
; SI-NEXT: v_cvt_f32_f16_e32 v2, s6
; SI-NEXT: s_lshr_b32 s6, s7, 16
; SI-NEXT: v_cvt_f32_f16_e32 v3, s6
; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
; SI-NEXT: s_lshr_b32 s6, s5, 16
; SI-NEXT: s_lshr_b32 s4, s4, 16
; SI-NEXT: v_cvt_f32_f16_e32 v5, s6
; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
; SI-NEXT: v_cvt_f32_f16_e32 v1, s7
; SI-NEXT: v_cvt_f32_f16_e32 v6, s5
; SI-NEXT: v_min_f32_e32 v3, v3, v5
; SI-NEXT: v_min_f32_e32 v2, v2, v7
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_min_f32_e32 v1, v1, v6
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_min_f32_e32 v0, v0, v4
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: minnum_v4f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s9, s9
; VI-NEXT: v_max_f16_e64 v1, s3, s3
; VI-NEXT: s_lshr_b32 s0, s9, 16
; VI-NEXT: v_min_f16_e32 v0, v1, v0
; VI-NEXT: v_max_f16_e64 v1, s0, s0
; VI-NEXT: s_lshr_b32 s0, s3, 16
; VI-NEXT: v_max_f16_e64 v2, s0, s0
; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v1, v0, v1
; VI-NEXT: v_max_f16_e64 v0, s8, s8
; VI-NEXT: v_max_f16_e64 v2, s2, s2
; VI-NEXT: s_lshr_b32 s0, s8, 16
; VI-NEXT: v_min_f16_e32 v0, v2, v0
; VI-NEXT: v_max_f16_e64 v2, s0, s0
; VI-NEXT: s_lshr_b32 s0, s2, 16
; VI-NEXT: v_max_f16_e64 v3, s0, s0
; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v4f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v0, s11, s11
; GFX9-NEXT: v_pk_max_f16 v1, s13, s13
; GFX9-NEXT: v_pk_max_f16 v2, s10, s10
; GFX9-NEXT: v_pk_min_f16 v1, v1, v0
; GFX9-NEXT: v_pk_max_f16 v0, s12, s12
; GFX9-NEXT: v_pk_min_f16 v0, v0, v2
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: minnum_v4f16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_max_f16 v0, s5, s5
; GFX10-NEXT: v_pk_max_f16 v1, s9, s9
; GFX10-NEXT: v_pk_max_f16 v2, s4, s4
; GFX10-NEXT: v_pk_max_f16 v3, s8, s8
; GFX10-NEXT: v_pk_min_f16 v1, v1, v0
; GFX10-NEXT: v_pk_min_f16 v0, v3, v2
; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: minnum_v4f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v0, s5, s5
; GFX11-NEXT: v_pk_max_f16 v1, s3, s3
; GFX11-NEXT: v_pk_max_f16 v2, s4, s4
; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_pk_min_f16 v1, v1, v0
; GFX11-NEXT: v_pk_min_f16 v0, v3, v2
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) #0 {
entry:
  %a.val = load <4 x half>, ptr addrspace(1) %a
  %b.val = load <4 x half>, ptr addrspace(1) %b
  %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a.val, <4 x half> %b.val)
  store <4 x half> %r.val, ptr addrspace(1) %r
  ret void
}

; fmin_v4f16_imm_a applies llvm.minnum.v4f16 with a constant first operand,
; <8.0, 2.0, 3.0, 4.0>; only %b is loaded from memory and the result is stored
; to %r.
; The checks below expect the constants to appear directly in the min
; instructions:
;  - tahiti run: f32 operands 0x41000000 (8.0), 0x40400000 (3.0), 2.0 and 4.0.
;  - fiji run: f16 encodings 0x4800 (8.0), 0x4000 (2.0), 0x4200 (3.0) and
;    0x4400 (4.0); the two high-lane constants are first moved into VGPRs
;    for the SDWA min.
;  - gfx900 run: packed pairs 0x40004800 (lanes 0-1) and 0x44004200
;    (lanes 2-3) are materialized in SGPRs with s_mov_b32.
;  - gfx1010 and gfx1100 runs: the same packed pairs are used as literal
;    operands of v_pk_min_f16.
; Only the loaded operand is passed through v_max_f16 / v_pk_max_f16 in the
; expected output; the constant side is used as is.
define amdgpu_kernel void @fmin_v4f16_imm_a(
; SI-LABEL: fmin_v4f16_imm_a:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
; SI-NEXT: s_lshr_b32 s5, s5, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
; SI-NEXT: s_lshr_b32 s4, s4, 16
; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1
; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0
; SI-NEXT: v_min_f32_e32 v2, 4.0, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_min_f32_e32 v3, 2.0, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fmin_v4f16_imm_a:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x4400
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s0, s3, 16
; VI-NEXT: v_max_f16_e64 v1, s3, s3
; VI-NEXT: v_max_f16_e64 v3, s0, s0
; VI-NEXT: v_max_f16_e64 v2, s2, s2
; VI-NEXT: v_min_f16_e32 v1, 0x4200, v1
; VI-NEXT: v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_lshr_b32 s0, s2, 16
; VI-NEXT: v_or_b32_e32 v1, v1, v0
; VI-NEXT: v_min_f16_e32 v0, 0x4800, v2
; VI-NEXT: v_max_f16_e64 v2, s0, s0
; VI-NEXT: v_mov_b32_e32 v3, 0x4000
; VI-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmin_v4f16_imm_a:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_mov_b32 s8, 0x44004200
; GFX9-NEXT: s_mov_b32 s9, 0x40004800
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v0, s3, s3
; GFX9-NEXT: v_pk_max_f16 v2, s2, s2
; GFX9-NEXT: v_pk_min_f16 v1, v0, s8
; GFX9-NEXT: v_pk_min_f16 v0, v2, s9
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: fmin_v4f16_imm_a:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_max_f16 v0, s3, s3
; GFX10-NEXT: v_pk_max_f16 v2, s2, s2
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: v_pk_min_f16 v1, 0x44004200, v0
; GFX10-NEXT: v_pk_min_f16 v0, 0x40004800, v2
; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fmin_v4f16_imm_a:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v0, s3, s3
; GFX11-NEXT: v_pk_max_f16 v2, s2, s2
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_pk_min_f16 v1, 0x44004200, v0
; GFX11-NEXT: v_pk_min_f16 v0, 0x40004800, v2
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %b) #0 {
entry:
  %b.val = load <4 x half>, ptr addrspace(1) %b
  %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, <4 x half> %b.val)
  store <4 x half> %r.val, ptr addrspace(1) %r
  ret void
}

; Attribute group #0 is referenced by the kernels above. It sets
; "denormal-fp-math-f32" to preserve-sign for both listed modes (output mode
; first, then input mode, per the LLVM LangRef).
attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }