1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=SI %s 3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI %s 4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11 %s 5; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX12 %s 6 7define amdgpu_kernel void @fcmp_f16_lt( 8; SI-LABEL: fcmp_f16_lt: 9; SI: ; %bb.0: ; %entry 10; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 11; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 12; SI-NEXT: s_mov_b32 s11, 0xf000 13; SI-NEXT: s_mov_b32 s10, -1 14; SI-NEXT: s_mov_b32 s14, s10 15; SI-NEXT: s_mov_b32 s15, s11 16; SI-NEXT: s_mov_b32 s6, s10 17; SI-NEXT: s_mov_b32 s7, s11 18; SI-NEXT: s_waitcnt lgkmcnt(0) 19; SI-NEXT: s_mov_b32 s12, s2 20; SI-NEXT: s_mov_b32 s13, s3 21; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 22; SI-NEXT: s_waitcnt vmcnt(0) 23; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc 24; SI-NEXT: s_waitcnt vmcnt(0) 25; SI-NEXT: s_mov_b32 s8, s0 26; SI-NEXT: s_mov_b32 s9, s1 27; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 28; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 29; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 30; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 31; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 32; SI-NEXT: s_endpgm 33; 34; VI-LABEL: fcmp_f16_lt: 35; VI: ; %bb.0: ; %entry 36; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 37; VI-NEXT: s_load_dwordx2 
s[8:9], s[4:5], 0x34 38; VI-NEXT: s_mov_b32 s7, 0xf000 39; VI-NEXT: s_mov_b32 s6, -1 40; VI-NEXT: s_mov_b32 s14, s6 41; VI-NEXT: s_waitcnt lgkmcnt(0) 42; VI-NEXT: s_mov_b32 s12, s2 43; VI-NEXT: s_mov_b32 s13, s3 44; VI-NEXT: s_mov_b32 s15, s7 45; VI-NEXT: s_mov_b32 s10, s6 46; VI-NEXT: s_mov_b32 s11, s7 47; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 48; VI-NEXT: s_waitcnt vmcnt(0) 49; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 50; VI-NEXT: s_waitcnt vmcnt(0) 51; VI-NEXT: s_mov_b32 s4, s0 52; VI-NEXT: s_mov_b32 s5, s1 53; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 54; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 55; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 56; VI-NEXT: s_endpgm 57; 58; GFX11-LABEL: fcmp_f16_lt: 59; GFX11: ; %bb.0: ; %entry 60; GFX11-NEXT: s_clause 0x1 61; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 62; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 63; GFX11-NEXT: s_mov_b32 s10, -1 64; GFX11-NEXT: s_mov_b32 s11, 0x31016000 65; GFX11-NEXT: s_mov_b32 s14, s10 66; GFX11-NEXT: s_mov_b32 s15, s11 67; GFX11-NEXT: s_mov_b32 s6, s10 68; GFX11-NEXT: s_mov_b32 s7, s11 69; GFX11-NEXT: s_waitcnt lgkmcnt(0) 70; GFX11-NEXT: s_mov_b32 s12, s2 71; GFX11-NEXT: s_mov_b32 s13, s3 72; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 73; GFX11-NEXT: s_waitcnt vmcnt(0) 74; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 75; GFX11-NEXT: s_waitcnt vmcnt(0) 76; GFX11-NEXT: s_mov_b32 s8, s0 77; GFX11-NEXT: s_mov_b32 s9, s1 78; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 79; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 80; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 81; GFX11-NEXT: s_endpgm 82; 83; GFX12-LABEL: fcmp_f16_lt: 84; GFX12: ; %bb.0: ; %entry 85; GFX12-NEXT: s_clause 0x1 86; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 87; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 88; GFX12-NEXT: s_mov_b32 s10, -1 89; GFX12-NEXT: s_mov_b32 s11, 0x31016000 90; GFX12-NEXT: s_mov_b32 s14, s10 91; GFX12-NEXT: s_mov_b32 s15, s11 92; GFX12-NEXT: 
s_mov_b32 s6, s10 93; GFX12-NEXT: s_mov_b32 s7, s11 94; GFX12-NEXT: s_wait_kmcnt 0x0 95; GFX12-NEXT: s_mov_b32 s12, s2 96; GFX12-NEXT: s_mov_b32 s13, s3 97; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS 98; GFX12-NEXT: s_wait_loadcnt 0x0 99; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS 100; GFX12-NEXT: s_wait_loadcnt 0x0 101; GFX12-NEXT: s_mov_b32 s8, s0 102; GFX12-NEXT: s_mov_b32 s9, s1 103; GFX12-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 104; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 105; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 106; GFX12-NEXT: s_endpgm 107 ptr addrspace(1) %r, 108 ptr addrspace(1) %a, 109 ptr addrspace(1) %b) { 110entry: 111 %a.val = load volatile half, ptr addrspace(1) %a 112 %b.val = load volatile half, ptr addrspace(1) %b 113 %r.val = fcmp olt half %a.val, %b.val 114 %r.val.sext = sext i1 %r.val to i32 115 store i32 %r.val.sext, ptr addrspace(1) %r 116 ret void 117} 118 119define amdgpu_kernel void @fcmp_f16_lt_abs( 120; SI-LABEL: fcmp_f16_lt_abs: 121; SI: ; %bb.0: ; %entry 122; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 123; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 124; SI-NEXT: s_mov_b32 s11, 0xf000 125; SI-NEXT: s_mov_b32 s10, -1 126; SI-NEXT: s_mov_b32 s14, s10 127; SI-NEXT: s_mov_b32 s15, s11 128; SI-NEXT: s_mov_b32 s6, s10 129; SI-NEXT: s_mov_b32 s7, s11 130; SI-NEXT: s_waitcnt lgkmcnt(0) 131; SI-NEXT: s_mov_b32 s12, s2 132; SI-NEXT: s_mov_b32 s13, s3 133; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 134; SI-NEXT: s_waitcnt vmcnt(0) 135; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc 136; SI-NEXT: s_waitcnt vmcnt(0) 137; SI-NEXT: s_mov_b32 s8, s0 138; SI-NEXT: s_mov_b32 s9, s1 139; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| 140; SI-NEXT: v_cvt_f32_f16_e64 v1, |v1| 141; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 142; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 143; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 144; SI-NEXT: s_endpgm 145; 146; VI-LABEL: fcmp_f16_lt_abs: 147; VI: 
; %bb.0: ; %entry 148; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 149; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 150; VI-NEXT: s_mov_b32 s7, 0xf000 151; VI-NEXT: s_mov_b32 s6, -1 152; VI-NEXT: s_mov_b32 s14, s6 153; VI-NEXT: s_waitcnt lgkmcnt(0) 154; VI-NEXT: s_mov_b32 s12, s2 155; VI-NEXT: s_mov_b32 s13, s3 156; VI-NEXT: s_mov_b32 s15, s7 157; VI-NEXT: s_mov_b32 s10, s6 158; VI-NEXT: s_mov_b32 s11, s7 159; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 160; VI-NEXT: s_waitcnt vmcnt(0) 161; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 162; VI-NEXT: s_waitcnt vmcnt(0) 163; VI-NEXT: s_mov_b32 s4, s0 164; VI-NEXT: s_mov_b32 s5, s1 165; VI-NEXT: v_cmp_lt_f16_e64 s[0:1], |v0|, |v1| 166; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] 167; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 168; VI-NEXT: s_endpgm 169; 170; GFX11-LABEL: fcmp_f16_lt_abs: 171; GFX11: ; %bb.0: ; %entry 172; GFX11-NEXT: s_clause 0x1 173; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 174; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 175; GFX11-NEXT: s_mov_b32 s10, -1 176; GFX11-NEXT: s_mov_b32 s11, 0x31016000 177; GFX11-NEXT: s_mov_b32 s14, s10 178; GFX11-NEXT: s_mov_b32 s15, s11 179; GFX11-NEXT: s_mov_b32 s6, s10 180; GFX11-NEXT: s_mov_b32 s7, s11 181; GFX11-NEXT: s_waitcnt lgkmcnt(0) 182; GFX11-NEXT: s_mov_b32 s12, s2 183; GFX11-NEXT: s_mov_b32 s13, s3 184; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 185; GFX11-NEXT: s_waitcnt vmcnt(0) 186; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 187; GFX11-NEXT: s_waitcnt vmcnt(0) 188; GFX11-NEXT: s_mov_b32 s8, s0 189; GFX11-NEXT: s_mov_b32 s9, s1 190; GFX11-NEXT: v_cmp_lt_f16_e64 s2, |v0|, |v1| 191; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 192; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, s2 193; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 194; GFX11-NEXT: s_endpgm 195; 196; GFX12-LABEL: fcmp_f16_lt_abs: 197; GFX12: ; %bb.0: ; %entry 198; GFX12-NEXT: s_clause 0x1 199; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 200; 
GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 201; GFX12-NEXT: s_mov_b32 s10, -1 202; GFX12-NEXT: s_mov_b32 s11, 0x31016000 203; GFX12-NEXT: s_mov_b32 s14, s10 204; GFX12-NEXT: s_mov_b32 s15, s11 205; GFX12-NEXT: s_mov_b32 s6, s10 206; GFX12-NEXT: s_mov_b32 s7, s11 207; GFX12-NEXT: s_wait_kmcnt 0x0 208; GFX12-NEXT: s_mov_b32 s12, s2 209; GFX12-NEXT: s_mov_b32 s13, s3 210; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS 211; GFX12-NEXT: s_wait_loadcnt 0x0 212; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS 213; GFX12-NEXT: s_wait_loadcnt 0x0 214; GFX12-NEXT: s_mov_b32 s8, s0 215; GFX12-NEXT: s_mov_b32 s9, s1 216; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff, v0 217; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff, v1 218; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 219; GFX12-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 220; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 221; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 222; GFX12-NEXT: s_endpgm 223 ptr addrspace(1) %r, 224 ptr addrspace(1) %a, 225 ptr addrspace(1) %b) { 226entry: 227 %a.val = load volatile half, ptr addrspace(1) %a 228 %b.val = load volatile half, ptr addrspace(1) %b 229 %a.abs = call half @llvm.fabs.f16(half %a.val) 230 %b.abs = call half @llvm.fabs.f16(half %b.val) 231 %r.val = fcmp olt half %a.abs, %b.abs 232 %r.val.sext = sext i1 %r.val to i32 233 store i32 %r.val.sext, ptr addrspace(1) %r 234 ret void 235} 236 237define amdgpu_kernel void @fcmp_f16_eq( 238; SI-LABEL: fcmp_f16_eq: 239; SI: ; %bb.0: ; %entry 240; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 241; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 242; SI-NEXT: s_mov_b32 s11, 0xf000 243; SI-NEXT: s_mov_b32 s10, -1 244; SI-NEXT: s_mov_b32 s14, s10 245; SI-NEXT: s_mov_b32 s15, s11 246; SI-NEXT: s_mov_b32 s6, s10 247; SI-NEXT: s_mov_b32 s7, s11 248; SI-NEXT: s_waitcnt lgkmcnt(0) 249; SI-NEXT: s_mov_b32 s12, s2 250; SI-NEXT: s_mov_b32 s13, s3 251; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 252; SI-NEXT: s_waitcnt 
vmcnt(0) 253; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc 254; SI-NEXT: s_waitcnt vmcnt(0) 255; SI-NEXT: s_mov_b32 s8, s0 256; SI-NEXT: s_mov_b32 s9, s1 257; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 258; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 259; SI-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 260; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 261; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 262; SI-NEXT: s_endpgm 263; 264; VI-LABEL: fcmp_f16_eq: 265; VI: ; %bb.0: ; %entry 266; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 267; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 268; VI-NEXT: s_mov_b32 s7, 0xf000 269; VI-NEXT: s_mov_b32 s6, -1 270; VI-NEXT: s_mov_b32 s14, s6 271; VI-NEXT: s_waitcnt lgkmcnt(0) 272; VI-NEXT: s_mov_b32 s12, s2 273; VI-NEXT: s_mov_b32 s13, s3 274; VI-NEXT: s_mov_b32 s15, s7 275; VI-NEXT: s_mov_b32 s10, s6 276; VI-NEXT: s_mov_b32 s11, s7 277; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 278; VI-NEXT: s_waitcnt vmcnt(0) 279; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 280; VI-NEXT: s_waitcnt vmcnt(0) 281; VI-NEXT: s_mov_b32 s4, s0 282; VI-NEXT: s_mov_b32 s5, s1 283; VI-NEXT: v_cmp_eq_f16_e32 vcc, v0, v1 284; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 285; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 286; VI-NEXT: s_endpgm 287; 288; GFX11-LABEL: fcmp_f16_eq: 289; GFX11: ; %bb.0: ; %entry 290; GFX11-NEXT: s_clause 0x1 291; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 292; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 293; GFX11-NEXT: s_mov_b32 s10, -1 294; GFX11-NEXT: s_mov_b32 s11, 0x31016000 295; GFX11-NEXT: s_mov_b32 s14, s10 296; GFX11-NEXT: s_mov_b32 s15, s11 297; GFX11-NEXT: s_mov_b32 s6, s10 298; GFX11-NEXT: s_mov_b32 s7, s11 299; GFX11-NEXT: s_waitcnt lgkmcnt(0) 300; GFX11-NEXT: s_mov_b32 s12, s2 301; GFX11-NEXT: s_mov_b32 s13, s3 302; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 303; GFX11-NEXT: s_waitcnt vmcnt(0) 304; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 305; GFX11-NEXT: s_waitcnt vmcnt(0) 306; GFX11-NEXT: 
s_mov_b32 s8, s0 307; GFX11-NEXT: s_mov_b32 s9, s1 308; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1 309; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 310; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 311; GFX11-NEXT: s_endpgm 312; 313; GFX12-LABEL: fcmp_f16_eq: 314; GFX12: ; %bb.0: ; %entry 315; GFX12-NEXT: s_clause 0x1 316; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 317; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 318; GFX12-NEXT: s_mov_b32 s10, -1 319; GFX12-NEXT: s_mov_b32 s11, 0x31016000 320; GFX12-NEXT: s_mov_b32 s14, s10 321; GFX12-NEXT: s_mov_b32 s15, s11 322; GFX12-NEXT: s_mov_b32 s6, s10 323; GFX12-NEXT: s_mov_b32 s7, s11 324; GFX12-NEXT: s_wait_kmcnt 0x0 325; GFX12-NEXT: s_mov_b32 s12, s2 326; GFX12-NEXT: s_mov_b32 s13, s3 327; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS 328; GFX12-NEXT: s_wait_loadcnt 0x0 329; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS 330; GFX12-NEXT: s_wait_loadcnt 0x0 331; GFX12-NEXT: s_mov_b32 s8, s0 332; GFX12-NEXT: s_mov_b32 s9, s1 333; GFX12-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1 334; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 335; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 336; GFX12-NEXT: s_endpgm 337 ptr addrspace(1) %r, 338 ptr addrspace(1) %a, 339 ptr addrspace(1) %b) { 340entry: 341 %a.val = load volatile half, ptr addrspace(1) %a 342 %b.val = load volatile half, ptr addrspace(1) %b 343 %r.val = fcmp oeq half %a.val, %b.val 344 %r.val.sext = sext i1 %r.val to i32 345 store i32 %r.val.sext, ptr addrspace(1) %r 346 ret void 347} 348 349define amdgpu_kernel void @fcmp_f16_le( 350; SI-LABEL: fcmp_f16_le: 351; SI: ; %bb.0: ; %entry 352; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 353; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 354; SI-NEXT: s_mov_b32 s11, 0xf000 355; SI-NEXT: s_mov_b32 s10, -1 356; SI-NEXT: s_mov_b32 s14, s10 357; SI-NEXT: s_mov_b32 s15, s11 358; SI-NEXT: s_mov_b32 s6, s10 359; SI-NEXT: s_mov_b32 s7, s11 360; SI-NEXT: s_waitcnt lgkmcnt(0) 361; 
SI-NEXT: s_mov_b32 s12, s2 362; SI-NEXT: s_mov_b32 s13, s3 363; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 364; SI-NEXT: s_waitcnt vmcnt(0) 365; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc 366; SI-NEXT: s_waitcnt vmcnt(0) 367; SI-NEXT: s_mov_b32 s8, s0 368; SI-NEXT: s_mov_b32 s9, s1 369; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 370; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 371; SI-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 372; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 373; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 374; SI-NEXT: s_endpgm 375; 376; VI-LABEL: fcmp_f16_le: 377; VI: ; %bb.0: ; %entry 378; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 379; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 380; VI-NEXT: s_mov_b32 s7, 0xf000 381; VI-NEXT: s_mov_b32 s6, -1 382; VI-NEXT: s_mov_b32 s14, s6 383; VI-NEXT: s_waitcnt lgkmcnt(0) 384; VI-NEXT: s_mov_b32 s12, s2 385; VI-NEXT: s_mov_b32 s13, s3 386; VI-NEXT: s_mov_b32 s15, s7 387; VI-NEXT: s_mov_b32 s10, s6 388; VI-NEXT: s_mov_b32 s11, s7 389; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 390; VI-NEXT: s_waitcnt vmcnt(0) 391; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 392; VI-NEXT: s_waitcnt vmcnt(0) 393; VI-NEXT: s_mov_b32 s4, s0 394; VI-NEXT: s_mov_b32 s5, s1 395; VI-NEXT: v_cmp_le_f16_e32 vcc, v0, v1 396; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 397; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 398; VI-NEXT: s_endpgm 399; 400; GFX11-LABEL: fcmp_f16_le: 401; GFX11: ; %bb.0: ; %entry 402; GFX11-NEXT: s_clause 0x1 403; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 404; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 405; GFX11-NEXT: s_mov_b32 s10, -1 406; GFX11-NEXT: s_mov_b32 s11, 0x31016000 407; GFX11-NEXT: s_mov_b32 s14, s10 408; GFX11-NEXT: s_mov_b32 s15, s11 409; GFX11-NEXT: s_mov_b32 s6, s10 410; GFX11-NEXT: s_mov_b32 s7, s11 411; GFX11-NEXT: s_waitcnt lgkmcnt(0) 412; GFX11-NEXT: s_mov_b32 s12, s2 413; GFX11-NEXT: s_mov_b32 s13, s3 414; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 415; 
GFX11-NEXT: s_waitcnt vmcnt(0) 416; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 417; GFX11-NEXT: s_waitcnt vmcnt(0) 418; GFX11-NEXT: s_mov_b32 s8, s0 419; GFX11-NEXT: s_mov_b32 s9, s1 420; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1 421; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 422; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 423; GFX11-NEXT: s_endpgm 424; 425; GFX12-LABEL: fcmp_f16_le: 426; GFX12: ; %bb.0: ; %entry 427; GFX12-NEXT: s_clause 0x1 428; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 429; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 430; GFX12-NEXT: s_mov_b32 s10, -1 431; GFX12-NEXT: s_mov_b32 s11, 0x31016000 432; GFX12-NEXT: s_mov_b32 s14, s10 433; GFX12-NEXT: s_mov_b32 s15, s11 434; GFX12-NEXT: s_mov_b32 s6, s10 435; GFX12-NEXT: s_mov_b32 s7, s11 436; GFX12-NEXT: s_wait_kmcnt 0x0 437; GFX12-NEXT: s_mov_b32 s12, s2 438; GFX12-NEXT: s_mov_b32 s13, s3 439; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS 440; GFX12-NEXT: s_wait_loadcnt 0x0 441; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS 442; GFX12-NEXT: s_wait_loadcnt 0x0 443; GFX12-NEXT: s_mov_b32 s8, s0 444; GFX12-NEXT: s_mov_b32 s9, s1 445; GFX12-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1 446; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 447; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 448; GFX12-NEXT: s_endpgm 449 ptr addrspace(1) %r, 450 ptr addrspace(1) %a, 451 ptr addrspace(1) %b) { 452entry: 453 %a.val = load volatile half, ptr addrspace(1) %a 454 %b.val = load volatile half, ptr addrspace(1) %b 455 %r.val = fcmp ole half %a.val, %b.val 456 %r.val.sext = sext i1 %r.val to i32 457 store i32 %r.val.sext, ptr addrspace(1) %r 458 ret void 459} 460 461define amdgpu_kernel void @fcmp_f16_gt( 462; SI-LABEL: fcmp_f16_gt: 463; SI: ; %bb.0: ; %entry 464; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 465; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 466; SI-NEXT: s_mov_b32 s11, 0xf000 467; SI-NEXT: s_mov_b32 s10, -1 468; SI-NEXT: s_mov_b32 
s14, s10 469; SI-NEXT: s_mov_b32 s15, s11 470; SI-NEXT: s_mov_b32 s6, s10 471; SI-NEXT: s_mov_b32 s7, s11 472; SI-NEXT: s_waitcnt lgkmcnt(0) 473; SI-NEXT: s_mov_b32 s12, s2 474; SI-NEXT: s_mov_b32 s13, s3 475; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 476; SI-NEXT: s_waitcnt vmcnt(0) 477; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc 478; SI-NEXT: s_waitcnt vmcnt(0) 479; SI-NEXT: s_mov_b32 s8, s0 480; SI-NEXT: s_mov_b32 s9, s1 481; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 482; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 483; SI-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 484; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 485; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 486; SI-NEXT: s_endpgm 487; 488; VI-LABEL: fcmp_f16_gt: 489; VI: ; %bb.0: ; %entry 490; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 491; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 492; VI-NEXT: s_mov_b32 s7, 0xf000 493; VI-NEXT: s_mov_b32 s6, -1 494; VI-NEXT: s_mov_b32 s14, s6 495; VI-NEXT: s_waitcnt lgkmcnt(0) 496; VI-NEXT: s_mov_b32 s12, s2 497; VI-NEXT: s_mov_b32 s13, s3 498; VI-NEXT: s_mov_b32 s15, s7 499; VI-NEXT: s_mov_b32 s10, s6 500; VI-NEXT: s_mov_b32 s11, s7 501; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 502; VI-NEXT: s_waitcnt vmcnt(0) 503; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 504; VI-NEXT: s_waitcnt vmcnt(0) 505; VI-NEXT: s_mov_b32 s4, s0 506; VI-NEXT: s_mov_b32 s5, s1 507; VI-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1 508; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 509; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 510; VI-NEXT: s_endpgm 511; 512; GFX11-LABEL: fcmp_f16_gt: 513; GFX11: ; %bb.0: ; %entry 514; GFX11-NEXT: s_clause 0x1 515; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 516; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 517; GFX11-NEXT: s_mov_b32 s10, -1 518; GFX11-NEXT: s_mov_b32 s11, 0x31016000 519; GFX11-NEXT: s_mov_b32 s14, s10 520; GFX11-NEXT: s_mov_b32 s15, s11 521; GFX11-NEXT: s_mov_b32 s6, s10 522; GFX11-NEXT: s_mov_b32 s7, s11 523; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) 524; GFX11-NEXT: s_mov_b32 s12, s2 525; GFX11-NEXT: s_mov_b32 s13, s3 526; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 527; GFX11-NEXT: s_waitcnt vmcnt(0) 528; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 529; GFX11-NEXT: s_waitcnt vmcnt(0) 530; GFX11-NEXT: s_mov_b32 s8, s0 531; GFX11-NEXT: s_mov_b32 s9, s1 532; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1 533; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 534; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 535; GFX11-NEXT: s_endpgm 536; 537; GFX12-LABEL: fcmp_f16_gt: 538; GFX12: ; %bb.0: ; %entry 539; GFX12-NEXT: s_clause 0x1 540; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 541; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 542; GFX12-NEXT: s_mov_b32 s10, -1 543; GFX12-NEXT: s_mov_b32 s11, 0x31016000 544; GFX12-NEXT: s_mov_b32 s14, s10 545; GFX12-NEXT: s_mov_b32 s15, s11 546; GFX12-NEXT: s_mov_b32 s6, s10 547; GFX12-NEXT: s_mov_b32 s7, s11 548; GFX12-NEXT: s_wait_kmcnt 0x0 549; GFX12-NEXT: s_mov_b32 s12, s2 550; GFX12-NEXT: s_mov_b32 s13, s3 551; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS 552; GFX12-NEXT: s_wait_loadcnt 0x0 553; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS 554; GFX12-NEXT: s_wait_loadcnt 0x0 555; GFX12-NEXT: s_mov_b32 s8, s0 556; GFX12-NEXT: s_mov_b32 s9, s1 557; GFX12-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1 558; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 559; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 560; GFX12-NEXT: s_endpgm 561 ptr addrspace(1) %r, 562 ptr addrspace(1) %a, 563 ptr addrspace(1) %b) { 564entry: 565 %a.val = load volatile half, ptr addrspace(1) %a 566 %b.val = load volatile half, ptr addrspace(1) %b 567 %r.val = fcmp ogt half %a.val, %b.val 568 %r.val.sext = sext i1 %r.val to i32 569 store i32 %r.val.sext, ptr addrspace(1) %r 570 ret void 571} 572 573define amdgpu_kernel void @fcmp_f16_lg( 574; SI-LABEL: fcmp_f16_lg: 575; SI: ; %bb.0: ; %entry 576; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x9 577; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 578; SI-NEXT: s_mov_b32 s11, 0xf000 579; SI-NEXT: s_mov_b32 s10, -1 580; SI-NEXT: s_mov_b32 s14, s10 581; SI-NEXT: s_mov_b32 s15, s11 582; SI-NEXT: s_mov_b32 s6, s10 583; SI-NEXT: s_mov_b32 s7, s11 584; SI-NEXT: s_waitcnt lgkmcnt(0) 585; SI-NEXT: s_mov_b32 s12, s2 586; SI-NEXT: s_mov_b32 s13, s3 587; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 588; SI-NEXT: s_waitcnt vmcnt(0) 589; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc 590; SI-NEXT: s_waitcnt vmcnt(0) 591; SI-NEXT: s_mov_b32 s8, s0 592; SI-NEXT: s_mov_b32 s9, s1 593; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 594; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 595; SI-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 596; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 597; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 598; SI-NEXT: s_endpgm 599; 600; VI-LABEL: fcmp_f16_lg: 601; VI: ; %bb.0: ; %entry 602; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 603; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 604; VI-NEXT: s_mov_b32 s7, 0xf000 605; VI-NEXT: s_mov_b32 s6, -1 606; VI-NEXT: s_mov_b32 s14, s6 607; VI-NEXT: s_waitcnt lgkmcnt(0) 608; VI-NEXT: s_mov_b32 s12, s2 609; VI-NEXT: s_mov_b32 s13, s3 610; VI-NEXT: s_mov_b32 s15, s7 611; VI-NEXT: s_mov_b32 s10, s6 612; VI-NEXT: s_mov_b32 s11, s7 613; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 614; VI-NEXT: s_waitcnt vmcnt(0) 615; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 616; VI-NEXT: s_waitcnt vmcnt(0) 617; VI-NEXT: s_mov_b32 s4, s0 618; VI-NEXT: s_mov_b32 s5, s1 619; VI-NEXT: v_cmp_lg_f16_e32 vcc, v0, v1 620; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 621; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 622; VI-NEXT: s_endpgm 623; 624; GFX11-LABEL: fcmp_f16_lg: 625; GFX11: ; %bb.0: ; %entry 626; GFX11-NEXT: s_clause 0x1 627; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 628; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 629; GFX11-NEXT: s_mov_b32 s10, -1 630; GFX11-NEXT: s_mov_b32 s11, 0x31016000 631; GFX11-NEXT: s_mov_b32 
s14, s10 632; GFX11-NEXT: s_mov_b32 s15, s11 633; GFX11-NEXT: s_mov_b32 s6, s10 634; GFX11-NEXT: s_mov_b32 s7, s11 635; GFX11-NEXT: s_waitcnt lgkmcnt(0) 636; GFX11-NEXT: s_mov_b32 s12, s2 637; GFX11-NEXT: s_mov_b32 s13, s3 638; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 639; GFX11-NEXT: s_waitcnt vmcnt(0) 640; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 641; GFX11-NEXT: s_waitcnt vmcnt(0) 642; GFX11-NEXT: s_mov_b32 s8, s0 643; GFX11-NEXT: s_mov_b32 s9, s1 644; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1 645; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 646; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 647; GFX11-NEXT: s_endpgm 648; 649; GFX12-LABEL: fcmp_f16_lg: 650; GFX12: ; %bb.0: ; %entry 651; GFX12-NEXT: s_clause 0x1 652; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 653; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 654; GFX12-NEXT: s_mov_b32 s10, -1 655; GFX12-NEXT: s_mov_b32 s11, 0x31016000 656; GFX12-NEXT: s_mov_b32 s14, s10 657; GFX12-NEXT: s_mov_b32 s15, s11 658; GFX12-NEXT: s_mov_b32 s6, s10 659; GFX12-NEXT: s_mov_b32 s7, s11 660; GFX12-NEXT: s_wait_kmcnt 0x0 661; GFX12-NEXT: s_mov_b32 s12, s2 662; GFX12-NEXT: s_mov_b32 s13, s3 663; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS 664; GFX12-NEXT: s_wait_loadcnt 0x0 665; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS 666; GFX12-NEXT: s_wait_loadcnt 0x0 667; GFX12-NEXT: s_mov_b32 s8, s0 668; GFX12-NEXT: s_mov_b32 s9, s1 669; GFX12-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1 670; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 671; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 672; GFX12-NEXT: s_endpgm 673 ptr addrspace(1) %r, 674 ptr addrspace(1) %a, 675 ptr addrspace(1) %b) { 676entry: 677 %a.val = load volatile half, ptr addrspace(1) %a 678 %b.val = load volatile half, ptr addrspace(1) %b 679 %r.val = fcmp one half %a.val, %b.val 680 %r.val.sext = sext i1 %r.val to i32 681 store i32 %r.val.sext, ptr addrspace(1) %r 682 ret void 683} 684 
685define amdgpu_kernel void @fcmp_f16_ge( 686; SI-LABEL: fcmp_f16_ge: 687; SI: ; %bb.0: ; %entry 688; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 689; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 690; SI-NEXT: s_mov_b32 s11, 0xf000 691; SI-NEXT: s_mov_b32 s10, -1 692; SI-NEXT: s_mov_b32 s14, s10 693; SI-NEXT: s_mov_b32 s15, s11 694; SI-NEXT: s_mov_b32 s6, s10 695; SI-NEXT: s_mov_b32 s7, s11 696; SI-NEXT: s_waitcnt lgkmcnt(0) 697; SI-NEXT: s_mov_b32 s12, s2 698; SI-NEXT: s_mov_b32 s13, s3 699; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 700; SI-NEXT: s_waitcnt vmcnt(0) 701; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc 702; SI-NEXT: s_waitcnt vmcnt(0) 703; SI-NEXT: s_mov_b32 s8, s0 704; SI-NEXT: s_mov_b32 s9, s1 705; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 706; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 707; SI-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 708; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 709; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 710; SI-NEXT: s_endpgm 711; 712; VI-LABEL: fcmp_f16_ge: 713; VI: ; %bb.0: ; %entry 714; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 715; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 716; VI-NEXT: s_mov_b32 s7, 0xf000 717; VI-NEXT: s_mov_b32 s6, -1 718; VI-NEXT: s_mov_b32 s14, s6 719; VI-NEXT: s_waitcnt lgkmcnt(0) 720; VI-NEXT: s_mov_b32 s12, s2 721; VI-NEXT: s_mov_b32 s13, s3 722; VI-NEXT: s_mov_b32 s15, s7 723; VI-NEXT: s_mov_b32 s10, s6 724; VI-NEXT: s_mov_b32 s11, s7 725; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 726; VI-NEXT: s_waitcnt vmcnt(0) 727; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 728; VI-NEXT: s_waitcnt vmcnt(0) 729; VI-NEXT: s_mov_b32 s4, s0 730; VI-NEXT: s_mov_b32 s5, s1 731; VI-NEXT: v_cmp_ge_f16_e32 vcc, v0, v1 732; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 733; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 734; VI-NEXT: s_endpgm 735; 736; GFX11-LABEL: fcmp_f16_ge: 737; GFX11: ; %bb.0: ; %entry 738; GFX11-NEXT: s_clause 0x1 739; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 740; 
GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 741; GFX11-NEXT: s_mov_b32 s10, -1 742; GFX11-NEXT: s_mov_b32 s11, 0x31016000 743; GFX11-NEXT: s_mov_b32 s14, s10 744; GFX11-NEXT: s_mov_b32 s15, s11 745; GFX11-NEXT: s_mov_b32 s6, s10 746; GFX11-NEXT: s_mov_b32 s7, s11 747; GFX11-NEXT: s_waitcnt lgkmcnt(0) 748; GFX11-NEXT: s_mov_b32 s12, s2 749; GFX11-NEXT: s_mov_b32 s13, s3 750; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 751; GFX11-NEXT: s_waitcnt vmcnt(0) 752; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 753; GFX11-NEXT: s_waitcnt vmcnt(0) 754; GFX11-NEXT: s_mov_b32 s8, s0 755; GFX11-NEXT: s_mov_b32 s9, s1 756; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1 757; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 758; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 759; GFX11-NEXT: s_endpgm 760; 761; GFX12-LABEL: fcmp_f16_ge: 762; GFX12: ; %bb.0: ; %entry 763; GFX12-NEXT: s_clause 0x1 764; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 765; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 766; GFX12-NEXT: s_mov_b32 s10, -1 767; GFX12-NEXT: s_mov_b32 s11, 0x31016000 768; GFX12-NEXT: s_mov_b32 s14, s10 769; GFX12-NEXT: s_mov_b32 s15, s11 770; GFX12-NEXT: s_mov_b32 s6, s10 771; GFX12-NEXT: s_mov_b32 s7, s11 772; GFX12-NEXT: s_wait_kmcnt 0x0 773; GFX12-NEXT: s_mov_b32 s12, s2 774; GFX12-NEXT: s_mov_b32 s13, s3 775; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS 776; GFX12-NEXT: s_wait_loadcnt 0x0 777; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS 778; GFX12-NEXT: s_wait_loadcnt 0x0 779; GFX12-NEXT: s_mov_b32 s8, s0 780; GFX12-NEXT: s_mov_b32 s9, s1 781; GFX12-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1 782; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 783; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 784; GFX12-NEXT: s_endpgm 785 ptr addrspace(1) %r, 786 ptr addrspace(1) %a, 787 ptr addrspace(1) %b) { 788entry: 789 %a.val = load volatile half, ptr addrspace(1) %a 790 %b.val = load volatile half, ptr addrspace(1) %b 
791 %r.val = fcmp oge half %a.val, %b.val 792 %r.val.sext = sext i1 %r.val to i32 793 store i32 %r.val.sext, ptr addrspace(1) %r 794 ret void 795} 796 797define amdgpu_kernel void @fcmp_f16_o( 798; SI-LABEL: fcmp_f16_o: 799; SI: ; %bb.0: ; %entry 800; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 801; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 802; SI-NEXT: s_mov_b32 s11, 0xf000 803; SI-NEXT: s_mov_b32 s10, -1 804; SI-NEXT: s_mov_b32 s14, s10 805; SI-NEXT: s_mov_b32 s15, s11 806; SI-NEXT: s_mov_b32 s6, s10 807; SI-NEXT: s_mov_b32 s7, s11 808; SI-NEXT: s_waitcnt lgkmcnt(0) 809; SI-NEXT: s_mov_b32 s12, s2 810; SI-NEXT: s_mov_b32 s13, s3 811; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 812; SI-NEXT: s_waitcnt vmcnt(0) 813; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc 814; SI-NEXT: s_waitcnt vmcnt(0) 815; SI-NEXT: s_mov_b32 s8, s0 816; SI-NEXT: s_mov_b32 s9, s1 817; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 818; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 819; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 820; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 821; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 822; SI-NEXT: s_endpgm 823; 824; VI-LABEL: fcmp_f16_o: 825; VI: ; %bb.0: ; %entry 826; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 827; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 828; VI-NEXT: s_mov_b32 s7, 0xf000 829; VI-NEXT: s_mov_b32 s6, -1 830; VI-NEXT: s_mov_b32 s14, s6 831; VI-NEXT: s_waitcnt lgkmcnt(0) 832; VI-NEXT: s_mov_b32 s12, s2 833; VI-NEXT: s_mov_b32 s13, s3 834; VI-NEXT: s_mov_b32 s15, s7 835; VI-NEXT: s_mov_b32 s10, s6 836; VI-NEXT: s_mov_b32 s11, s7 837; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 838; VI-NEXT: s_waitcnt vmcnt(0) 839; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 840; VI-NEXT: s_waitcnt vmcnt(0) 841; VI-NEXT: s_mov_b32 s4, s0 842; VI-NEXT: s_mov_b32 s5, s1 843; VI-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 844; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 845; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 846; VI-NEXT: s_endpgm 847; 848; 
GFX11-LABEL: fcmp_f16_o: 849; GFX11: ; %bb.0: ; %entry 850; GFX11-NEXT: s_clause 0x1 851; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 852; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 853; GFX11-NEXT: s_mov_b32 s10, -1 854; GFX11-NEXT: s_mov_b32 s11, 0x31016000 855; GFX11-NEXT: s_mov_b32 s14, s10 856; GFX11-NEXT: s_mov_b32 s15, s11 857; GFX11-NEXT: s_mov_b32 s6, s10 858; GFX11-NEXT: s_mov_b32 s7, s11 859; GFX11-NEXT: s_waitcnt lgkmcnt(0) 860; GFX11-NEXT: s_mov_b32 s12, s2 861; GFX11-NEXT: s_mov_b32 s13, s3 862; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 863; GFX11-NEXT: s_waitcnt vmcnt(0) 864; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 865; GFX11-NEXT: s_waitcnt vmcnt(0) 866; GFX11-NEXT: s_mov_b32 s8, s0 867; GFX11-NEXT: s_mov_b32 s9, s1 868; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 869; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 870; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 871; GFX11-NEXT: s_endpgm 872; 873; GFX12-LABEL: fcmp_f16_o: 874; GFX12: ; %bb.0: ; %entry 875; GFX12-NEXT: s_clause 0x1 876; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 877; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 878; GFX12-NEXT: s_mov_b32 s10, -1 879; GFX12-NEXT: s_mov_b32 s11, 0x31016000 880; GFX12-NEXT: s_mov_b32 s14, s10 881; GFX12-NEXT: s_mov_b32 s15, s11 882; GFX12-NEXT: s_mov_b32 s6, s10 883; GFX12-NEXT: s_mov_b32 s7, s11 884; GFX12-NEXT: s_wait_kmcnt 0x0 885; GFX12-NEXT: s_mov_b32 s12, s2 886; GFX12-NEXT: s_mov_b32 s13, s3 887; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS 888; GFX12-NEXT: s_wait_loadcnt 0x0 889; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS 890; GFX12-NEXT: s_wait_loadcnt 0x0 891; GFX12-NEXT: s_mov_b32 s8, s0 892; GFX12-NEXT: s_mov_b32 s9, s1 893; GFX12-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 894; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 895; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 896; GFX12-NEXT: s_endpgm 897 ptr addrspace(1) %r, 898 ptr addrspace(1) %a, 899 ptr 
addrspace(1) %b) { 900entry: 901 %a.val = load volatile half, ptr addrspace(1) %a 902 %b.val = load volatile half, ptr addrspace(1) %b 903 %r.val = fcmp ord half %a.val, %b.val 904 %r.val.sext = sext i1 %r.val to i32 905 store i32 %r.val.sext, ptr addrspace(1) %r 906 ret void 907} 908 909define amdgpu_kernel void @fcmp_f16_u( 910; SI-LABEL: fcmp_f16_u: 911; SI: ; %bb.0: ; %entry 912; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 913; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 914; SI-NEXT: s_mov_b32 s11, 0xf000 915; SI-NEXT: s_mov_b32 s10, -1 916; SI-NEXT: s_mov_b32 s14, s10 917; SI-NEXT: s_mov_b32 s15, s11 918; SI-NEXT: s_mov_b32 s6, s10 919; SI-NEXT: s_mov_b32 s7, s11 920; SI-NEXT: s_waitcnt lgkmcnt(0) 921; SI-NEXT: s_mov_b32 s12, s2 922; SI-NEXT: s_mov_b32 s13, s3 923; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 924; SI-NEXT: s_waitcnt vmcnt(0) 925; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc 926; SI-NEXT: s_waitcnt vmcnt(0) 927; SI-NEXT: s_mov_b32 s8, s0 928; SI-NEXT: s_mov_b32 s9, s1 929; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 930; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 931; SI-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 932; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 933; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 934; SI-NEXT: s_endpgm 935; 936; VI-LABEL: fcmp_f16_u: 937; VI: ; %bb.0: ; %entry 938; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 939; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 940; VI-NEXT: s_mov_b32 s7, 0xf000 941; VI-NEXT: s_mov_b32 s6, -1 942; VI-NEXT: s_mov_b32 s14, s6 943; VI-NEXT: s_waitcnt lgkmcnt(0) 944; VI-NEXT: s_mov_b32 s12, s2 945; VI-NEXT: s_mov_b32 s13, s3 946; VI-NEXT: s_mov_b32 s15, s7 947; VI-NEXT: s_mov_b32 s10, s6 948; VI-NEXT: s_mov_b32 s11, s7 949; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 950; VI-NEXT: s_waitcnt vmcnt(0) 951; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 952; VI-NEXT: s_waitcnt vmcnt(0) 953; VI-NEXT: s_mov_b32 s4, s0 954; VI-NEXT: s_mov_b32 s5, s1 955; VI-NEXT: v_cmp_u_f16_e32 vcc, v0, v1 
956; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 957; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 958; VI-NEXT: s_endpgm 959; 960; GFX11-LABEL: fcmp_f16_u: 961; GFX11: ; %bb.0: ; %entry 962; GFX11-NEXT: s_clause 0x1 963; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 964; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 965; GFX11-NEXT: s_mov_b32 s10, -1 966; GFX11-NEXT: s_mov_b32 s11, 0x31016000 967; GFX11-NEXT: s_mov_b32 s14, s10 968; GFX11-NEXT: s_mov_b32 s15, s11 969; GFX11-NEXT: s_mov_b32 s6, s10 970; GFX11-NEXT: s_mov_b32 s7, s11 971; GFX11-NEXT: s_waitcnt lgkmcnt(0) 972; GFX11-NEXT: s_mov_b32 s12, s2 973; GFX11-NEXT: s_mov_b32 s13, s3 974; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 975; GFX11-NEXT: s_waitcnt vmcnt(0) 976; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 977; GFX11-NEXT: s_waitcnt vmcnt(0) 978; GFX11-NEXT: s_mov_b32 s8, s0 979; GFX11-NEXT: s_mov_b32 s9, s1 980; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1 981; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 982; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 983; GFX11-NEXT: s_endpgm 984; 985; GFX12-LABEL: fcmp_f16_u: 986; GFX12: ; %bb.0: ; %entry 987; GFX12-NEXT: s_clause 0x1 988; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 989; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 990; GFX12-NEXT: s_mov_b32 s10, -1 991; GFX12-NEXT: s_mov_b32 s11, 0x31016000 992; GFX12-NEXT: s_mov_b32 s14, s10 993; GFX12-NEXT: s_mov_b32 s15, s11 994; GFX12-NEXT: s_mov_b32 s6, s10 995; GFX12-NEXT: s_mov_b32 s7, s11 996; GFX12-NEXT: s_wait_kmcnt 0x0 997; GFX12-NEXT: s_mov_b32 s12, s2 998; GFX12-NEXT: s_mov_b32 s13, s3 999; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS 1000; GFX12-NEXT: s_wait_loadcnt 0x0 1001; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS 1002; GFX12-NEXT: s_wait_loadcnt 0x0 1003; GFX12-NEXT: s_mov_b32 s8, s0 1004; GFX12-NEXT: s_mov_b32 s9, s1 1005; GFX12-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1 1006; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 
1007; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 1008; GFX12-NEXT: s_endpgm 1009 ptr addrspace(1) %r, 1010 ptr addrspace(1) %a, 1011 ptr addrspace(1) %b) { 1012entry: 1013 %a.val = load volatile half, ptr addrspace(1) %a 1014 %b.val = load volatile half, ptr addrspace(1) %b 1015 %r.val = fcmp uno half %a.val, %b.val 1016 %r.val.sext = sext i1 %r.val to i32 1017 store i32 %r.val.sext, ptr addrspace(1) %r 1018 ret void 1019} 1020 1021define amdgpu_kernel void @fcmp_f16_nge( 1022; SI-LABEL: fcmp_f16_nge: 1023; SI: ; %bb.0: ; %entry 1024; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1025; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1026; SI-NEXT: s_mov_b32 s11, 0xf000 1027; SI-NEXT: s_mov_b32 s10, -1 1028; SI-NEXT: s_mov_b32 s14, s10 1029; SI-NEXT: s_mov_b32 s15, s11 1030; SI-NEXT: s_mov_b32 s6, s10 1031; SI-NEXT: s_mov_b32 s7, s11 1032; SI-NEXT: s_waitcnt lgkmcnt(0) 1033; SI-NEXT: s_mov_b32 s12, s2 1034; SI-NEXT: s_mov_b32 s13, s3 1035; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 1036; SI-NEXT: s_waitcnt vmcnt(0) 1037; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc 1038; SI-NEXT: s_waitcnt vmcnt(0) 1039; SI-NEXT: s_mov_b32 s8, s0 1040; SI-NEXT: s_mov_b32 s9, s1 1041; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1042; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1043; SI-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 1044; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1045; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 1046; SI-NEXT: s_endpgm 1047; 1048; VI-LABEL: fcmp_f16_nge: 1049; VI: ; %bb.0: ; %entry 1050; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1051; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 1052; VI-NEXT: s_mov_b32 s7, 0xf000 1053; VI-NEXT: s_mov_b32 s6, -1 1054; VI-NEXT: s_mov_b32 s14, s6 1055; VI-NEXT: s_waitcnt lgkmcnt(0) 1056; VI-NEXT: s_mov_b32 s12, s2 1057; VI-NEXT: s_mov_b32 s13, s3 1058; VI-NEXT: s_mov_b32 s15, s7 1059; VI-NEXT: s_mov_b32 s10, s6 1060; VI-NEXT: s_mov_b32 s11, s7 1061; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 1062; VI-NEXT: 
s_waitcnt vmcnt(0) 1063; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 1064; VI-NEXT: s_waitcnt vmcnt(0) 1065; VI-NEXT: s_mov_b32 s4, s0 1066; VI-NEXT: s_mov_b32 s5, s1 1067; VI-NEXT: v_cmp_nge_f16_e32 vcc, v0, v1 1068; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1069; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1070; VI-NEXT: s_endpgm 1071; 1072; GFX11-LABEL: fcmp_f16_nge: 1073; GFX11: ; %bb.0: ; %entry 1074; GFX11-NEXT: s_clause 0x1 1075; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1076; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1077; GFX11-NEXT: s_mov_b32 s10, -1 1078; GFX11-NEXT: s_mov_b32 s11, 0x31016000 1079; GFX11-NEXT: s_mov_b32 s14, s10 1080; GFX11-NEXT: s_mov_b32 s15, s11 1081; GFX11-NEXT: s_mov_b32 s6, s10 1082; GFX11-NEXT: s_mov_b32 s7, s11 1083; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1084; GFX11-NEXT: s_mov_b32 s12, s2 1085; GFX11-NEXT: s_mov_b32 s13, s3 1086; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 1087; GFX11-NEXT: s_waitcnt vmcnt(0) 1088; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 1089; GFX11-NEXT: s_waitcnt vmcnt(0) 1090; GFX11-NEXT: s_mov_b32 s8, s0 1091; GFX11-NEXT: s_mov_b32 s9, s1 1092; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1 1093; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 1094; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 1095; GFX11-NEXT: s_endpgm 1096; 1097; GFX12-LABEL: fcmp_f16_nge: 1098; GFX12: ; %bb.0: ; %entry 1099; GFX12-NEXT: s_clause 0x1 1100; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1101; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1102; GFX12-NEXT: s_mov_b32 s10, -1 1103; GFX12-NEXT: s_mov_b32 s11, 0x31016000 1104; GFX12-NEXT: s_mov_b32 s14, s10 1105; GFX12-NEXT: s_mov_b32 s15, s11 1106; GFX12-NEXT: s_mov_b32 s6, s10 1107; GFX12-NEXT: s_mov_b32 s7, s11 1108; GFX12-NEXT: s_wait_kmcnt 0x0 1109; GFX12-NEXT: s_mov_b32 s12, s2 1110; GFX12-NEXT: s_mov_b32 s13, s3 1111; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS 1112; GFX12-NEXT: s_wait_loadcnt 0x0 1113; 
GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS 1114; GFX12-NEXT: s_wait_loadcnt 0x0 1115; GFX12-NEXT: s_mov_b32 s8, s0 1116; GFX12-NEXT: s_mov_b32 s9, s1 1117; GFX12-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1 1118; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 1119; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 1120; GFX12-NEXT: s_endpgm 1121 ptr addrspace(1) %r, 1122 ptr addrspace(1) %a, 1123 ptr addrspace(1) %b) { 1124entry: 1125 %a.val = load volatile half, ptr addrspace(1) %a 1126 %b.val = load volatile half, ptr addrspace(1) %b 1127 %r.val = fcmp ult half %a.val, %b.val 1128 %r.val.sext = sext i1 %r.val to i32 1129 store i32 %r.val.sext, ptr addrspace(1) %r 1130 ret void 1131} 1132 1133define amdgpu_kernel void @fcmp_f16_nlg( 1134; SI-LABEL: fcmp_f16_nlg: 1135; SI: ; %bb.0: ; %entry 1136; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1137; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1138; SI-NEXT: s_mov_b32 s11, 0xf000 1139; SI-NEXT: s_mov_b32 s10, -1 1140; SI-NEXT: s_mov_b32 s14, s10 1141; SI-NEXT: s_mov_b32 s15, s11 1142; SI-NEXT: s_mov_b32 s6, s10 1143; SI-NEXT: s_mov_b32 s7, s11 1144; SI-NEXT: s_waitcnt lgkmcnt(0) 1145; SI-NEXT: s_mov_b32 s12, s2 1146; SI-NEXT: s_mov_b32 s13, s3 1147; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 1148; SI-NEXT: s_waitcnt vmcnt(0) 1149; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc 1150; SI-NEXT: s_waitcnt vmcnt(0) 1151; SI-NEXT: s_mov_b32 s8, s0 1152; SI-NEXT: s_mov_b32 s9, s1 1153; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1154; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1155; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 1156; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1157; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 1158; SI-NEXT: s_endpgm 1159; 1160; VI-LABEL: fcmp_f16_nlg: 1161; VI: ; %bb.0: ; %entry 1162; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1163; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 1164; VI-NEXT: s_mov_b32 s7, 0xf000 1165; VI-NEXT: s_mov_b32 s6, -1 1166; VI-NEXT: s_mov_b32 s14, s6 1167; 
VI-NEXT: s_waitcnt lgkmcnt(0) 1168; VI-NEXT: s_mov_b32 s12, s2 1169; VI-NEXT: s_mov_b32 s13, s3 1170; VI-NEXT: s_mov_b32 s15, s7 1171; VI-NEXT: s_mov_b32 s10, s6 1172; VI-NEXT: s_mov_b32 s11, s7 1173; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 1174; VI-NEXT: s_waitcnt vmcnt(0) 1175; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 1176; VI-NEXT: s_waitcnt vmcnt(0) 1177; VI-NEXT: s_mov_b32 s4, s0 1178; VI-NEXT: s_mov_b32 s5, s1 1179; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v0, v1 1180; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1181; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1182; VI-NEXT: s_endpgm 1183; 1184; GFX11-LABEL: fcmp_f16_nlg: 1185; GFX11: ; %bb.0: ; %entry 1186; GFX11-NEXT: s_clause 0x1 1187; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1188; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1189; GFX11-NEXT: s_mov_b32 s10, -1 1190; GFX11-NEXT: s_mov_b32 s11, 0x31016000 1191; GFX11-NEXT: s_mov_b32 s14, s10 1192; GFX11-NEXT: s_mov_b32 s15, s11 1193; GFX11-NEXT: s_mov_b32 s6, s10 1194; GFX11-NEXT: s_mov_b32 s7, s11 1195; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1196; GFX11-NEXT: s_mov_b32 s12, s2 1197; GFX11-NEXT: s_mov_b32 s13, s3 1198; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 1199; GFX11-NEXT: s_waitcnt vmcnt(0) 1200; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 1201; GFX11-NEXT: s_waitcnt vmcnt(0) 1202; GFX11-NEXT: s_mov_b32 s8, s0 1203; GFX11-NEXT: s_mov_b32 s9, s1 1204; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1 1205; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 1206; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 1207; GFX11-NEXT: s_endpgm 1208; 1209; GFX12-LABEL: fcmp_f16_nlg: 1210; GFX12: ; %bb.0: ; %entry 1211; GFX12-NEXT: s_clause 0x1 1212; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1213; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1214; GFX12-NEXT: s_mov_b32 s10, -1 1215; GFX12-NEXT: s_mov_b32 s11, 0x31016000 1216; GFX12-NEXT: s_mov_b32 s14, s10 1217; GFX12-NEXT: s_mov_b32 s15, s11 1218; GFX12-NEXT: s_mov_b32 s6, s10 
1219; GFX12-NEXT: s_mov_b32 s7, s11 1220; GFX12-NEXT: s_wait_kmcnt 0x0 1221; GFX12-NEXT: s_mov_b32 s12, s2 1222; GFX12-NEXT: s_mov_b32 s13, s3 1223; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS 1224; GFX12-NEXT: s_wait_loadcnt 0x0 1225; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS 1226; GFX12-NEXT: s_wait_loadcnt 0x0 1227; GFX12-NEXT: s_mov_b32 s8, s0 1228; GFX12-NEXT: s_mov_b32 s9, s1 1229; GFX12-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1 1230; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 1231; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 1232; GFX12-NEXT: s_endpgm 1233 ptr addrspace(1) %r, 1234 ptr addrspace(1) %a, 1235 ptr addrspace(1) %b) { 1236entry: 1237 %a.val = load volatile half, ptr addrspace(1) %a 1238 %b.val = load volatile half, ptr addrspace(1) %b 1239 %r.val = fcmp ueq half %a.val, %b.val 1240 %r.val.sext = sext i1 %r.val to i32 1241 store i32 %r.val.sext, ptr addrspace(1) %r 1242 ret void 1243} 1244 1245define amdgpu_kernel void @fcmp_f16_ngt( 1246; SI-LABEL: fcmp_f16_ngt: 1247; SI: ; %bb.0: ; %entry 1248; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1249; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1250; SI-NEXT: s_mov_b32 s11, 0xf000 1251; SI-NEXT: s_mov_b32 s10, -1 1252; SI-NEXT: s_mov_b32 s14, s10 1253; SI-NEXT: s_mov_b32 s15, s11 1254; SI-NEXT: s_mov_b32 s6, s10 1255; SI-NEXT: s_mov_b32 s7, s11 1256; SI-NEXT: s_waitcnt lgkmcnt(0) 1257; SI-NEXT: s_mov_b32 s12, s2 1258; SI-NEXT: s_mov_b32 s13, s3 1259; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 1260; SI-NEXT: s_waitcnt vmcnt(0) 1261; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc 1262; SI-NEXT: s_waitcnt vmcnt(0) 1263; SI-NEXT: s_mov_b32 s8, s0 1264; SI-NEXT: s_mov_b32 s9, s1 1265; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1266; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1267; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 1268; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1269; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 1270; SI-NEXT: s_endpgm 1271; 1272; 
VI-LABEL: fcmp_f16_ngt: 1273; VI: ; %bb.0: ; %entry 1274; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1275; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 1276; VI-NEXT: s_mov_b32 s7, 0xf000 1277; VI-NEXT: s_mov_b32 s6, -1 1278; VI-NEXT: s_mov_b32 s14, s6 1279; VI-NEXT: s_waitcnt lgkmcnt(0) 1280; VI-NEXT: s_mov_b32 s12, s2 1281; VI-NEXT: s_mov_b32 s13, s3 1282; VI-NEXT: s_mov_b32 s15, s7 1283; VI-NEXT: s_mov_b32 s10, s6 1284; VI-NEXT: s_mov_b32 s11, s7 1285; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 1286; VI-NEXT: s_waitcnt vmcnt(0) 1287; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 1288; VI-NEXT: s_waitcnt vmcnt(0) 1289; VI-NEXT: s_mov_b32 s4, s0 1290; VI-NEXT: s_mov_b32 s5, s1 1291; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 1292; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1293; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1294; VI-NEXT: s_endpgm 1295; 1296; GFX11-LABEL: fcmp_f16_ngt: 1297; GFX11: ; %bb.0: ; %entry 1298; GFX11-NEXT: s_clause 0x1 1299; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1300; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1301; GFX11-NEXT: s_mov_b32 s10, -1 1302; GFX11-NEXT: s_mov_b32 s11, 0x31016000 1303; GFX11-NEXT: s_mov_b32 s14, s10 1304; GFX11-NEXT: s_mov_b32 s15, s11 1305; GFX11-NEXT: s_mov_b32 s6, s10 1306; GFX11-NEXT: s_mov_b32 s7, s11 1307; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1308; GFX11-NEXT: s_mov_b32 s12, s2 1309; GFX11-NEXT: s_mov_b32 s13, s3 1310; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 1311; GFX11-NEXT: s_waitcnt vmcnt(0) 1312; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 1313; GFX11-NEXT: s_waitcnt vmcnt(0) 1314; GFX11-NEXT: s_mov_b32 s8, s0 1315; GFX11-NEXT: s_mov_b32 s9, s1 1316; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 1317; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 1318; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 1319; GFX11-NEXT: s_endpgm 1320; 1321; GFX12-LABEL: fcmp_f16_ngt: 1322; GFX12: ; %bb.0: ; %entry 1323; GFX12-NEXT: s_clause 0x1 1324; GFX12-NEXT: s_load_b128 
s[0:3], s[4:5], 0x24 1325; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1326; GFX12-NEXT: s_mov_b32 s10, -1 1327; GFX12-NEXT: s_mov_b32 s11, 0x31016000 1328; GFX12-NEXT: s_mov_b32 s14, s10 1329; GFX12-NEXT: s_mov_b32 s15, s11 1330; GFX12-NEXT: s_mov_b32 s6, s10 1331; GFX12-NEXT: s_mov_b32 s7, s11 1332; GFX12-NEXT: s_wait_kmcnt 0x0 1333; GFX12-NEXT: s_mov_b32 s12, s2 1334; GFX12-NEXT: s_mov_b32 s13, s3 1335; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS 1336; GFX12-NEXT: s_wait_loadcnt 0x0 1337; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS 1338; GFX12-NEXT: s_wait_loadcnt 0x0 1339; GFX12-NEXT: s_mov_b32 s8, s0 1340; GFX12-NEXT: s_mov_b32 s9, s1 1341; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 1342; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 1343; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 1344; GFX12-NEXT: s_endpgm 1345 ptr addrspace(1) %r, 1346 ptr addrspace(1) %a, 1347 ptr addrspace(1) %b) { 1348entry: 1349 %a.val = load volatile half, ptr addrspace(1) %a 1350 %b.val = load volatile half, ptr addrspace(1) %b 1351 %r.val = fcmp ule half %a.val, %b.val 1352 %r.val.sext = sext i1 %r.val to i32 1353 store i32 %r.val.sext, ptr addrspace(1) %r 1354 ret void 1355} 1356 1357define amdgpu_kernel void @fcmp_f16_nle( 1358; SI-LABEL: fcmp_f16_nle: 1359; SI: ; %bb.0: ; %entry 1360; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1361; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1362; SI-NEXT: s_mov_b32 s11, 0xf000 1363; SI-NEXT: s_mov_b32 s10, -1 1364; SI-NEXT: s_mov_b32 s14, s10 1365; SI-NEXT: s_mov_b32 s15, s11 1366; SI-NEXT: s_mov_b32 s6, s10 1367; SI-NEXT: s_mov_b32 s7, s11 1368; SI-NEXT: s_waitcnt lgkmcnt(0) 1369; SI-NEXT: s_mov_b32 s12, s2 1370; SI-NEXT: s_mov_b32 s13, s3 1371; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 1372; SI-NEXT: s_waitcnt vmcnt(0) 1373; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc 1374; SI-NEXT: s_waitcnt vmcnt(0) 1375; SI-NEXT: s_mov_b32 s8, s0 1376; SI-NEXT: s_mov_b32 s9, s1 
1377; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1378; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1379; SI-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 1380; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1381; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 1382; SI-NEXT: s_endpgm 1383; 1384; VI-LABEL: fcmp_f16_nle: 1385; VI: ; %bb.0: ; %entry 1386; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1387; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 1388; VI-NEXT: s_mov_b32 s7, 0xf000 1389; VI-NEXT: s_mov_b32 s6, -1 1390; VI-NEXT: s_mov_b32 s14, s6 1391; VI-NEXT: s_waitcnt lgkmcnt(0) 1392; VI-NEXT: s_mov_b32 s12, s2 1393; VI-NEXT: s_mov_b32 s13, s3 1394; VI-NEXT: s_mov_b32 s15, s7 1395; VI-NEXT: s_mov_b32 s10, s6 1396; VI-NEXT: s_mov_b32 s11, s7 1397; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 1398; VI-NEXT: s_waitcnt vmcnt(0) 1399; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 1400; VI-NEXT: s_waitcnt vmcnt(0) 1401; VI-NEXT: s_mov_b32 s4, s0 1402; VI-NEXT: s_mov_b32 s5, s1 1403; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 1404; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1405; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1406; VI-NEXT: s_endpgm 1407; 1408; GFX11-LABEL: fcmp_f16_nle: 1409; GFX11: ; %bb.0: ; %entry 1410; GFX11-NEXT: s_clause 0x1 1411; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1412; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1413; GFX11-NEXT: s_mov_b32 s10, -1 1414; GFX11-NEXT: s_mov_b32 s11, 0x31016000 1415; GFX11-NEXT: s_mov_b32 s14, s10 1416; GFX11-NEXT: s_mov_b32 s15, s11 1417; GFX11-NEXT: s_mov_b32 s6, s10 1418; GFX11-NEXT: s_mov_b32 s7, s11 1419; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1420; GFX11-NEXT: s_mov_b32 s12, s2 1421; GFX11-NEXT: s_mov_b32 s13, s3 1422; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 1423; GFX11-NEXT: s_waitcnt vmcnt(0) 1424; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 1425; GFX11-NEXT: s_waitcnt vmcnt(0) 1426; GFX11-NEXT: s_mov_b32 s8, s0 1427; GFX11-NEXT: s_mov_b32 s9, s1 1428; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 
1429; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 1430; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 1431; GFX11-NEXT: s_endpgm 1432; 1433; GFX12-LABEL: fcmp_f16_nle: 1434; GFX12: ; %bb.0: ; %entry 1435; GFX12-NEXT: s_clause 0x1 1436; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1437; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1438; GFX12-NEXT: s_mov_b32 s10, -1 1439; GFX12-NEXT: s_mov_b32 s11, 0x31016000 1440; GFX12-NEXT: s_mov_b32 s14, s10 1441; GFX12-NEXT: s_mov_b32 s15, s11 1442; GFX12-NEXT: s_mov_b32 s6, s10 1443; GFX12-NEXT: s_mov_b32 s7, s11 1444; GFX12-NEXT: s_wait_kmcnt 0x0 1445; GFX12-NEXT: s_mov_b32 s12, s2 1446; GFX12-NEXT: s_mov_b32 s13, s3 1447; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS 1448; GFX12-NEXT: s_wait_loadcnt 0x0 1449; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS 1450; GFX12-NEXT: s_wait_loadcnt 0x0 1451; GFX12-NEXT: s_mov_b32 s8, s0 1452; GFX12-NEXT: s_mov_b32 s9, s1 1453; GFX12-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 1454; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 1455; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 1456; GFX12-NEXT: s_endpgm 1457 ptr addrspace(1) %r, 1458 ptr addrspace(1) %a, 1459 ptr addrspace(1) %b) { 1460entry: 1461 %a.val = load volatile half, ptr addrspace(1) %a 1462 %b.val = load volatile half, ptr addrspace(1) %b 1463 %r.val = fcmp ugt half %a.val, %b.val 1464 %r.val.sext = sext i1 %r.val to i32 1465 store i32 %r.val.sext, ptr addrspace(1) %r 1466 ret void 1467} 1468 1469define amdgpu_kernel void @fcmp_f16_neq( 1470; SI-LABEL: fcmp_f16_neq: 1471; SI: ; %bb.0: ; %entry 1472; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1473; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1474; SI-NEXT: s_mov_b32 s11, 0xf000 1475; SI-NEXT: s_mov_b32 s10, -1 1476; SI-NEXT: s_mov_b32 s14, s10 1477; SI-NEXT: s_mov_b32 s15, s11 1478; SI-NEXT: s_mov_b32 s6, s10 1479; SI-NEXT: s_mov_b32 s7, s11 1480; SI-NEXT: s_waitcnt lgkmcnt(0) 1481; SI-NEXT: s_mov_b32 s12, s2 1482; SI-NEXT: 
s_mov_b32 s13, s3 1483; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 1484; SI-NEXT: s_waitcnt vmcnt(0) 1485; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc 1486; SI-NEXT: s_waitcnt vmcnt(0) 1487; SI-NEXT: s_mov_b32 s8, s0 1488; SI-NEXT: s_mov_b32 s9, s1 1489; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1490; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1491; SI-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 1492; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1493; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 1494; SI-NEXT: s_endpgm 1495; 1496; VI-LABEL: fcmp_f16_neq: 1497; VI: ; %bb.0: ; %entry 1498; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1499; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 1500; VI-NEXT: s_mov_b32 s7, 0xf000 1501; VI-NEXT: s_mov_b32 s6, -1 1502; VI-NEXT: s_mov_b32 s14, s6 1503; VI-NEXT: s_waitcnt lgkmcnt(0) 1504; VI-NEXT: s_mov_b32 s12, s2 1505; VI-NEXT: s_mov_b32 s13, s3 1506; VI-NEXT: s_mov_b32 s15, s7 1507; VI-NEXT: s_mov_b32 s10, s6 1508; VI-NEXT: s_mov_b32 s11, s7 1509; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 1510; VI-NEXT: s_waitcnt vmcnt(0) 1511; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 1512; VI-NEXT: s_waitcnt vmcnt(0) 1513; VI-NEXT: s_mov_b32 s4, s0 1514; VI-NEXT: s_mov_b32 s5, s1 1515; VI-NEXT: v_cmp_neq_f16_e32 vcc, v0, v1 1516; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1517; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1518; VI-NEXT: s_endpgm 1519; 1520; GFX11-LABEL: fcmp_f16_neq: 1521; GFX11: ; %bb.0: ; %entry 1522; GFX11-NEXT: s_clause 0x1 1523; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1524; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1525; GFX11-NEXT: s_mov_b32 s10, -1 1526; GFX11-NEXT: s_mov_b32 s11, 0x31016000 1527; GFX11-NEXT: s_mov_b32 s14, s10 1528; GFX11-NEXT: s_mov_b32 s15, s11 1529; GFX11-NEXT: s_mov_b32 s6, s10 1530; GFX11-NEXT: s_mov_b32 s7, s11 1531; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1532; GFX11-NEXT: s_mov_b32 s12, s2 1533; GFX11-NEXT: s_mov_b32 s13, s3 1534; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 
glc dlc 1535; GFX11-NEXT: s_waitcnt vmcnt(0) 1536; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 1537; GFX11-NEXT: s_waitcnt vmcnt(0) 1538; GFX11-NEXT: s_mov_b32 s8, s0 1539; GFX11-NEXT: s_mov_b32 s9, s1 1540; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1 1541; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 1542; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 1543; GFX11-NEXT: s_endpgm 1544; 1545; GFX12-LABEL: fcmp_f16_neq: 1546; GFX12: ; %bb.0: ; %entry 1547; GFX12-NEXT: s_clause 0x1 1548; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1549; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1550; GFX12-NEXT: s_mov_b32 s10, -1 1551; GFX12-NEXT: s_mov_b32 s11, 0x31016000 1552; GFX12-NEXT: s_mov_b32 s14, s10 1553; GFX12-NEXT: s_mov_b32 s15, s11 1554; GFX12-NEXT: s_mov_b32 s6, s10 1555; GFX12-NEXT: s_mov_b32 s7, s11 1556; GFX12-NEXT: s_wait_kmcnt 0x0 1557; GFX12-NEXT: s_mov_b32 s12, s2 1558; GFX12-NEXT: s_mov_b32 s13, s3 1559; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS 1560; GFX12-NEXT: s_wait_loadcnt 0x0 1561; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS 1562; GFX12-NEXT: s_wait_loadcnt 0x0 1563; GFX12-NEXT: s_mov_b32 s8, s0 1564; GFX12-NEXT: s_mov_b32 s9, s1 1565; GFX12-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1 1566; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 1567; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 1568; GFX12-NEXT: s_endpgm 1569 ptr addrspace(1) %r, 1570 ptr addrspace(1) %a, 1571 ptr addrspace(1) %b) { 1572entry: 1573 %a.val = load volatile half, ptr addrspace(1) %a 1574 %b.val = load volatile half, ptr addrspace(1) %b 1575 %r.val = fcmp une half %a.val, %b.val 1576 %r.val.sext = sext i1 %r.val to i32 1577 store i32 %r.val.sext, ptr addrspace(1) %r 1578 ret void 1579} 1580 1581define amdgpu_kernel void @fcmp_f16_nlt( 1582; SI-LABEL: fcmp_f16_nlt: 1583; SI: ; %bb.0: ; %entry 1584; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1585; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1586; SI-NEXT: s_mov_b32 
s11, 0xf000 1587; SI-NEXT: s_mov_b32 s10, -1 1588; SI-NEXT: s_mov_b32 s14, s10 1589; SI-NEXT: s_mov_b32 s15, s11 1590; SI-NEXT: s_mov_b32 s6, s10 1591; SI-NEXT: s_mov_b32 s7, s11 1592; SI-NEXT: s_waitcnt lgkmcnt(0) 1593; SI-NEXT: s_mov_b32 s12, s2 1594; SI-NEXT: s_mov_b32 s13, s3 1595; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 1596; SI-NEXT: s_waitcnt vmcnt(0) 1597; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 glc 1598; SI-NEXT: s_waitcnt vmcnt(0) 1599; SI-NEXT: s_mov_b32 s8, s0 1600; SI-NEXT: s_mov_b32 s9, s1 1601; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1602; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1603; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 1604; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1605; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 1606; SI-NEXT: s_endpgm 1607; 1608; VI-LABEL: fcmp_f16_nlt: 1609; VI: ; %bb.0: ; %entry 1610; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1611; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 1612; VI-NEXT: s_mov_b32 s7, 0xf000 1613; VI-NEXT: s_mov_b32 s6, -1 1614; VI-NEXT: s_mov_b32 s14, s6 1615; VI-NEXT: s_waitcnt lgkmcnt(0) 1616; VI-NEXT: s_mov_b32 s12, s2 1617; VI-NEXT: s_mov_b32 s13, s3 1618; VI-NEXT: s_mov_b32 s15, s7 1619; VI-NEXT: s_mov_b32 s10, s6 1620; VI-NEXT: s_mov_b32 s11, s7 1621; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc 1622; VI-NEXT: s_waitcnt vmcnt(0) 1623; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc 1624; VI-NEXT: s_waitcnt vmcnt(0) 1625; VI-NEXT: s_mov_b32 s4, s0 1626; VI-NEXT: s_mov_b32 s5, s1 1627; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1 1628; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1629; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1630; VI-NEXT: s_endpgm 1631; 1632; GFX11-LABEL: fcmp_f16_nlt: 1633; GFX11: ; %bb.0: ; %entry 1634; GFX11-NEXT: s_clause 0x1 1635; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1636; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1637; GFX11-NEXT: s_mov_b32 s10, -1 1638; GFX11-NEXT: s_mov_b32 s11, 0x31016000 1639; GFX11-NEXT: s_mov_b32 s14, s10 1640; 
GFX11-NEXT: s_mov_b32 s15, s11 1641; GFX11-NEXT: s_mov_b32 s6, s10 1642; GFX11-NEXT: s_mov_b32 s7, s11 1643; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1644; GFX11-NEXT: s_mov_b32 s12, s2 1645; GFX11-NEXT: s_mov_b32 s13, s3 1646; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc 1647; GFX11-NEXT: s_waitcnt vmcnt(0) 1648; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc 1649; GFX11-NEXT: s_waitcnt vmcnt(0) 1650; GFX11-NEXT: s_mov_b32 s8, s0 1651; GFX11-NEXT: s_mov_b32 s9, s1 1652; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 1653; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 1654; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 1655; GFX11-NEXT: s_endpgm 1656; 1657; GFX12-LABEL: fcmp_f16_nlt: 1658; GFX12: ; %bb.0: ; %entry 1659; GFX12-NEXT: s_clause 0x1 1660; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1661; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1662; GFX12-NEXT: s_mov_b32 s10, -1 1663; GFX12-NEXT: s_mov_b32 s11, 0x31016000 1664; GFX12-NEXT: s_mov_b32 s14, s10 1665; GFX12-NEXT: s_mov_b32 s15, s11 1666; GFX12-NEXT: s_mov_b32 s6, s10 1667; GFX12-NEXT: s_mov_b32 s7, s11 1668; GFX12-NEXT: s_wait_kmcnt 0x0 1669; GFX12-NEXT: s_mov_b32 s12, s2 1670; GFX12-NEXT: s_mov_b32 s13, s3 1671; GFX12-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS 1672; GFX12-NEXT: s_wait_loadcnt 0x0 1673; GFX12-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS 1674; GFX12-NEXT: s_wait_loadcnt 0x0 1675; GFX12-NEXT: s_mov_b32 s8, s0 1676; GFX12-NEXT: s_mov_b32 s9, s1 1677; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 1678; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 1679; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 1680; GFX12-NEXT: s_endpgm 1681 ptr addrspace(1) %r, 1682 ptr addrspace(1) %a, 1683 ptr addrspace(1) %b) { 1684entry: 1685 %a.val = load volatile half, ptr addrspace(1) %a 1686 %b.val = load volatile half, ptr addrspace(1) %b 1687 %r.val = fcmp uge half %a.val, %b.val 1688 %r.val.sext = sext i1 %r.val to i32 1689 store i32 %r.val.sext, ptr 
addrspace(1) %r 1690 ret void 1691} 1692 1693define amdgpu_kernel void @fcmp_v2f16_lt( 1694; SI-LABEL: fcmp_v2f16_lt: 1695; SI: ; %bb.0: ; %entry 1696; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1697; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1698; SI-NEXT: s_mov_b32 s11, 0xf000 1699; SI-NEXT: s_mov_b32 s10, -1 1700; SI-NEXT: s_mov_b32 s14, s10 1701; SI-NEXT: s_mov_b32 s15, s11 1702; SI-NEXT: s_waitcnt lgkmcnt(0) 1703; SI-NEXT: s_mov_b32 s12, s2 1704; SI-NEXT: s_mov_b32 s13, s3 1705; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 1706; SI-NEXT: s_mov_b32 s6, s10 1707; SI-NEXT: s_mov_b32 s7, s11 1708; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 1709; SI-NEXT: s_mov_b32 s8, s0 1710; SI-NEXT: s_mov_b32 s9, s1 1711; SI-NEXT: s_waitcnt vmcnt(1) 1712; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 1713; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1714; SI-NEXT: s_waitcnt vmcnt(0) 1715; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 1716; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1717; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 1718; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1719; SI-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 1720; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1721; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v1 1722; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 1723; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 1724; SI-NEXT: s_endpgm 1725; 1726; VI-LABEL: fcmp_v2f16_lt: 1727; VI: ; %bb.0: ; %entry 1728; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1729; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 1730; VI-NEXT: s_mov_b32 s7, 0xf000 1731; VI-NEXT: s_mov_b32 s6, -1 1732; VI-NEXT: s_mov_b32 s10, s6 1733; VI-NEXT: s_mov_b32 s11, s7 1734; VI-NEXT: s_waitcnt lgkmcnt(0) 1735; VI-NEXT: s_mov_b32 s12, s2 1736; VI-NEXT: s_mov_b32 s13, s3 1737; VI-NEXT: s_mov_b32 s14, s6 1738; VI-NEXT: s_mov_b32 s15, s7 1739; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1740; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 1741; VI-NEXT: s_mov_b32 s4, s0 1742; VI-NEXT: s_mov_b32 s5, s1 1743; VI-NEXT: s_waitcnt vmcnt(1) 1744; VI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v0 1745; VI-NEXT: s_waitcnt vmcnt(0) 1746; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 1747; VI-NEXT: v_cmp_lt_f16_e32 vcc, v1, v0 1748; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1749; VI-NEXT: v_cmp_lt_f16_e32 vcc, v3, v2 1750; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 1751; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1752; VI-NEXT: s_endpgm 1753; 1754; GFX11-LABEL: fcmp_v2f16_lt: 1755; GFX11: ; %bb.0: ; %entry 1756; GFX11-NEXT: s_clause 0x1 1757; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1758; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1759; GFX11-NEXT: s_mov_b32 s10, -1 1760; GFX11-NEXT: s_mov_b32 s11, 0x31016000 1761; GFX11-NEXT: s_mov_b32 s6, s10 1762; GFX11-NEXT: s_mov_b32 s7, s11 1763; GFX11-NEXT: s_mov_b32 s14, s10 1764; GFX11-NEXT: s_mov_b32 s15, s11 1765; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1766; GFX11-NEXT: s_mov_b32 s12, s2 1767; GFX11-NEXT: s_mov_b32 s13, s3 1768; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 1769; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 1770; GFX11-NEXT: s_mov_b32 s8, s0 1771; GFX11-NEXT: s_mov_b32 s9, s1 1772; GFX11-NEXT: s_waitcnt vmcnt(1) 1773; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 1774; GFX11-NEXT: s_waitcnt vmcnt(0) 1775; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 1776; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0 1777; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 1778; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 1779; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3, v2 1780; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 1781; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 1782; GFX11-NEXT: s_endpgm 1783; 1784; GFX12-LABEL: fcmp_v2f16_lt: 1785; GFX12: ; %bb.0: ; %entry 1786; GFX12-NEXT: s_clause 0x1 1787; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1788; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1789; GFX12-NEXT: s_mov_b32 s10, -1 1790; GFX12-NEXT: s_mov_b32 s11, 0x31016000 1791; GFX12-NEXT: s_mov_b32 s6, s10 1792; GFX12-NEXT: s_mov_b32 s7, s11 1793; GFX12-NEXT: s_mov_b32 s14, s10 
1794; GFX12-NEXT: s_mov_b32 s15, s11 1795; GFX12-NEXT: s_wait_kmcnt 0x0 1796; GFX12-NEXT: s_mov_b32 s12, s2 1797; GFX12-NEXT: s_mov_b32 s13, s3 1798; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null 1799; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null 1800; GFX12-NEXT: s_mov_b32 s8, s0 1801; GFX12-NEXT: s_mov_b32 s9, s1 1802; GFX12-NEXT: s_wait_loadcnt 0x1 1803; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0 1804; GFX12-NEXT: s_wait_loadcnt 0x0 1805; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1 1806; GFX12-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0 1807; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 1808; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) 1809; GFX12-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3, v2 1810; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 1811; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null 1812; GFX12-NEXT: s_endpgm 1813 ptr addrspace(1) %r, 1814 ptr addrspace(1) %a, 1815 ptr addrspace(1) %b) { 1816entry: 1817 %a.val = load <2 x half>, ptr addrspace(1) %a 1818 %b.val = load <2 x half>, ptr addrspace(1) %b 1819 %r.val = fcmp olt <2 x half> %a.val, %b.val 1820 %r.val.sext = sext <2 x i1> %r.val to <2 x i32> 1821 store <2 x i32> %r.val.sext, ptr addrspace(1) %r 1822 ret void 1823} 1824 1825 1826define amdgpu_kernel void @fcmp_v2f16_eq( 1827; SI-LABEL: fcmp_v2f16_eq: 1828; SI: ; %bb.0: ; %entry 1829; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1830; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1831; SI-NEXT: s_mov_b32 s11, 0xf000 1832; SI-NEXT: s_mov_b32 s10, -1 1833; SI-NEXT: s_mov_b32 s14, s10 1834; SI-NEXT: s_mov_b32 s15, s11 1835; SI-NEXT: s_waitcnt lgkmcnt(0) 1836; SI-NEXT: s_mov_b32 s12, s2 1837; SI-NEXT: s_mov_b32 s13, s3 1838; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 1839; SI-NEXT: s_mov_b32 s6, s10 1840; SI-NEXT: s_mov_b32 s7, s11 1841; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 1842; SI-NEXT: s_mov_b32 s8, s0 1843; SI-NEXT: s_mov_b32 s9, s1 1844; SI-NEXT: s_waitcnt vmcnt(1) 1845; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 1846; SI-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 1847; SI-NEXT: s_waitcnt vmcnt(0) 1848; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 1849; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1850; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 1851; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1852; SI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v3 1853; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1854; SI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v1 1855; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 1856; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 1857; SI-NEXT: s_endpgm 1858; 1859; VI-LABEL: fcmp_v2f16_eq: 1860; VI: ; %bb.0: ; %entry 1861; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1862; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 1863; VI-NEXT: s_mov_b32 s7, 0xf000 1864; VI-NEXT: s_mov_b32 s6, -1 1865; VI-NEXT: s_mov_b32 s10, s6 1866; VI-NEXT: s_mov_b32 s11, s7 1867; VI-NEXT: s_waitcnt lgkmcnt(0) 1868; VI-NEXT: s_mov_b32 s12, s2 1869; VI-NEXT: s_mov_b32 s13, s3 1870; VI-NEXT: s_mov_b32 s14, s6 1871; VI-NEXT: s_mov_b32 s15, s7 1872; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1873; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 1874; VI-NEXT: s_mov_b32 s4, s0 1875; VI-NEXT: s_mov_b32 s5, s1 1876; VI-NEXT: s_waitcnt vmcnt(1) 1877; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 1878; VI-NEXT: s_waitcnt vmcnt(0) 1879; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 1880; VI-NEXT: v_cmp_eq_f16_e32 vcc, v1, v0 1881; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1882; VI-NEXT: v_cmp_eq_f16_e32 vcc, v3, v2 1883; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 1884; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1885; VI-NEXT: s_endpgm 1886; 1887; GFX11-LABEL: fcmp_v2f16_eq: 1888; GFX11: ; %bb.0: ; %entry 1889; GFX11-NEXT: s_clause 0x1 1890; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1891; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1892; GFX11-NEXT: s_mov_b32 s10, -1 1893; GFX11-NEXT: s_mov_b32 s11, 0x31016000 1894; GFX11-NEXT: s_mov_b32 s6, s10 1895; GFX11-NEXT: s_mov_b32 s7, s11 1896; GFX11-NEXT: s_mov_b32 s14, s10 1897; GFX11-NEXT: s_mov_b32 s15, s11 1898; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) 1899; GFX11-NEXT: s_mov_b32 s12, s2 1900; GFX11-NEXT: s_mov_b32 s13, s3 1901; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 1902; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 1903; GFX11-NEXT: s_mov_b32 s8, s0 1904; GFX11-NEXT: s_mov_b32 s9, s1 1905; GFX11-NEXT: s_waitcnt vmcnt(1) 1906; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 1907; GFX11-NEXT: s_waitcnt vmcnt(0) 1908; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 1909; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v1, v0 1910; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 1911; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 1912; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2 1913; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 1914; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 1915; GFX11-NEXT: s_endpgm 1916; 1917; GFX12-LABEL: fcmp_v2f16_eq: 1918; GFX12: ; %bb.0: ; %entry 1919; GFX12-NEXT: s_clause 0x1 1920; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1921; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1922; GFX12-NEXT: s_mov_b32 s10, -1 1923; GFX12-NEXT: s_mov_b32 s11, 0x31016000 1924; GFX12-NEXT: s_mov_b32 s6, s10 1925; GFX12-NEXT: s_mov_b32 s7, s11 1926; GFX12-NEXT: s_mov_b32 s14, s10 1927; GFX12-NEXT: s_mov_b32 s15, s11 1928; GFX12-NEXT: s_wait_kmcnt 0x0 1929; GFX12-NEXT: s_mov_b32 s12, s2 1930; GFX12-NEXT: s_mov_b32 s13, s3 1931; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null 1932; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null 1933; GFX12-NEXT: s_mov_b32 s8, s0 1934; GFX12-NEXT: s_mov_b32 s9, s1 1935; GFX12-NEXT: s_wait_loadcnt 0x1 1936; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0 1937; GFX12-NEXT: s_wait_loadcnt 0x0 1938; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1 1939; GFX12-NEXT: v_cmp_eq_f16_e32 vcc_lo, v1, v0 1940; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 1941; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) 1942; GFX12-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2 1943; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 1944; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 
null 1945; GFX12-NEXT: s_endpgm 1946 ptr addrspace(1) %r, 1947 ptr addrspace(1) %a, 1948 ptr addrspace(1) %b) { 1949entry: 1950 %a.val = load <2 x half>, ptr addrspace(1) %a 1951 %b.val = load <2 x half>, ptr addrspace(1) %b 1952 %r.val = fcmp oeq <2 x half> %a.val, %b.val 1953 %r.val.sext = sext <2 x i1> %r.val to <2 x i32> 1954 store <2 x i32> %r.val.sext, ptr addrspace(1) %r 1955 ret void 1956} 1957 1958define amdgpu_kernel void @fcmp_v2f16_le( 1959; SI-LABEL: fcmp_v2f16_le: 1960; SI: ; %bb.0: ; %entry 1961; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1962; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1963; SI-NEXT: s_mov_b32 s11, 0xf000 1964; SI-NEXT: s_mov_b32 s10, -1 1965; SI-NEXT: s_mov_b32 s14, s10 1966; SI-NEXT: s_mov_b32 s15, s11 1967; SI-NEXT: s_waitcnt lgkmcnt(0) 1968; SI-NEXT: s_mov_b32 s12, s2 1969; SI-NEXT: s_mov_b32 s13, s3 1970; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 1971; SI-NEXT: s_mov_b32 s6, s10 1972; SI-NEXT: s_mov_b32 s7, s11 1973; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 1974; SI-NEXT: s_mov_b32 s8, s0 1975; SI-NEXT: s_mov_b32 s9, s1 1976; SI-NEXT: s_waitcnt vmcnt(1) 1977; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 1978; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1979; SI-NEXT: s_waitcnt vmcnt(0) 1980; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 1981; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1982; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 1983; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1984; SI-NEXT: v_cmp_le_f32_e32 vcc, v2, v3 1985; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 1986; SI-NEXT: v_cmp_le_f32_e32 vcc, v4, v1 1987; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 1988; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 1989; SI-NEXT: s_endpgm 1990; 1991; VI-LABEL: fcmp_v2f16_le: 1992; VI: ; %bb.0: ; %entry 1993; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1994; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 1995; VI-NEXT: s_mov_b32 s7, 0xf000 1996; VI-NEXT: s_mov_b32 s6, -1 1997; VI-NEXT: s_mov_b32 s10, s6 1998; VI-NEXT: s_mov_b32 s11, s7 1999; VI-NEXT: s_waitcnt 
lgkmcnt(0) 2000; VI-NEXT: s_mov_b32 s12, s2 2001; VI-NEXT: s_mov_b32 s13, s3 2002; VI-NEXT: s_mov_b32 s14, s6 2003; VI-NEXT: s_mov_b32 s15, s7 2004; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 2005; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 2006; VI-NEXT: s_mov_b32 s4, s0 2007; VI-NEXT: s_mov_b32 s5, s1 2008; VI-NEXT: s_waitcnt vmcnt(1) 2009; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2010; VI-NEXT: s_waitcnt vmcnt(0) 2011; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2012; VI-NEXT: v_cmp_le_f16_e32 vcc, v1, v0 2013; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 2014; VI-NEXT: v_cmp_le_f16_e32 vcc, v3, v2 2015; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 2016; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2017; VI-NEXT: s_endpgm 2018; 2019; GFX11-LABEL: fcmp_v2f16_le: 2020; GFX11: ; %bb.0: ; %entry 2021; GFX11-NEXT: s_clause 0x1 2022; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2023; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2024; GFX11-NEXT: s_mov_b32 s10, -1 2025; GFX11-NEXT: s_mov_b32 s11, 0x31016000 2026; GFX11-NEXT: s_mov_b32 s6, s10 2027; GFX11-NEXT: s_mov_b32 s7, s11 2028; GFX11-NEXT: s_mov_b32 s14, s10 2029; GFX11-NEXT: s_mov_b32 s15, s11 2030; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2031; GFX11-NEXT: s_mov_b32 s12, s2 2032; GFX11-NEXT: s_mov_b32 s13, s3 2033; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 2034; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 2035; GFX11-NEXT: s_mov_b32 s8, s0 2036; GFX11-NEXT: s_mov_b32 s9, s1 2037; GFX11-NEXT: s_waitcnt vmcnt(1) 2038; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2039; GFX11-NEXT: s_waitcnt vmcnt(0) 2040; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2041; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v1, v0 2042; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 2043; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 2044; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2 2045; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 2046; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 2047; GFX11-NEXT: s_endpgm 2048; 2049; GFX12-LABEL: 
fcmp_v2f16_le: 2050; GFX12: ; %bb.0: ; %entry 2051; GFX12-NEXT: s_clause 0x1 2052; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2053; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2054; GFX12-NEXT: s_mov_b32 s10, -1 2055; GFX12-NEXT: s_mov_b32 s11, 0x31016000 2056; GFX12-NEXT: s_mov_b32 s6, s10 2057; GFX12-NEXT: s_mov_b32 s7, s11 2058; GFX12-NEXT: s_mov_b32 s14, s10 2059; GFX12-NEXT: s_mov_b32 s15, s11 2060; GFX12-NEXT: s_wait_kmcnt 0x0 2061; GFX12-NEXT: s_mov_b32 s12, s2 2062; GFX12-NEXT: s_mov_b32 s13, s3 2063; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null 2064; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null 2065; GFX12-NEXT: s_mov_b32 s8, s0 2066; GFX12-NEXT: s_mov_b32 s9, s1 2067; GFX12-NEXT: s_wait_loadcnt 0x1 2068; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2069; GFX12-NEXT: s_wait_loadcnt 0x0 2070; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2071; GFX12-NEXT: v_cmp_le_f16_e32 vcc_lo, v1, v0 2072; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 2073; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) 2074; GFX12-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2 2075; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 2076; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null 2077; GFX12-NEXT: s_endpgm 2078 ptr addrspace(1) %r, 2079 ptr addrspace(1) %a, 2080 ptr addrspace(1) %b) { 2081entry: 2082 %a.val = load <2 x half>, ptr addrspace(1) %a 2083 %b.val = load <2 x half>, ptr addrspace(1) %b 2084 %r.val = fcmp ole <2 x half> %a.val, %b.val 2085 %r.val.sext = sext <2 x i1> %r.val to <2 x i32> 2086 store <2 x i32> %r.val.sext, ptr addrspace(1) %r 2087 ret void 2088} 2089 2090define amdgpu_kernel void @fcmp_v2f16_gt( 2091; SI-LABEL: fcmp_v2f16_gt: 2092; SI: ; %bb.0: ; %entry 2093; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2094; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 2095; SI-NEXT: s_mov_b32 s11, 0xf000 2096; SI-NEXT: s_mov_b32 s10, -1 2097; SI-NEXT: s_mov_b32 s14, s10 2098; SI-NEXT: s_mov_b32 s15, s11 2099; SI-NEXT: s_waitcnt lgkmcnt(0) 2100; SI-NEXT: s_mov_b32 s12, s2 
2101; SI-NEXT: s_mov_b32 s13, s3 2102; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 2103; SI-NEXT: s_mov_b32 s6, s10 2104; SI-NEXT: s_mov_b32 s7, s11 2105; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 2106; SI-NEXT: s_mov_b32 s8, s0 2107; SI-NEXT: s_mov_b32 s9, s1 2108; SI-NEXT: s_waitcnt vmcnt(1) 2109; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 2110; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2111; SI-NEXT: s_waitcnt vmcnt(0) 2112; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 2113; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2114; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 2115; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 2116; SI-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3 2117; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 2118; SI-NEXT: v_cmp_gt_f32_e32 vcc, v4, v1 2119; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 2120; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 2121; SI-NEXT: s_endpgm 2122; 2123; VI-LABEL: fcmp_v2f16_gt: 2124; VI: ; %bb.0: ; %entry 2125; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2126; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 2127; VI-NEXT: s_mov_b32 s7, 0xf000 2128; VI-NEXT: s_mov_b32 s6, -1 2129; VI-NEXT: s_mov_b32 s10, s6 2130; VI-NEXT: s_mov_b32 s11, s7 2131; VI-NEXT: s_waitcnt lgkmcnt(0) 2132; VI-NEXT: s_mov_b32 s12, s2 2133; VI-NEXT: s_mov_b32 s13, s3 2134; VI-NEXT: s_mov_b32 s14, s6 2135; VI-NEXT: s_mov_b32 s15, s7 2136; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 2137; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 2138; VI-NEXT: s_mov_b32 s4, s0 2139; VI-NEXT: s_mov_b32 s5, s1 2140; VI-NEXT: s_waitcnt vmcnt(1) 2141; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2142; VI-NEXT: s_waitcnt vmcnt(0) 2143; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2144; VI-NEXT: v_cmp_gt_f16_e32 vcc, v1, v0 2145; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 2146; VI-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2 2147; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 2148; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2149; VI-NEXT: s_endpgm 2150; 2151; GFX11-LABEL: fcmp_v2f16_gt: 2152; GFX11: ; %bb.0: ; %entry 2153; 
GFX11-NEXT: s_clause 0x1 2154; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2155; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2156; GFX11-NEXT: s_mov_b32 s10, -1 2157; GFX11-NEXT: s_mov_b32 s11, 0x31016000 2158; GFX11-NEXT: s_mov_b32 s6, s10 2159; GFX11-NEXT: s_mov_b32 s7, s11 2160; GFX11-NEXT: s_mov_b32 s14, s10 2161; GFX11-NEXT: s_mov_b32 s15, s11 2162; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2163; GFX11-NEXT: s_mov_b32 s12, s2 2164; GFX11-NEXT: s_mov_b32 s13, s3 2165; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 2166; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 2167; GFX11-NEXT: s_mov_b32 s8, s0 2168; GFX11-NEXT: s_mov_b32 s9, s1 2169; GFX11-NEXT: s_waitcnt vmcnt(1) 2170; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2171; GFX11-NEXT: s_waitcnt vmcnt(0) 2172; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2173; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v1, v0 2174; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 2175; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 2176; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2 2177; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 2178; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 2179; GFX11-NEXT: s_endpgm 2180; 2181; GFX12-LABEL: fcmp_v2f16_gt: 2182; GFX12: ; %bb.0: ; %entry 2183; GFX12-NEXT: s_clause 0x1 2184; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2185; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2186; GFX12-NEXT: s_mov_b32 s10, -1 2187; GFX12-NEXT: s_mov_b32 s11, 0x31016000 2188; GFX12-NEXT: s_mov_b32 s6, s10 2189; GFX12-NEXT: s_mov_b32 s7, s11 2190; GFX12-NEXT: s_mov_b32 s14, s10 2191; GFX12-NEXT: s_mov_b32 s15, s11 2192; GFX12-NEXT: s_wait_kmcnt 0x0 2193; GFX12-NEXT: s_mov_b32 s12, s2 2194; GFX12-NEXT: s_mov_b32 s13, s3 2195; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null 2196; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null 2197; GFX12-NEXT: s_mov_b32 s8, s0 2198; GFX12-NEXT: s_mov_b32 s9, s1 2199; GFX12-NEXT: s_wait_loadcnt 0x1 2200; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2201; GFX12-NEXT: s_wait_loadcnt 0x0 2202; 
GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2203; GFX12-NEXT: v_cmp_gt_f16_e32 vcc_lo, v1, v0 2204; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 2205; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) 2206; GFX12-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2 2207; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 2208; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null 2209; GFX12-NEXT: s_endpgm 2210 ptr addrspace(1) %r, 2211 ptr addrspace(1) %a, 2212 ptr addrspace(1) %b) { 2213entry: 2214 %a.val = load <2 x half>, ptr addrspace(1) %a 2215 %b.val = load <2 x half>, ptr addrspace(1) %b 2216 %r.val = fcmp ogt <2 x half> %a.val, %b.val 2217 %r.val.sext = sext <2 x i1> %r.val to <2 x i32> 2218 store <2 x i32> %r.val.sext, ptr addrspace(1) %r 2219 ret void 2220} 2221 2222 2223define amdgpu_kernel void @fcmp_v2f16_lg( 2224; SI-LABEL: fcmp_v2f16_lg: 2225; SI: ; %bb.0: ; %entry 2226; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2227; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 2228; SI-NEXT: s_mov_b32 s11, 0xf000 2229; SI-NEXT: s_mov_b32 s10, -1 2230; SI-NEXT: s_mov_b32 s14, s10 2231; SI-NEXT: s_mov_b32 s15, s11 2232; SI-NEXT: s_waitcnt lgkmcnt(0) 2233; SI-NEXT: s_mov_b32 s12, s2 2234; SI-NEXT: s_mov_b32 s13, s3 2235; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 2236; SI-NEXT: s_mov_b32 s6, s10 2237; SI-NEXT: s_mov_b32 s7, s11 2238; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 2239; SI-NEXT: s_mov_b32 s8, s0 2240; SI-NEXT: s_mov_b32 s9, s1 2241; SI-NEXT: s_waitcnt vmcnt(1) 2242; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 2243; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2244; SI-NEXT: s_waitcnt vmcnt(0) 2245; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 2246; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2247; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 2248; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 2249; SI-NEXT: v_cmp_lg_f32_e32 vcc, v2, v3 2250; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 2251; SI-NEXT: v_cmp_lg_f32_e32 vcc, v4, v1 2252; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 2253; SI-NEXT: buffer_store_dwordx2 v[0:1], off, 
s[8:11], 0 2254; SI-NEXT: s_endpgm 2255; 2256; VI-LABEL: fcmp_v2f16_lg: 2257; VI: ; %bb.0: ; %entry 2258; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2259; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 2260; VI-NEXT: s_mov_b32 s7, 0xf000 2261; VI-NEXT: s_mov_b32 s6, -1 2262; VI-NEXT: s_mov_b32 s10, s6 2263; VI-NEXT: s_mov_b32 s11, s7 2264; VI-NEXT: s_waitcnt lgkmcnt(0) 2265; VI-NEXT: s_mov_b32 s12, s2 2266; VI-NEXT: s_mov_b32 s13, s3 2267; VI-NEXT: s_mov_b32 s14, s6 2268; VI-NEXT: s_mov_b32 s15, s7 2269; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 2270; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 2271; VI-NEXT: s_mov_b32 s4, s0 2272; VI-NEXT: s_mov_b32 s5, s1 2273; VI-NEXT: s_waitcnt vmcnt(1) 2274; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2275; VI-NEXT: s_waitcnt vmcnt(0) 2276; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2277; VI-NEXT: v_cmp_lg_f16_e32 vcc, v1, v0 2278; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 2279; VI-NEXT: v_cmp_lg_f16_e32 vcc, v3, v2 2280; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 2281; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2282; VI-NEXT: s_endpgm 2283; 2284; GFX11-LABEL: fcmp_v2f16_lg: 2285; GFX11: ; %bb.0: ; %entry 2286; GFX11-NEXT: s_clause 0x1 2287; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2288; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2289; GFX11-NEXT: s_mov_b32 s10, -1 2290; GFX11-NEXT: s_mov_b32 s11, 0x31016000 2291; GFX11-NEXT: s_mov_b32 s6, s10 2292; GFX11-NEXT: s_mov_b32 s7, s11 2293; GFX11-NEXT: s_mov_b32 s14, s10 2294; GFX11-NEXT: s_mov_b32 s15, s11 2295; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2296; GFX11-NEXT: s_mov_b32 s12, s2 2297; GFX11-NEXT: s_mov_b32 s13, s3 2298; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 2299; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 2300; GFX11-NEXT: s_mov_b32 s8, s0 2301; GFX11-NEXT: s_mov_b32 s9, s1 2302; GFX11-NEXT: s_waitcnt vmcnt(1) 2303; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2304; GFX11-NEXT: s_waitcnt vmcnt(0) 2305; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2306; 
GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v1, v0 2307; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 2308; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 2309; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2 2310; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 2311; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 2312; GFX11-NEXT: s_endpgm 2313; 2314; GFX12-LABEL: fcmp_v2f16_lg: 2315; GFX12: ; %bb.0: ; %entry 2316; GFX12-NEXT: s_clause 0x1 2317; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2318; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2319; GFX12-NEXT: s_mov_b32 s10, -1 2320; GFX12-NEXT: s_mov_b32 s11, 0x31016000 2321; GFX12-NEXT: s_mov_b32 s6, s10 2322; GFX12-NEXT: s_mov_b32 s7, s11 2323; GFX12-NEXT: s_mov_b32 s14, s10 2324; GFX12-NEXT: s_mov_b32 s15, s11 2325; GFX12-NEXT: s_wait_kmcnt 0x0 2326; GFX12-NEXT: s_mov_b32 s12, s2 2327; GFX12-NEXT: s_mov_b32 s13, s3 2328; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null 2329; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null 2330; GFX12-NEXT: s_mov_b32 s8, s0 2331; GFX12-NEXT: s_mov_b32 s9, s1 2332; GFX12-NEXT: s_wait_loadcnt 0x1 2333; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2334; GFX12-NEXT: s_wait_loadcnt 0x0 2335; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2336; GFX12-NEXT: v_cmp_lg_f16_e32 vcc_lo, v1, v0 2337; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 2338; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) 2339; GFX12-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2 2340; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 2341; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null 2342; GFX12-NEXT: s_endpgm 2343 ptr addrspace(1) %r, 2344 ptr addrspace(1) %a, 2345 ptr addrspace(1) %b) { 2346entry: 2347 %a.val = load <2 x half>, ptr addrspace(1) %a 2348 %b.val = load <2 x half>, ptr addrspace(1) %b 2349 %r.val = fcmp one <2 x half> %a.val, %b.val 2350 %r.val.sext = sext <2 x i1> %r.val to <2 x i32> 2351 store <2 x i32> %r.val.sext, ptr addrspace(1) %r 2352 ret void 2353} 2354 2355 2356define amdgpu_kernel void @fcmp_v2f16_ge( 
2357; SI-LABEL: fcmp_v2f16_ge: 2358; SI: ; %bb.0: ; %entry 2359; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2360; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 2361; SI-NEXT: s_mov_b32 s11, 0xf000 2362; SI-NEXT: s_mov_b32 s10, -1 2363; SI-NEXT: s_mov_b32 s14, s10 2364; SI-NEXT: s_mov_b32 s15, s11 2365; SI-NEXT: s_waitcnt lgkmcnt(0) 2366; SI-NEXT: s_mov_b32 s12, s2 2367; SI-NEXT: s_mov_b32 s13, s3 2368; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 2369; SI-NEXT: s_mov_b32 s6, s10 2370; SI-NEXT: s_mov_b32 s7, s11 2371; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 2372; SI-NEXT: s_mov_b32 s8, s0 2373; SI-NEXT: s_mov_b32 s9, s1 2374; SI-NEXT: s_waitcnt vmcnt(1) 2375; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 2376; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2377; SI-NEXT: s_waitcnt vmcnt(0) 2378; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 2379; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2380; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 2381; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 2382; SI-NEXT: v_cmp_ge_f32_e32 vcc, v2, v3 2383; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 2384; SI-NEXT: v_cmp_ge_f32_e32 vcc, v4, v1 2385; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 2386; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 2387; SI-NEXT: s_endpgm 2388; 2389; VI-LABEL: fcmp_v2f16_ge: 2390; VI: ; %bb.0: ; %entry 2391; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2392; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 2393; VI-NEXT: s_mov_b32 s7, 0xf000 2394; VI-NEXT: s_mov_b32 s6, -1 2395; VI-NEXT: s_mov_b32 s10, s6 2396; VI-NEXT: s_mov_b32 s11, s7 2397; VI-NEXT: s_waitcnt lgkmcnt(0) 2398; VI-NEXT: s_mov_b32 s12, s2 2399; VI-NEXT: s_mov_b32 s13, s3 2400; VI-NEXT: s_mov_b32 s14, s6 2401; VI-NEXT: s_mov_b32 s15, s7 2402; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 2403; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 2404; VI-NEXT: s_mov_b32 s4, s0 2405; VI-NEXT: s_mov_b32 s5, s1 2406; VI-NEXT: s_waitcnt vmcnt(1) 2407; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2408; VI-NEXT: s_waitcnt vmcnt(0) 2409; VI-NEXT: 
v_lshrrev_b32_e32 v3, 16, v1 2410; VI-NEXT: v_cmp_ge_f16_e32 vcc, v1, v0 2411; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 2412; VI-NEXT: v_cmp_ge_f16_e32 vcc, v3, v2 2413; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 2414; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2415; VI-NEXT: s_endpgm 2416; 2417; GFX11-LABEL: fcmp_v2f16_ge: 2418; GFX11: ; %bb.0: ; %entry 2419; GFX11-NEXT: s_clause 0x1 2420; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2421; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2422; GFX11-NEXT: s_mov_b32 s10, -1 2423; GFX11-NEXT: s_mov_b32 s11, 0x31016000 2424; GFX11-NEXT: s_mov_b32 s6, s10 2425; GFX11-NEXT: s_mov_b32 s7, s11 2426; GFX11-NEXT: s_mov_b32 s14, s10 2427; GFX11-NEXT: s_mov_b32 s15, s11 2428; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2429; GFX11-NEXT: s_mov_b32 s12, s2 2430; GFX11-NEXT: s_mov_b32 s13, s3 2431; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 2432; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 2433; GFX11-NEXT: s_mov_b32 s8, s0 2434; GFX11-NEXT: s_mov_b32 s9, s1 2435; GFX11-NEXT: s_waitcnt vmcnt(1) 2436; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2437; GFX11-NEXT: s_waitcnt vmcnt(0) 2438; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2439; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v1, v0 2440; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 2441; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 2442; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2 2443; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 2444; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 2445; GFX11-NEXT: s_endpgm 2446; 2447; GFX12-LABEL: fcmp_v2f16_ge: 2448; GFX12: ; %bb.0: ; %entry 2449; GFX12-NEXT: s_clause 0x1 2450; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2451; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2452; GFX12-NEXT: s_mov_b32 s10, -1 2453; GFX12-NEXT: s_mov_b32 s11, 0x31016000 2454; GFX12-NEXT: s_mov_b32 s6, s10 2455; GFX12-NEXT: s_mov_b32 s7, s11 2456; GFX12-NEXT: s_mov_b32 s14, s10 2457; GFX12-NEXT: s_mov_b32 s15, s11 2458; GFX12-NEXT: s_wait_kmcnt 0x0 2459; 
GFX12-NEXT: s_mov_b32 s12, s2 2460; GFX12-NEXT: s_mov_b32 s13, s3 2461; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null 2462; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null 2463; GFX12-NEXT: s_mov_b32 s8, s0 2464; GFX12-NEXT: s_mov_b32 s9, s1 2465; GFX12-NEXT: s_wait_loadcnt 0x1 2466; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2467; GFX12-NEXT: s_wait_loadcnt 0x0 2468; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2469; GFX12-NEXT: v_cmp_ge_f16_e32 vcc_lo, v1, v0 2470; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 2471; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) 2472; GFX12-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2 2473; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 2474; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null 2475; GFX12-NEXT: s_endpgm 2476 ptr addrspace(1) %r, 2477 ptr addrspace(1) %a, 2478 ptr addrspace(1) %b) { 2479entry: 2480 %a.val = load <2 x half>, ptr addrspace(1) %a 2481 %b.val = load <2 x half>, ptr addrspace(1) %b 2482 %r.val = fcmp oge <2 x half> %a.val, %b.val 2483 %r.val.sext = sext <2 x i1> %r.val to <2 x i32> 2484 store <2 x i32> %r.val.sext, ptr addrspace(1) %r 2485 ret void 2486} 2487 2488 2489define amdgpu_kernel void @fcmp_v2f16_o( 2490; SI-LABEL: fcmp_v2f16_o: 2491; SI: ; %bb.0: ; %entry 2492; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2493; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 2494; SI-NEXT: s_mov_b32 s11, 0xf000 2495; SI-NEXT: s_mov_b32 s10, -1 2496; SI-NEXT: s_mov_b32 s14, s10 2497; SI-NEXT: s_mov_b32 s15, s11 2498; SI-NEXT: s_waitcnt lgkmcnt(0) 2499; SI-NEXT: s_mov_b32 s12, s2 2500; SI-NEXT: s_mov_b32 s13, s3 2501; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 2502; SI-NEXT: s_mov_b32 s6, s10 2503; SI-NEXT: s_mov_b32 s7, s11 2504; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 2505; SI-NEXT: s_mov_b32 s8, s0 2506; SI-NEXT: s_mov_b32 s9, s1 2507; SI-NEXT: s_waitcnt vmcnt(1) 2508; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 2509; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2510; SI-NEXT: s_waitcnt vmcnt(0) 2511; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v1 2512; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2513; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 2514; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 2515; SI-NEXT: v_cmp_o_f32_e32 vcc, v2, v3 2516; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 2517; SI-NEXT: v_cmp_o_f32_e32 vcc, v4, v1 2518; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 2519; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 2520; SI-NEXT: s_endpgm 2521; 2522; VI-LABEL: fcmp_v2f16_o: 2523; VI: ; %bb.0: ; %entry 2524; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2525; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 2526; VI-NEXT: s_mov_b32 s7, 0xf000 2527; VI-NEXT: s_mov_b32 s6, -1 2528; VI-NEXT: s_mov_b32 s10, s6 2529; VI-NEXT: s_mov_b32 s11, s7 2530; VI-NEXT: s_waitcnt lgkmcnt(0) 2531; VI-NEXT: s_mov_b32 s12, s2 2532; VI-NEXT: s_mov_b32 s13, s3 2533; VI-NEXT: s_mov_b32 s14, s6 2534; VI-NEXT: s_mov_b32 s15, s7 2535; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 2536; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 2537; VI-NEXT: s_mov_b32 s4, s0 2538; VI-NEXT: s_mov_b32 s5, s1 2539; VI-NEXT: s_waitcnt vmcnt(1) 2540; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2541; VI-NEXT: s_waitcnt vmcnt(0) 2542; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2543; VI-NEXT: v_cmp_o_f16_e32 vcc, v1, v0 2544; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 2545; VI-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 2546; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 2547; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2548; VI-NEXT: s_endpgm 2549; 2550; GFX11-LABEL: fcmp_v2f16_o: 2551; GFX11: ; %bb.0: ; %entry 2552; GFX11-NEXT: s_clause 0x1 2553; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2554; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2555; GFX11-NEXT: s_mov_b32 s10, -1 2556; GFX11-NEXT: s_mov_b32 s11, 0x31016000 2557; GFX11-NEXT: s_mov_b32 s6, s10 2558; GFX11-NEXT: s_mov_b32 s7, s11 2559; GFX11-NEXT: s_mov_b32 s14, s10 2560; GFX11-NEXT: s_mov_b32 s15, s11 2561; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2562; GFX11-NEXT: s_mov_b32 s12, s2 2563; GFX11-NEXT: 
s_mov_b32 s13, s3 2564; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 2565; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 2566; GFX11-NEXT: s_mov_b32 s8, s0 2567; GFX11-NEXT: s_mov_b32 s9, s1 2568; GFX11-NEXT: s_waitcnt vmcnt(1) 2569; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2570; GFX11-NEXT: s_waitcnt vmcnt(0) 2571; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2572; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v0 2573; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 2574; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 2575; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2 2576; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 2577; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 2578; GFX11-NEXT: s_endpgm 2579; 2580; GFX12-LABEL: fcmp_v2f16_o: 2581; GFX12: ; %bb.0: ; %entry 2582; GFX12-NEXT: s_clause 0x1 2583; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2584; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2585; GFX12-NEXT: s_mov_b32 s10, -1 2586; GFX12-NEXT: s_mov_b32 s11, 0x31016000 2587; GFX12-NEXT: s_mov_b32 s6, s10 2588; GFX12-NEXT: s_mov_b32 s7, s11 2589; GFX12-NEXT: s_mov_b32 s14, s10 2590; GFX12-NEXT: s_mov_b32 s15, s11 2591; GFX12-NEXT: s_wait_kmcnt 0x0 2592; GFX12-NEXT: s_mov_b32 s12, s2 2593; GFX12-NEXT: s_mov_b32 s13, s3 2594; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null 2595; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null 2596; GFX12-NEXT: s_mov_b32 s8, s0 2597; GFX12-NEXT: s_mov_b32 s9, s1 2598; GFX12-NEXT: s_wait_loadcnt 0x1 2599; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2600; GFX12-NEXT: s_wait_loadcnt 0x0 2601; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2602; GFX12-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v0 2603; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 2604; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) 2605; GFX12-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2 2606; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 2607; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null 2608; GFX12-NEXT: s_endpgm 2609 ptr addrspace(1) %r, 2610 ptr addrspace(1) %a, 2611 ptr 
addrspace(1) %b) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %b.val = load <2 x half>, ptr addrspace(1) %b
  %r.val = fcmp ord <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
  ret void
}


; <2 x half> `fcmp uno` (true when either operand of a lane is NaN). The
; <2 x i1> result is sign-extended to <2 x i32> and stored to %r. The checks
; expect one scalar compare per lane: v_cmp_u_f32 after an f16 -> f32
; conversion on the first run line, v_cmp_u_f16 on the other three, with the
; high lane extracted by a 16-bit right shift.
define amdgpu_kernel void @fcmp_v2f16_u(
; SI-LABEL: fcmp_v2f16_u:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cmp_u_f32_e32 vcc, v2, v3
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: v_cmp_u_f32_e32 vcc, v4, v1
; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_v2f16_u:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_mov_b32 s15, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; VI-NEXT: v_cmp_u_f16_e32 vcc, v1, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: v_cmp_u_f16_e32 vcc, v3, v2
; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_v2f16_u:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, s10
; GFX11-NEXT: s_mov_b32 s7, s11
; GFX11-NEXT: s_mov_b32 s14, s10
; GFX11-NEXT: s_mov_b32 s15, s11
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s2
; GFX11-NEXT: s_mov_b32 s13, s3
; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s1
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v1, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_v2f16_u:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_mov_b32 s10, -1
; GFX12-NEXT: s_mov_b32 s11, 0x31016000
; GFX12-NEXT: s_mov_b32 s6, s10
; GFX12-NEXT: s_mov_b32 s7, s11
; GFX12-NEXT: s_mov_b32 s14, s10
; GFX12-NEXT: s_mov_b32 s15, s11
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s12, s2
; GFX12-NEXT: s_mov_b32 s13, s3
; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-NEXT: s_mov_b32 s8, s0
; GFX12-NEXT: s_mov_b32 s9, s1
; GFX12-NEXT: s_wait_loadcnt 0x1
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX12-NEXT: v_cmp_u_f16_e32 vcc_lo, v1, v0
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
; GFX12-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %b.val = load <2 x half>, ptr addrspace(1) %b
  %r.val = fcmp uno <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
  ret void
}

; <2 x half> `fcmp ult` (unordered or less-than). The checks expect it to be
; selected as the "not greater-or-equal" compare, v_cmp_nge, once per lane:
; on f32 after conversion for the first run line, directly on f16 for the rest.
define amdgpu_kernel void @fcmp_v2f16_nge(
; SI-LABEL: fcmp_v2f16_nge:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cmp_nge_f32_e32 vcc, v2, v3
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: v_cmp_nge_f32_e32 vcc, v4, v1
; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_v2f16_nge:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_mov_b32 s15, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; VI-NEXT: v_cmp_nge_f16_e32 vcc, v1, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: v_cmp_nge_f16_e32 vcc, v3, v2
; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_v2f16_nge:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, s10
; GFX11-NEXT: s_mov_b32 s7, s11
; GFX11-NEXT: s_mov_b32 s14, s10
; GFX11-NEXT: s_mov_b32 s15, s11
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s2
; GFX11-NEXT: s_mov_b32 s13, s3
; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s1
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v1, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_v2f16_nge:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_mov_b32 s10, -1
; GFX12-NEXT: s_mov_b32 s11, 0x31016000
; GFX12-NEXT: s_mov_b32 s6, s10
; GFX12-NEXT: s_mov_b32 s7, s11
; GFX12-NEXT: s_mov_b32 s14, s10
; GFX12-NEXT: s_mov_b32 s15, s11
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s12, s2
; GFX12-NEXT: s_mov_b32 s13, s3
; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-NEXT: s_mov_b32 s8, s0
; GFX12-NEXT: s_mov_b32 s9, s1
; GFX12-NEXT: s_wait_loadcnt 0x1
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX12-NEXT: v_cmp_nge_f16_e32 vcc_lo, v1, v0
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
; GFX12-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %b.val = load <2 x half>, ptr addrspace(1) %b
  %r.val = fcmp ult <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
  ret void
}

; <2 x half> `fcmp ueq` (unordered or equal). The checks expect it to be
; selected as the "not less-or-greater" compare, v_cmp_nlg, once per lane:
; on f32 after conversion for the first run line, directly on f16 for the rest.
define amdgpu_kernel void @fcmp_v2f16_nlg(
; SI-LABEL: fcmp_v2f16_nlg:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v1
; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_v2f16_nlg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_mov_b32 s15, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v1, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: v_cmp_nlg_f16_e32 vcc, v3, v2
; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_v2f16_nlg:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, s10
; GFX11-NEXT: s_mov_b32 s7, s11
; GFX11-NEXT: s_mov_b32 s14, s10
; GFX11-NEXT: s_mov_b32 s15, s11
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s2
; GFX11-NEXT: s_mov_b32 s13, s3
; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s1
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v1, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_v2f16_nlg:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_mov_b32 s10, -1
; GFX12-NEXT: s_mov_b32 s11, 0x31016000
; GFX12-NEXT: s_mov_b32 s6, s10
; GFX12-NEXT: s_mov_b32 s7, s11
; GFX12-NEXT: s_mov_b32 s14, s10
; GFX12-NEXT: s_mov_b32 s15, s11
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s12, s2
; GFX12-NEXT: s_mov_b32 s13, s3
; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-NEXT: s_mov_b32 s8, s0
; GFX12-NEXT: s_mov_b32 s9, s1
; GFX12-NEXT: s_wait_loadcnt 0x1
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX12-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v1, v0
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
; GFX12-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %b.val = load <2 x half>, ptr addrspace(1) %b
  %r.val = fcmp ueq <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
  ret void
}


; <2 x half> `fcmp ule` (unordered or less-or-equal). The checks expect it to
; be selected as the "not greater-than" compare, v_cmp_ngt, once per lane:
; on f32 after conversion for the first run line, directly on f16 for the rest.
define amdgpu_kernel void @fcmp_v2f16_ngt(
; SI-LABEL: fcmp_v2f16_ngt:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v3
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v1
; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_v2f16_ngt:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_mov_b32 s15, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2
; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_v2f16_ngt:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, s10
; GFX11-NEXT: s_mov_b32 s7, s11
; GFX11-NEXT: s_mov_b32 s14, s10
; GFX11-NEXT: s_mov_b32 s15, s11
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s2
; GFX11-NEXT: s_mov_b32 s13, s3
; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s1
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_v2f16_ngt:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_mov_b32 s10, -1
; GFX12-NEXT: s_mov_b32 s11, 0x31016000
; GFX12-NEXT: s_mov_b32 s6, s10
; GFX12-NEXT: s_mov_b32 s7, s11
; GFX12-NEXT: s_mov_b32 s14, s10
; GFX12-NEXT: s_mov_b32 s15, s11
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s12, s2
; GFX12-NEXT: s_mov_b32 s13, s3
; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-NEXT: s_mov_b32 s8, s0
; GFX12-NEXT: s_mov_b32 s9, s1
; GFX12-NEXT: s_wait_loadcnt 0x1
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v0
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
; GFX12-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %b.val = load <2 x half>, ptr addrspace(1) %b
  %r.val = fcmp ule <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
  ret void
}

; <2 x half> `fcmp ugt` (unordered or greater-than). The checks expect it to
; be selected as the "not less-or-equal" compare, v_cmp_nle, once per lane:
; on f32 after conversion for the first run line, directly on f16 for the rest.
define amdgpu_kernel void @fcmp_v2f16_nle(
; SI-LABEL: fcmp_v2f16_nle:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cmp_nle_f32_e32 vcc, v2, v3
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: v_cmp_nle_f32_e32 vcc, v4, v1
; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_v2f16_nle:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_mov_b32 s15, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; VI-NEXT: v_cmp_nle_f16_e32 vcc, v1, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2
; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_v2f16_nle:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, s10
; GFX11-NEXT: s_mov_b32 s7, s11
; GFX11-NEXT: s_mov_b32 s14, s10
; GFX11-NEXT: s_mov_b32 s15, s11
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s2
; GFX11-NEXT: s_mov_b32 s13, s3
; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s1
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_v2f16_nle:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_mov_b32 s10, -1
; GFX12-NEXT: s_mov_b32 s11, 0x31016000
; GFX12-NEXT: s_mov_b32 s6, s10
; GFX12-NEXT: s_mov_b32 s7, s11
; GFX12-NEXT: s_mov_b32 s14, s10
; GFX12-NEXT: s_mov_b32 s15, s11
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s12, s2
; GFX12-NEXT: s_mov_b32 s13, s3
; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-NEXT: s_mov_b32 s8, s0
; GFX12-NEXT: s_mov_b32 s9, s1
; GFX12-NEXT: s_wait_loadcnt 0x1
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX12-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v0
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
; GFX12-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %b.val = load <2 x half>, ptr addrspace(1) %b
  %r.val = fcmp ugt <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
  ret void
}

; <2 x half> `fcmp une` (unordered or not-equal). The checks expect it to be
; selected as the "not equal" compare, v_cmp_neq, once per lane: on f32 after
; conversion for the first run line, directly on f16 for the rest.
define amdgpu_kernel void @fcmp_v2f16_neq(
; SI-LABEL: fcmp_v2f16_neq:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cmp_neq_f32_e32 vcc, v2, v3
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: v_cmp_neq_f32_e32 vcc, v4, v1
; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_v2f16_neq:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_mov_b32 s15, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; VI-NEXT: v_cmp_neq_f16_e32 vcc, v1, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: v_cmp_neq_f16_e32 vcc, v3, v2
; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_v2f16_neq:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, s10
; GFX11-NEXT: s_mov_b32 s7, s11
; GFX11-NEXT: s_mov_b32 s14, s10
; GFX11-NEXT: s_mov_b32 s15, s11
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s2
; GFX11-NEXT: s_mov_b32 s13, s3
; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s1
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v1, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_v2f16_neq:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_mov_b32 s10, -1
; GFX12-NEXT: s_mov_b32 s11, 0x31016000
; GFX12-NEXT: s_mov_b32 s6, s10
; GFX12-NEXT: s_mov_b32 s7, s11
; GFX12-NEXT: s_mov_b32 s14, s10
; GFX12-NEXT: s_mov_b32 s15, s11
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s12, s2
; GFX12-NEXT: s_mov_b32 s13, s3
; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-NEXT: s_mov_b32 s8, s0
; GFX12-NEXT: s_mov_b32 s9, s1
; GFX12-NEXT: s_wait_loadcnt 0x1
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX12-NEXT: v_cmp_neq_f16_e32 vcc_lo, v1, v0
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
; GFX12-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %b.val = load <2 x half>, ptr addrspace(1) %b
  %r.val = fcmp une <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
  ret void
}

; <2 x half> `fcmp uge` (unordered or greater-or-equal). The checks expect it
; to be selected as the "not less-than" compare, v_cmp_nlt, once per lane:
; on f32 after conversion for the first run line, directly on f16 for the rest.
define amdgpu_kernel void @fcmp_v2f16_nlt(
; SI-LABEL: fcmp_v2f16_nlt:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v2, v3
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1
; SI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_v2f16_nlt:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s14, s6
; VI-NEXT: s_mov_b32 s15, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v1, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v3, v2
; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_v2f16_nlt:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_mov_b32 s11, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, s10
; GFX11-NEXT: s_mov_b32 s7, s11
; GFX11-NEXT: s_mov_b32 s14, s10
; GFX11-NEXT: s_mov_b32 s15, s11
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s2
; GFX11-NEXT: s_mov_b32 s13, s3
; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-NEXT: s_mov_b32 s8, s0
; GFX11-NEXT: s_mov_b32 s9, s1
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_v2f16_nlt:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: s_mov_b32 s10, -1
; GFX12-NEXT: s_mov_b32 s11, 0x31016000
; GFX12-NEXT: s_mov_b32 s6, s10
; GFX12-NEXT: s_mov_b32 s7, s11
; GFX12-NEXT: s_mov_b32 s14, s10
; GFX12-NEXT: s_mov_b32 s15, s11
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s12, s2
; GFX12-NEXT: s_mov_b32 s13, s3
; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
; GFX12-NEXT: buffer_load_b32 v1, off, s[12:15], null
; GFX12-NEXT: s_mov_b32 s8, s0
; GFX12-NEXT: s_mov_b32 s9, s1
; GFX12-NEXT: s_wait_loadcnt 0x1
; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2
; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
; GFX12-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) {
entry:
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %b.val = load <2 x half>, ptr addrspace(1) %b
  %r.val = fcmp uge <2 x half> %a.val, %b.val
  %r.val.sext = sext <2 x i1> %r.val to <2 x i32>
  store <2 x i32> %r.val.sext, ptr addrspace(1) %r
  ret void
}

; Intrinsic declaration and attribute groups for the file. None of the <2 x half>
; kernels above call @llvm.fabs.f16 or carry attribute group #0; presumably both
; are used by tests earlier in the file -- confirm before removing either.
declare half @llvm.fabs.f16(half) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }