1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s 3; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s 4; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s 5; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s 6; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s 7; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s 8; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1150 %s 9 10define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, 11; SI-LABEL: frem_f16: 12; SI: ; %bb.0: 13; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 14; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 15; SI-NEXT: s_mov_b32 s11, 0xf000 16; SI-NEXT: s_mov_b32 s10, -1 17; SI-NEXT: s_waitcnt lgkmcnt(0) 18; SI-NEXT: s_mov_b32 s8, s0 19; SI-NEXT: s_mov_b32 s9, s1 20; SI-NEXT: s_mov_b32 s0, s2 21; SI-NEXT: s_mov_b32 s1, s3 22; SI-NEXT: s_mov_b32 s2, s10 23; SI-NEXT: s_mov_b32 s3, s11 24; SI-NEXT: s_mov_b32 s6, s10 25; SI-NEXT: s_mov_b32 s7, s11 26; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 27; SI-NEXT: s_waitcnt vmcnt(0) 28; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 29; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 30; SI-NEXT: s_waitcnt vmcnt(0) 31; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v1 32; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 33; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 34; SI-NEXT: v_rcp_f32_e32 v4, v3 35; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 36; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 37; SI-NEXT: v_fma_f32 v4, v5, v4, v4 38; SI-NEXT: v_mul_f32_e32 v5, v2, v4 39; SI-NEXT: v_fma_f32 v6, -v3, v5, v2 40; SI-NEXT: v_fma_f32 v5, v6, v4, v5 41; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 42; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 43; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 44; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 45; SI-NEXT: v_trunc_f32_e32 v2, v2 46; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 47; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 48; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 49; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 50; SI-NEXT: s_endpgm 51; 52; CI-LABEL: frem_f16: 53; CI: ; %bb.0: 54; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 55; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 56; CI-NEXT: s_mov_b32 s11, 0xf000 57; CI-NEXT: s_mov_b32 s10, -1 58; CI-NEXT: s_mov_b32 s6, s10 59; CI-NEXT: s_waitcnt lgkmcnt(0) 60; CI-NEXT: s_mov_b32 s8, s0 61; CI-NEXT: s_mov_b32 s9, s1 62; CI-NEXT: s_mov_b32 s0, s2 63; CI-NEXT: s_mov_b32 s1, s3 64; CI-NEXT: s_mov_b32 s2, s10 65; CI-NEXT: s_mov_b32 s3, s11 66; CI-NEXT: s_mov_b32 s7, s11 67; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 68; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 69; CI-NEXT: s_waitcnt vmcnt(1) 70; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 71; CI-NEXT: s_waitcnt vmcnt(0) 72; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 73; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 74; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 75; CI-NEXT: v_rcp_f32_e32 v4, v3 76; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 77; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 78; CI-NEXT: v_fma_f32 v4, v5, v4, v4 79; CI-NEXT: v_mul_f32_e32 v5, v2, v4 80; CI-NEXT: v_fma_f32 v6, -v3, v5, v2 81; CI-NEXT: v_fma_f32 v5, v6, v4, v5 82; CI-NEXT: v_fma_f32 v2, 
-v3, v5, v2 83; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 84; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 85; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 86; CI-NEXT: v_trunc_f32_e32 v2, v2 87; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 88; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 89; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 90; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 91; CI-NEXT: s_endpgm 92; 93; VI-LABEL: frem_f16: 94; VI: ; %bb.0: 95; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 96; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 97; VI-NEXT: s_waitcnt lgkmcnt(0) 98; VI-NEXT: v_mov_b32_e32 v0, s0 99; VI-NEXT: s_add_u32 s0, s4, 8 100; VI-NEXT: v_mov_b32_e32 v1, s1 101; VI-NEXT: v_mov_b32_e32 v2, s2 102; VI-NEXT: v_mov_b32_e32 v3, s3 103; VI-NEXT: s_addc_u32 s1, s5, 0 104; VI-NEXT: flat_load_ushort v4, v[2:3] 105; VI-NEXT: v_mov_b32_e32 v3, s1 106; VI-NEXT: v_mov_b32_e32 v2, s0 107; VI-NEXT: flat_load_ushort v2, v[2:3] 108; VI-NEXT: s_waitcnt vmcnt(1) 109; VI-NEXT: v_cvt_f32_f16_e32 v3, v4 110; VI-NEXT: s_waitcnt vmcnt(0) 111; VI-NEXT: v_cvt_f32_f16_e32 v5, v2 112; VI-NEXT: v_rcp_f32_e32 v6, v5 113; VI-NEXT: v_mul_f32_e32 v7, v3, v6 114; VI-NEXT: v_mad_f32 v8, -v5, v7, v3 115; VI-NEXT: v_mac_f32_e32 v7, v8, v6 116; VI-NEXT: v_mad_f32 v3, -v5, v7, v3 117; VI-NEXT: v_mul_f32_e32 v3, v3, v6 118; VI-NEXT: v_and_b32_e32 v3, 0xff800000, v3 119; VI-NEXT: v_add_f32_e32 v3, v3, v7 120; VI-NEXT: v_cvt_f16_f32_e32 v3, v3 121; VI-NEXT: v_div_fixup_f16 v3, v3, v2, v4 122; VI-NEXT: v_trunc_f16_e32 v3, v3 123; VI-NEXT: v_fma_f16 v2, -v3, v2, v4 124; VI-NEXT: flat_store_short v[0:1], v2 125; VI-NEXT: s_endpgm 126; 127; GFX9-LABEL: frem_f16: 128; GFX9: ; %bb.0: 129; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 130; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 131; GFX9-NEXT: v_mov_b32_e32 v0, 0 132; GFX9-NEXT: s_waitcnt lgkmcnt(0) 133; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] 134; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 135; GFX9-NEXT: s_waitcnt 
vmcnt(1) 136; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 137; GFX9-NEXT: s_waitcnt vmcnt(0) 138; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 139; GFX9-NEXT: v_rcp_f32_e32 v4, v4 140; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 141; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] 142; GFX9-NEXT: v_mac_f32_e32 v3, v5, v4 143; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] 144; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 145; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v4 146; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 147; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 148; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 149; GFX9-NEXT: v_trunc_f16_e32 v3, v3 150; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 151; GFX9-NEXT: global_store_short v0, v1, s[0:1] 152; GFX9-NEXT: s_endpgm 153; 154; GFX10-LABEL: frem_f16: 155; GFX10: ; %bb.0: 156; GFX10-NEXT: s_clause 0x1 157; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 158; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 159; GFX10-NEXT: v_mov_b32_e32 v0, 0 160; GFX10-NEXT: s_waitcnt lgkmcnt(0) 161; GFX10-NEXT: s_clause 0x1 162; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] 163; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 164; GFX10-NEXT: s_waitcnt vmcnt(1) 165; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 166; GFX10-NEXT: s_waitcnt vmcnt(0) 167; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 168; GFX10-NEXT: v_rcp_f32_e32 v5, v4 169; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5 170; GFX10-NEXT: v_mad_f32 v7, -v4, v6, v3 171; GFX10-NEXT: v_mac_f32_e32 v6, v7, v5 172; GFX10-NEXT: v_mad_f32 v3, -v4, v6, v3 173; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 174; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3 175; GFX10-NEXT: v_add_f32_e32 v3, v3, v6 176; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 177; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 178; GFX10-NEXT: v_trunc_f16_e32 v3, v3 179; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 180; GFX10-NEXT: global_store_short v0, v1, s[0:1] 181; GFX10-NEXT: s_endpgm 182; 183; GFX11-LABEL: frem_f16: 184; GFX11: ; %bb.0: 185; GFX11-NEXT: s_clause 0x1 186; 
GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 187; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 188; GFX11-NEXT: v_mov_b32_e32 v0, 0 189; GFX11-NEXT: s_waitcnt lgkmcnt(0) 190; GFX11-NEXT: s_clause 0x1 191; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] 192; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 193; GFX11-NEXT: s_waitcnt vmcnt(1) 194; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 195; GFX11-NEXT: s_waitcnt vmcnt(0) 196; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 197; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 198; GFX11-NEXT: v_rcp_f32_e32 v4, v4 199; GFX11-NEXT: s_waitcnt_depctr 0xfff 200; GFX11-NEXT: v_mul_f32_e32 v3, v3, v4 201; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] 202; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 203; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v4 204; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] 205; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 206; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4 207; GFX11-NEXT: v_and_b32_e32 v4, 0xff800000, v4 208; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 209; GFX11-NEXT: v_add_f32_e32 v3, v4, v3 210; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 211; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 212; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1 213; GFX11-NEXT: v_trunc_f16_e32 v3, v3 214; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 215; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1 216; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 217; GFX11-NEXT: s_endpgm 218; 219; GFX1150-LABEL: frem_f16: 220; GFX1150: ; %bb.0: 221; GFX1150-NEXT: s_clause 0x1 222; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 223; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 224; GFX1150-NEXT: v_mov_b32_e32 v0, 0 225; GFX1150-NEXT: s_waitcnt lgkmcnt(0) 226; GFX1150-NEXT: s_clause 0x1 227; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3] 228; GFX1150-NEXT: 
global_load_u16 v2, v0, s[4:5] offset:8 229; GFX1150-NEXT: s_waitcnt vmcnt(1) 230; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v1 231; GFX1150-NEXT: s_waitcnt vmcnt(0) 232; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v2 233; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) 234; GFX1150-NEXT: v_rcp_f32_e32 v4, v4 235; GFX1150-NEXT: v_mul_f32_e32 v3, v3, v4 236; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 237; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] 238; GFX1150-NEXT: v_fmac_f32_e32 v3, v5, v4 239; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 240; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] 241; GFX1150-NEXT: v_mul_f32_e32 v4, v5, v4 242; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 243; GFX1150-NEXT: v_and_b32_e32 v4, 0xff800000, v4 244; GFX1150-NEXT: v_add_f32_e32 v3, v4, v3 245; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 246; GFX1150-NEXT: v_cvt_f16_f32_e32 v3, v3 247; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1 248; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 249; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 250; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 251; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) 252; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 253; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1] 254; GFX1150-NEXT: s_endpgm 255 ptr addrspace(1) %in2) #0 { 256 %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 257 %r0 = load half, ptr addrspace(1) %in1, align 4 258 %r1 = load half, ptr addrspace(1) %gep2, align 4 259 %r2 = frem half %r0, %r1 260 store half %r2, ptr addrspace(1) %out, align 4 261 ret void 262} 263 264define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, 265; SI-LABEL: fast_frem_f16: 266; SI: ; %bb.0: 267; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 
268; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 269; SI-NEXT: s_mov_b32 s11, 0xf000 270; SI-NEXT: s_mov_b32 s10, -1 271; SI-NEXT: s_waitcnt lgkmcnt(0) 272; SI-NEXT: s_mov_b32 s8, s0 273; SI-NEXT: s_mov_b32 s9, s1 274; SI-NEXT: s_mov_b32 s0, s2 275; SI-NEXT: s_mov_b32 s1, s3 276; SI-NEXT: s_mov_b32 s2, s10 277; SI-NEXT: s_mov_b32 s3, s11 278; SI-NEXT: s_mov_b32 s6, s10 279; SI-NEXT: s_mov_b32 s7, s11 280; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 281; SI-NEXT: s_waitcnt vmcnt(0) 282; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 283; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 284; SI-NEXT: s_waitcnt vmcnt(0) 285; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 286; SI-NEXT: v_rcp_f32_e32 v2, v1 287; SI-NEXT: v_mul_f32_e32 v2, v0, v2 288; SI-NEXT: v_trunc_f32_e32 v2, v2 289; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 290; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 291; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 292; SI-NEXT: s_endpgm 293; 294; CI-LABEL: fast_frem_f16: 295; CI: ; %bb.0: 296; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 297; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 298; CI-NEXT: s_mov_b32 s11, 0xf000 299; CI-NEXT: s_mov_b32 s10, -1 300; CI-NEXT: s_mov_b32 s6, s10 301; CI-NEXT: s_mov_b32 s7, s11 302; CI-NEXT: s_waitcnt lgkmcnt(0) 303; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 304; CI-NEXT: s_mov_b32 s8, s0 305; CI-NEXT: s_mov_b32 s9, s1 306; CI-NEXT: s_mov_b32 s0, s2 307; CI-NEXT: s_mov_b32 s1, s3 308; CI-NEXT: s_mov_b32 s2, s10 309; CI-NEXT: s_mov_b32 s3, s11 310; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 311; CI-NEXT: s_waitcnt vmcnt(1) 312; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 313; CI-NEXT: v_rcp_f32_e32 v2, v1 314; CI-NEXT: s_waitcnt vmcnt(0) 315; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 316; CI-NEXT: v_mul_f32_e32 v2, v0, v2 317; CI-NEXT: v_trunc_f32_e32 v2, v2 318; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 319; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 320; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 321; CI-NEXT: s_endpgm 322; 323; VI-LABEL: fast_frem_f16: 324; VI: 
; %bb.0: 325; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 326; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 327; VI-NEXT: s_waitcnt lgkmcnt(0) 328; VI-NEXT: v_mov_b32_e32 v0, s0 329; VI-NEXT: s_add_u32 s0, s4, 8 330; VI-NEXT: v_mov_b32_e32 v1, s1 331; VI-NEXT: v_mov_b32_e32 v2, s2 332; VI-NEXT: v_mov_b32_e32 v3, s3 333; VI-NEXT: s_addc_u32 s1, s5, 0 334; VI-NEXT: flat_load_ushort v4, v[2:3] 335; VI-NEXT: v_mov_b32_e32 v3, s1 336; VI-NEXT: v_mov_b32_e32 v2, s0 337; VI-NEXT: flat_load_ushort v2, v[2:3] 338; VI-NEXT: s_waitcnt vmcnt(0) 339; VI-NEXT: v_rcp_f16_e32 v3, v2 340; VI-NEXT: v_mul_f16_e32 v3, v4, v3 341; VI-NEXT: v_trunc_f16_e32 v3, v3 342; VI-NEXT: v_fma_f16 v2, -v3, v2, v4 343; VI-NEXT: flat_store_short v[0:1], v2 344; VI-NEXT: s_endpgm 345; 346; GFX9-LABEL: fast_frem_f16: 347; GFX9: ; %bb.0: 348; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 349; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 350; GFX9-NEXT: v_mov_b32_e32 v0, 0 351; GFX9-NEXT: s_waitcnt lgkmcnt(0) 352; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] 353; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 354; GFX9-NEXT: s_waitcnt vmcnt(0) 355; GFX9-NEXT: v_rcp_f16_e32 v3, v2 356; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3 357; GFX9-NEXT: v_trunc_f16_e32 v3, v3 358; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 359; GFX9-NEXT: global_store_short v0, v1, s[0:1] 360; GFX9-NEXT: s_endpgm 361; 362; GFX10-LABEL: fast_frem_f16: 363; GFX10: ; %bb.0: 364; GFX10-NEXT: s_clause 0x1 365; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 366; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 367; GFX10-NEXT: v_mov_b32_e32 v0, 0 368; GFX10-NEXT: s_waitcnt lgkmcnt(0) 369; GFX10-NEXT: s_clause 0x1 370; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] 371; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 372; GFX10-NEXT: s_waitcnt vmcnt(0) 373; GFX10-NEXT: v_rcp_f16_e32 v3, v2 374; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 375; GFX10-NEXT: v_trunc_f16_e32 v3, v3 376; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 377; GFX10-NEXT: 
global_store_short v0, v1, s[0:1] 378; GFX10-NEXT: s_endpgm 379; 380; GFX11-LABEL: fast_frem_f16: 381; GFX11: ; %bb.0: 382; GFX11-NEXT: s_clause 0x1 383; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 384; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 385; GFX11-NEXT: v_mov_b32_e32 v0, 0 386; GFX11-NEXT: s_waitcnt lgkmcnt(0) 387; GFX11-NEXT: s_clause 0x1 388; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] 389; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 390; GFX11-NEXT: s_waitcnt vmcnt(0) 391; GFX11-NEXT: v_rcp_f16_e32 v3, v2 392; GFX11-NEXT: s_waitcnt_depctr 0xfff 393; GFX11-NEXT: v_mul_f16_e32 v3, v1, v3 394; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 395; GFX11-NEXT: v_trunc_f16_e32 v3, v3 396; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1 397; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 398; GFX11-NEXT: s_endpgm 399; 400; GFX1150-LABEL: fast_frem_f16: 401; GFX1150: ; %bb.0: 402; GFX1150-NEXT: s_clause 0x1 403; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 404; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 405; GFX1150-NEXT: v_mov_b32_e32 v0, 0 406; GFX1150-NEXT: s_waitcnt lgkmcnt(0) 407; GFX1150-NEXT: s_clause 0x1 408; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3] 409; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 410; GFX1150-NEXT: s_waitcnt vmcnt(0) 411; GFX1150-NEXT: v_rcp_f16_e32 v3, v2 412; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 413; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3 414; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 415; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 416; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 417; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 418; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1] 419; GFX1150-NEXT: s_endpgm 420 ptr addrspace(1) %in2) #0 { 421 %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 422 %r0 = load half, ptr addrspace(1) %in1, align 4 423 %r1 = load half, ptr addrspace(1) %gep2, align 4 424 
%r2 = frem fast half %r0, %r1 425 store half %r2, ptr addrspace(1) %out, align 4 426 ret void 427} 428 429define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, 430; SI-LABEL: unsafe_frem_f16: 431; SI: ; %bb.0: 432; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 433; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 434; SI-NEXT: s_mov_b32 s11, 0xf000 435; SI-NEXT: s_mov_b32 s10, -1 436; SI-NEXT: s_waitcnt lgkmcnt(0) 437; SI-NEXT: s_mov_b32 s8, s0 438; SI-NEXT: s_mov_b32 s9, s1 439; SI-NEXT: s_mov_b32 s0, s2 440; SI-NEXT: s_mov_b32 s1, s3 441; SI-NEXT: s_mov_b32 s2, s10 442; SI-NEXT: s_mov_b32 s3, s11 443; SI-NEXT: s_mov_b32 s6, s10 444; SI-NEXT: s_mov_b32 s7, s11 445; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 446; SI-NEXT: s_waitcnt vmcnt(0) 447; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 448; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 449; SI-NEXT: s_waitcnt vmcnt(0) 450; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 451; SI-NEXT: v_rcp_f32_e32 v2, v1 452; SI-NEXT: v_mul_f32_e32 v2, v0, v2 453; SI-NEXT: v_trunc_f32_e32 v2, v2 454; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 455; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 456; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 457; SI-NEXT: s_endpgm 458; 459; CI-LABEL: unsafe_frem_f16: 460; CI: ; %bb.0: 461; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 462; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 463; CI-NEXT: s_mov_b32 s11, 0xf000 464; CI-NEXT: s_mov_b32 s10, -1 465; CI-NEXT: s_mov_b32 s6, s10 466; CI-NEXT: s_mov_b32 s7, s11 467; CI-NEXT: s_waitcnt lgkmcnt(0) 468; CI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:8 469; CI-NEXT: s_mov_b32 s8, s0 470; CI-NEXT: s_mov_b32 s9, s1 471; CI-NEXT: s_mov_b32 s0, s2 472; CI-NEXT: s_mov_b32 s1, s3 473; CI-NEXT: s_mov_b32 s2, s10 474; CI-NEXT: s_mov_b32 s3, s11 475; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 476; CI-NEXT: s_waitcnt vmcnt(1) 477; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 478; CI-NEXT: v_rcp_f32_e32 v2, v1 479; CI-NEXT: s_waitcnt vmcnt(0) 480; CI-NEXT: 
v_cvt_f32_f16_e32 v0, v0 481; CI-NEXT: v_mul_f32_e32 v2, v0, v2 482; CI-NEXT: v_trunc_f32_e32 v2, v2 483; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 484; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 485; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 486; CI-NEXT: s_endpgm 487; 488; VI-LABEL: unsafe_frem_f16: 489; VI: ; %bb.0: 490; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 491; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 492; VI-NEXT: s_waitcnt lgkmcnt(0) 493; VI-NEXT: v_mov_b32_e32 v0, s0 494; VI-NEXT: s_add_u32 s0, s4, 8 495; VI-NEXT: v_mov_b32_e32 v1, s1 496; VI-NEXT: v_mov_b32_e32 v2, s2 497; VI-NEXT: v_mov_b32_e32 v3, s3 498; VI-NEXT: s_addc_u32 s1, s5, 0 499; VI-NEXT: flat_load_ushort v4, v[2:3] 500; VI-NEXT: v_mov_b32_e32 v3, s1 501; VI-NEXT: v_mov_b32_e32 v2, s0 502; VI-NEXT: flat_load_ushort v2, v[2:3] 503; VI-NEXT: s_waitcnt vmcnt(0) 504; VI-NEXT: v_rcp_f16_e32 v3, v2 505; VI-NEXT: v_mul_f16_e32 v3, v4, v3 506; VI-NEXT: v_trunc_f16_e32 v3, v3 507; VI-NEXT: v_fma_f16 v2, -v3, v2, v4 508; VI-NEXT: flat_store_short v[0:1], v2 509; VI-NEXT: s_endpgm 510; 511; GFX9-LABEL: unsafe_frem_f16: 512; GFX9: ; %bb.0: 513; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 514; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 515; GFX9-NEXT: v_mov_b32_e32 v0, 0 516; GFX9-NEXT: s_waitcnt lgkmcnt(0) 517; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] 518; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 519; GFX9-NEXT: s_waitcnt vmcnt(0) 520; GFX9-NEXT: v_rcp_f16_e32 v3, v2 521; GFX9-NEXT: v_mul_f16_e32 v3, v1, v3 522; GFX9-NEXT: v_trunc_f16_e32 v3, v3 523; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 524; GFX9-NEXT: global_store_short v0, v1, s[0:1] 525; GFX9-NEXT: s_endpgm 526; 527; GFX10-LABEL: unsafe_frem_f16: 528; GFX10: ; %bb.0: 529; GFX10-NEXT: s_clause 0x1 530; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 531; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 532; GFX10-NEXT: v_mov_b32_e32 v0, 0 533; GFX10-NEXT: s_waitcnt lgkmcnt(0) 534; GFX10-NEXT: s_clause 0x1 535; GFX10-NEXT: 
global_load_ushort v1, v0, s[2:3] 536; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 537; GFX10-NEXT: s_waitcnt vmcnt(0) 538; GFX10-NEXT: v_rcp_f16_e32 v3, v2 539; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3 540; GFX10-NEXT: v_trunc_f16_e32 v3, v3 541; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 542; GFX10-NEXT: global_store_short v0, v1, s[0:1] 543; GFX10-NEXT: s_endpgm 544; 545; GFX11-LABEL: unsafe_frem_f16: 546; GFX11: ; %bb.0: 547; GFX11-NEXT: s_clause 0x1 548; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 549; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 550; GFX11-NEXT: v_mov_b32_e32 v0, 0 551; GFX11-NEXT: s_waitcnt lgkmcnt(0) 552; GFX11-NEXT: s_clause 0x1 553; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] 554; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 555; GFX11-NEXT: s_waitcnt vmcnt(0) 556; GFX11-NEXT: v_rcp_f16_e32 v3, v2 557; GFX11-NEXT: s_waitcnt_depctr 0xfff 558; GFX11-NEXT: v_mul_f16_e32 v3, v1, v3 559; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 560; GFX11-NEXT: v_trunc_f16_e32 v3, v3 561; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1 562; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 563; GFX11-NEXT: s_endpgm 564; 565; GFX1150-LABEL: unsafe_frem_f16: 566; GFX1150: ; %bb.0: 567; GFX1150-NEXT: s_clause 0x1 568; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 569; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 570; GFX1150-NEXT: v_mov_b32_e32 v0, 0 571; GFX1150-NEXT: s_waitcnt lgkmcnt(0) 572; GFX1150-NEXT: s_clause 0x1 573; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3] 574; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 575; GFX1150-NEXT: s_waitcnt vmcnt(0) 576; GFX1150-NEXT: v_rcp_f16_e32 v3, v2 577; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 578; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3 579; GFX1150-NEXT: v_trunc_f16_e32 v3, v3 580; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 581; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3 582; 
GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2 583; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1] 584; GFX1150-NEXT: s_endpgm 585 ptr addrspace(1) %in2) #1 { 586 %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 587 %r0 = load half, ptr addrspace(1) %in1, align 4 588 %r1 = load half, ptr addrspace(1) %gep2, align 4 589 %r2 = frem afn half %r0, %r1 590 store half %r2, ptr addrspace(1) %out, align 4 591 ret void 592} 593 594define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, 595; SI-LABEL: frem_f32: 596; SI: ; %bb.0: 597; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 598; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 599; SI-NEXT: s_mov_b32 s11, 0xf000 600; SI-NEXT: s_mov_b32 s10, -1 601; SI-NEXT: s_waitcnt lgkmcnt(0) 602; SI-NEXT: s_mov_b32 s8, s0 603; SI-NEXT: s_mov_b32 s9, s1 604; SI-NEXT: s_mov_b32 s0, s2 605; SI-NEXT: s_mov_b32 s1, s3 606; SI-NEXT: s_mov_b32 s2, s10 607; SI-NEXT: s_mov_b32 s3, s11 608; SI-NEXT: s_mov_b32 s6, s10 609; SI-NEXT: s_mov_b32 s7, s11 610; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 611; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 612; SI-NEXT: s_waitcnt vmcnt(0) 613; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 614; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 615; SI-NEXT: v_rcp_f32_e32 v4, v3 616; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 617; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 618; SI-NEXT: v_fma_f32 v4, v5, v4, v4 619; SI-NEXT: v_mul_f32_e32 v5, v2, v4 620; SI-NEXT: v_fma_f32 v6, -v3, v5, v2 621; SI-NEXT: v_fma_f32 v5, v6, v4, v5 622; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 623; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 624; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 625; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 626; SI-NEXT: v_trunc_f32_e32 v2, v2 627; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 628; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 629; SI-NEXT: s_endpgm 630; 631; CI-LABEL: frem_f32: 632; CI: ; %bb.0: 633; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 634; CI-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0xd 635; CI-NEXT: s_mov_b32 s11, 0xf000 636; CI-NEXT: s_mov_b32 s10, -1 637; CI-NEXT: s_mov_b32 s6, s10 638; CI-NEXT: s_waitcnt lgkmcnt(0) 639; CI-NEXT: s_mov_b32 s8, s0 640; CI-NEXT: s_mov_b32 s9, s1 641; CI-NEXT: s_mov_b32 s0, s2 642; CI-NEXT: s_mov_b32 s1, s3 643; CI-NEXT: s_mov_b32 s2, s10 644; CI-NEXT: s_mov_b32 s3, s11 645; CI-NEXT: s_mov_b32 s7, s11 646; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 647; CI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 648; CI-NEXT: s_waitcnt vmcnt(0) 649; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 650; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 651; CI-NEXT: v_rcp_f32_e32 v4, v3 652; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 653; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 654; CI-NEXT: v_fma_f32 v4, v5, v4, v4 655; CI-NEXT: v_mul_f32_e32 v5, v2, v4 656; CI-NEXT: v_fma_f32 v6, -v3, v5, v2 657; CI-NEXT: v_fma_f32 v5, v6, v4, v5 658; CI-NEXT: v_fma_f32 v2, -v3, v5, v2 659; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 660; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 661; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 662; CI-NEXT: v_trunc_f32_e32 v2, v2 663; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 664; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 665; CI-NEXT: s_endpgm 666; 667; VI-LABEL: frem_f32: 668; VI: ; %bb.0: 669; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 670; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 671; VI-NEXT: s_waitcnt lgkmcnt(0) 672; VI-NEXT: v_mov_b32_e32 v0, s0 673; VI-NEXT: s_add_u32 s0, s4, 16 674; VI-NEXT: v_mov_b32_e32 v1, s1 675; VI-NEXT: v_mov_b32_e32 v2, s2 676; VI-NEXT: v_mov_b32_e32 v3, s3 677; VI-NEXT: s_addc_u32 s1, s5, 0 678; VI-NEXT: flat_load_dword v4, v[2:3] 679; VI-NEXT: v_mov_b32_e32 v3, s1 680; VI-NEXT: v_mov_b32_e32 v2, s0 681; VI-NEXT: flat_load_dword v2, v[2:3] 682; VI-NEXT: s_waitcnt vmcnt(0) 683; VI-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v4 684; VI-NEXT: v_div_scale_f32 v3, vcc, v4, v2, v4 685; VI-NEXT: v_rcp_f32_e32 v6, v5 686; 
VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 687; VI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 688; VI-NEXT: v_fma_f32 v6, v7, v6, v6 689; VI-NEXT: v_mul_f32_e32 v7, v3, v6 690; VI-NEXT: v_fma_f32 v8, -v5, v7, v3 691; VI-NEXT: v_fma_f32 v7, v8, v6, v7 692; VI-NEXT: v_fma_f32 v3, -v5, v7, v3 693; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 694; VI-NEXT: v_div_fmas_f32 v3, v3, v6, v7 695; VI-NEXT: v_div_fixup_f32 v3, v3, v2, v4 696; VI-NEXT: v_trunc_f32_e32 v3, v3 697; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 698; VI-NEXT: flat_store_dword v[0:1], v2 699; VI-NEXT: s_endpgm 700; 701; GFX9-LABEL: frem_f32: 702; GFX9: ; %bb.0: 703; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 704; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 705; GFX9-NEXT: v_mov_b32_e32 v0, 0 706; GFX9-NEXT: s_waitcnt lgkmcnt(0) 707; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 708; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:16 709; GFX9-NEXT: s_waitcnt vmcnt(0) 710; GFX9-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, v1 711; GFX9-NEXT: v_div_scale_f32 v3, vcc, v1, v2, v1 712; GFX9-NEXT: v_rcp_f32_e32 v5, v4 713; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 714; GFX9-NEXT: v_fma_f32 v6, -v4, v5, 1.0 715; GFX9-NEXT: v_fma_f32 v5, v6, v5, v5 716; GFX9-NEXT: v_mul_f32_e32 v6, v3, v5 717; GFX9-NEXT: v_fma_f32 v7, -v4, v6, v3 718; GFX9-NEXT: v_fma_f32 v6, v7, v5, v6 719; GFX9-NEXT: v_fma_f32 v3, -v4, v6, v3 720; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 721; GFX9-NEXT: v_div_fmas_f32 v3, v3, v5, v6 722; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v1 723; GFX9-NEXT: v_trunc_f32_e32 v3, v3 724; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1 725; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 726; GFX9-NEXT: s_endpgm 727; 728; GFX10-LABEL: frem_f32: 729; GFX10: ; %bb.0: 730; GFX10-NEXT: s_clause 0x1 731; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 732; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 733; GFX10-NEXT: v_mov_b32_e32 v0, 0 734; GFX10-NEXT: s_waitcnt lgkmcnt(0) 735; 
GFX10-NEXT: s_clause 0x1 736; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 737; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:16 738; GFX10-NEXT: s_waitcnt vmcnt(0) 739; GFX10-NEXT: v_div_scale_f32 v4, s2, v2, v2, v1 740; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 741; GFX10-NEXT: v_rcp_f32_e32 v5, v4 742; GFX10-NEXT: s_denorm_mode 15 743; GFX10-NEXT: v_fma_f32 v6, -v4, v5, 1.0 744; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v5 745; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5 746; GFX10-NEXT: v_fma_f32 v7, -v4, v6, v3 747; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v5 748; GFX10-NEXT: v_fma_f32 v3, -v4, v6, v3 749; GFX10-NEXT: s_denorm_mode 12 750; GFX10-NEXT: v_div_fmas_f32 v3, v3, v5, v6 751; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v1 752; GFX10-NEXT: v_trunc_f32_e32 v3, v3 753; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 754; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 755; GFX10-NEXT: s_endpgm 756; 757; GFX11-LABEL: frem_f32: 758; GFX11: ; %bb.0: 759; GFX11-NEXT: s_clause 0x1 760; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 761; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 762; GFX11-NEXT: v_mov_b32_e32 v0, 0 763; GFX11-NEXT: s_waitcnt lgkmcnt(0) 764; GFX11-NEXT: s_clause 0x1 765; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 766; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 767; GFX11-NEXT: s_waitcnt vmcnt(0) 768; GFX11-NEXT: v_div_scale_f32 v4, null, v2, v2, v1 769; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 770; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 771; GFX11-NEXT: v_rcp_f32_e32 v5, v4 772; GFX11-NEXT: s_denorm_mode 15 773; GFX11-NEXT: s_waitcnt_depctr 0xfff 774; GFX11-NEXT: v_fma_f32 v6, -v4, v5, 1.0 775; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v5 776; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 777; GFX11-NEXT: v_mul_f32_e32 v6, v3, v5 778; GFX11-NEXT: v_fma_f32 v7, -v4, v6, v3 779; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 780; 
GFX11-NEXT: v_fmac_f32_e32 v6, v7, v5 781; GFX11-NEXT: v_fma_f32 v3, -v4, v6, v3 782; GFX11-NEXT: s_denorm_mode 12 783; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 784; GFX11-NEXT: v_div_fmas_f32 v3, v3, v5, v6 785; GFX11-NEXT: v_div_fixup_f32 v3, v3, v2, v1 786; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 787; GFX11-NEXT: v_trunc_f32_e32 v3, v3 788; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1 789; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 790; GFX11-NEXT: s_endpgm 791; 792; GFX1150-LABEL: frem_f32: 793; GFX1150: ; %bb.0: 794; GFX1150-NEXT: s_clause 0x1 795; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 796; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 797; GFX1150-NEXT: v_mov_b32_e32 v0, 0 798; GFX1150-NEXT: s_waitcnt lgkmcnt(0) 799; GFX1150-NEXT: s_clause 0x1 800; GFX1150-NEXT: global_load_b32 v1, v0, s[2:3] 801; GFX1150-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 802; GFX1150-NEXT: s_waitcnt vmcnt(0) 803; GFX1150-NEXT: v_div_scale_f32 v4, null, v2, v2, v1 804; GFX1150-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 805; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) 806; GFX1150-NEXT: v_rcp_f32_e32 v5, v4 807; GFX1150-NEXT: s_denorm_mode 15 808; GFX1150-NEXT: v_fma_f32 v6, -v4, v5, 1.0 809; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 810; GFX1150-NEXT: v_fmac_f32_e32 v5, v6, v5 811; GFX1150-NEXT: v_mul_f32_e32 v6, v3, v5 812; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 813; GFX1150-NEXT: v_fma_f32 v7, -v4, v6, v3 814; GFX1150-NEXT: v_fmac_f32_e32 v6, v7, v5 815; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 816; GFX1150-NEXT: v_fma_f32 v3, -v4, v6, v3 817; GFX1150-NEXT: s_denorm_mode 12 818; GFX1150-NEXT: v_div_fmas_f32 v3, v3, v5, v6 819; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
820; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v2, v1 821; GFX1150-NEXT: v_trunc_f32_e32 v3, v3 822; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 823; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 824; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2 825; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1] 826; GFX1150-NEXT: s_endpgm 827 ptr addrspace(1) %in2) #0 { 828 %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 829 %r0 = load float, ptr addrspace(1) %in1, align 4 830 %r1 = load float, ptr addrspace(1) %gep2, align 4 831 %r2 = frem float %r0, %r1 832 store float %r2, ptr addrspace(1) %out, align 4 833 ret void 834} 835 836define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, 837; SI-LABEL: fast_frem_f32: 838; SI: ; %bb.0: 839; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 840; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 841; SI-NEXT: s_mov_b32 s11, 0xf000 842; SI-NEXT: s_mov_b32 s10, -1 843; SI-NEXT: s_waitcnt lgkmcnt(0) 844; SI-NEXT: s_mov_b32 s8, s0 845; SI-NEXT: s_mov_b32 s9, s1 846; SI-NEXT: s_mov_b32 s0, s2 847; SI-NEXT: s_mov_b32 s1, s3 848; SI-NEXT: s_mov_b32 s2, s10 849; SI-NEXT: s_mov_b32 s3, s11 850; SI-NEXT: s_mov_b32 s6, s10 851; SI-NEXT: s_mov_b32 s7, s11 852; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 853; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 854; SI-NEXT: s_waitcnt vmcnt(0) 855; SI-NEXT: v_rcp_f32_e32 v2, v1 856; SI-NEXT: v_mul_f32_e32 v2, v0, v2 857; SI-NEXT: v_trunc_f32_e32 v2, v2 858; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 859; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 860; SI-NEXT: s_endpgm 861; 862; CI-LABEL: fast_frem_f32: 863; CI: ; %bb.0: 864; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 865; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 866; CI-NEXT: s_mov_b32 s11, 0xf000 867; CI-NEXT: s_mov_b32 s10, -1 868; CI-NEXT: s_mov_b32 s6, s10 869; CI-NEXT: s_waitcnt lgkmcnt(0) 870; CI-NEXT: s_mov_b32 s8, s0 871; CI-NEXT: s_mov_b32 s9, s1 872; CI-NEXT: s_mov_b32 
s0, s2 873; CI-NEXT: s_mov_b32 s1, s3 874; CI-NEXT: s_mov_b32 s2, s10 875; CI-NEXT: s_mov_b32 s3, s11 876; CI-NEXT: s_mov_b32 s7, s11 877; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 878; CI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 879; CI-NEXT: s_waitcnt vmcnt(0) 880; CI-NEXT: v_rcp_f32_e32 v2, v1 881; CI-NEXT: v_mul_f32_e32 v2, v0, v2 882; CI-NEXT: v_trunc_f32_e32 v2, v2 883; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 884; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 885; CI-NEXT: s_endpgm 886; 887; VI-LABEL: fast_frem_f32: 888; VI: ; %bb.0: 889; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 890; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 891; VI-NEXT: s_waitcnt lgkmcnt(0) 892; VI-NEXT: v_mov_b32_e32 v0, s0 893; VI-NEXT: s_add_u32 s0, s4, 16 894; VI-NEXT: v_mov_b32_e32 v1, s1 895; VI-NEXT: v_mov_b32_e32 v2, s2 896; VI-NEXT: v_mov_b32_e32 v3, s3 897; VI-NEXT: s_addc_u32 s1, s5, 0 898; VI-NEXT: flat_load_dword v4, v[2:3] 899; VI-NEXT: v_mov_b32_e32 v3, s1 900; VI-NEXT: v_mov_b32_e32 v2, s0 901; VI-NEXT: flat_load_dword v2, v[2:3] 902; VI-NEXT: s_waitcnt vmcnt(0) 903; VI-NEXT: v_rcp_f32_e32 v3, v2 904; VI-NEXT: v_mul_f32_e32 v3, v4, v3 905; VI-NEXT: v_trunc_f32_e32 v3, v3 906; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 907; VI-NEXT: flat_store_dword v[0:1], v2 908; VI-NEXT: s_endpgm 909; 910; GFX9-LABEL: fast_frem_f32: 911; GFX9: ; %bb.0: 912; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 913; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 914; GFX9-NEXT: v_mov_b32_e32 v0, 0 915; GFX9-NEXT: s_waitcnt lgkmcnt(0) 916; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 917; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:16 918; GFX9-NEXT: s_waitcnt vmcnt(0) 919; GFX9-NEXT: v_rcp_f32_e32 v3, v2 920; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 921; GFX9-NEXT: v_trunc_f32_e32 v3, v3 922; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1 923; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 924; GFX9-NEXT: s_endpgm 925; 926; GFX10-LABEL: fast_frem_f32: 927; GFX10: ; %bb.0: 928; GFX10-NEXT: 
s_clause 0x1 929; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 930; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 931; GFX10-NEXT: v_mov_b32_e32 v0, 0 932; GFX10-NEXT: s_waitcnt lgkmcnt(0) 933; GFX10-NEXT: s_clause 0x1 934; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 935; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:16 936; GFX10-NEXT: s_waitcnt vmcnt(0) 937; GFX10-NEXT: v_rcp_f32_e32 v3, v2 938; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 939; GFX10-NEXT: v_trunc_f32_e32 v3, v3 940; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 941; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 942; GFX10-NEXT: s_endpgm 943; 944; GFX11-LABEL: fast_frem_f32: 945; GFX11: ; %bb.0: 946; GFX11-NEXT: s_clause 0x1 947; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 948; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 949; GFX11-NEXT: v_mov_b32_e32 v0, 0 950; GFX11-NEXT: s_waitcnt lgkmcnt(0) 951; GFX11-NEXT: s_clause 0x1 952; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 953; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 954; GFX11-NEXT: s_waitcnt vmcnt(0) 955; GFX11-NEXT: v_rcp_f32_e32 v3, v2 956; GFX11-NEXT: s_waitcnt_depctr 0xfff 957; GFX11-NEXT: v_mul_f32_e32 v3, v1, v3 958; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 959; GFX11-NEXT: v_trunc_f32_e32 v3, v3 960; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1 961; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 962; GFX11-NEXT: s_endpgm 963; 964; GFX1150-LABEL: fast_frem_f32: 965; GFX1150: ; %bb.0: 966; GFX1150-NEXT: s_clause 0x1 967; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 968; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 969; GFX1150-NEXT: v_mov_b32_e32 v0, 0 970; GFX1150-NEXT: s_waitcnt lgkmcnt(0) 971; GFX1150-NEXT: s_clause 0x1 972; GFX1150-NEXT: global_load_b32 v1, v0, s[2:3] 973; GFX1150-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 974; GFX1150-NEXT: s_waitcnt vmcnt(0) 975; GFX1150-NEXT: v_rcp_f32_e32 v3, v2 976; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) 977; GFX1150-NEXT: v_mul_f32_e32 v3, v1, v3 978; GFX1150-NEXT: v_trunc_f32_e32 v3, v3 979; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 980; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 981; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2 982; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1] 983; GFX1150-NEXT: s_endpgm 984 ptr addrspace(1) %in2) #0 { 985 %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 986 %r0 = load float, ptr addrspace(1) %in1, align 4 987 %r1 = load float, ptr addrspace(1) %gep2, align 4 988 %r2 = frem fast float %r0, %r1 989 store float %r2, ptr addrspace(1) %out, align 4 990 ret void 991} 992 993define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, 994; SI-LABEL: unsafe_frem_f32: 995; SI: ; %bb.0: 996; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 997; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 998; SI-NEXT: s_mov_b32 s11, 0xf000 999; SI-NEXT: s_mov_b32 s10, -1 1000; SI-NEXT: s_waitcnt lgkmcnt(0) 1001; SI-NEXT: s_mov_b32 s8, s0 1002; SI-NEXT: s_mov_b32 s9, s1 1003; SI-NEXT: s_mov_b32 s0, s2 1004; SI-NEXT: s_mov_b32 s1, s3 1005; SI-NEXT: s_mov_b32 s2, s10 1006; SI-NEXT: s_mov_b32 s3, s11 1007; SI-NEXT: s_mov_b32 s6, s10 1008; SI-NEXT: s_mov_b32 s7, s11 1009; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 1010; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 1011; SI-NEXT: s_waitcnt vmcnt(0) 1012; SI-NEXT: v_rcp_f32_e32 v2, v1 1013; SI-NEXT: v_mul_f32_e32 v2, v0, v2 1014; SI-NEXT: v_trunc_f32_e32 v2, v2 1015; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 1016; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 1017; SI-NEXT: s_endpgm 1018; 1019; CI-LABEL: unsafe_frem_f32: 1020; CI: ; %bb.0: 1021; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1022; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1023; CI-NEXT: s_mov_b32 s11, 0xf000 1024; CI-NEXT: s_mov_b32 s10, -1 1025; CI-NEXT: s_mov_b32 s6, s10 1026; CI-NEXT: s_waitcnt lgkmcnt(0) 1027; CI-NEXT: s_mov_b32 s8, s0 
1028; CI-NEXT: s_mov_b32 s9, s1 1029; CI-NEXT: s_mov_b32 s0, s2 1030; CI-NEXT: s_mov_b32 s1, s3 1031; CI-NEXT: s_mov_b32 s2, s10 1032; CI-NEXT: s_mov_b32 s3, s11 1033; CI-NEXT: s_mov_b32 s7, s11 1034; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 1035; CI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 1036; CI-NEXT: s_waitcnt vmcnt(0) 1037; CI-NEXT: v_rcp_f32_e32 v2, v1 1038; CI-NEXT: v_mul_f32_e32 v2, v0, v2 1039; CI-NEXT: v_trunc_f32_e32 v2, v2 1040; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 1041; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 1042; CI-NEXT: s_endpgm 1043; 1044; VI-LABEL: unsafe_frem_f32: 1045; VI: ; %bb.0: 1046; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1047; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1048; VI-NEXT: s_waitcnt lgkmcnt(0) 1049; VI-NEXT: v_mov_b32_e32 v0, s0 1050; VI-NEXT: s_add_u32 s0, s4, 16 1051; VI-NEXT: v_mov_b32_e32 v1, s1 1052; VI-NEXT: v_mov_b32_e32 v2, s2 1053; VI-NEXT: v_mov_b32_e32 v3, s3 1054; VI-NEXT: s_addc_u32 s1, s5, 0 1055; VI-NEXT: flat_load_dword v4, v[2:3] 1056; VI-NEXT: v_mov_b32_e32 v3, s1 1057; VI-NEXT: v_mov_b32_e32 v2, s0 1058; VI-NEXT: flat_load_dword v2, v[2:3] 1059; VI-NEXT: s_waitcnt vmcnt(0) 1060; VI-NEXT: v_rcp_f32_e32 v3, v2 1061; VI-NEXT: v_mul_f32_e32 v3, v4, v3 1062; VI-NEXT: v_trunc_f32_e32 v3, v3 1063; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 1064; VI-NEXT: flat_store_dword v[0:1], v2 1065; VI-NEXT: s_endpgm 1066; 1067; GFX9-LABEL: unsafe_frem_f32: 1068; GFX9: ; %bb.0: 1069; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1070; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1071; GFX9-NEXT: v_mov_b32_e32 v0, 0 1072; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1073; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 1074; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:16 1075; GFX9-NEXT: s_waitcnt vmcnt(0) 1076; GFX9-NEXT: v_rcp_f32_e32 v3, v2 1077; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 1078; GFX9-NEXT: v_trunc_f32_e32 v3, v3 1079; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1 1080; GFX9-NEXT: global_store_dword v0, v1, 
s[0:1] 1081; GFX9-NEXT: s_endpgm 1082; 1083; GFX10-LABEL: unsafe_frem_f32: 1084; GFX10: ; %bb.0: 1085; GFX10-NEXT: s_clause 0x1 1086; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1087; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1088; GFX10-NEXT: v_mov_b32_e32 v0, 0 1089; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1090; GFX10-NEXT: s_clause 0x1 1091; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 1092; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:16 1093; GFX10-NEXT: s_waitcnt vmcnt(0) 1094; GFX10-NEXT: v_rcp_f32_e32 v3, v2 1095; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3 1096; GFX10-NEXT: v_trunc_f32_e32 v3, v3 1097; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 1098; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1099; GFX10-NEXT: s_endpgm 1100; 1101; GFX11-LABEL: unsafe_frem_f32: 1102; GFX11: ; %bb.0: 1103; GFX11-NEXT: s_clause 0x1 1104; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1105; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1106; GFX11-NEXT: v_mov_b32_e32 v0, 0 1107; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1108; GFX11-NEXT: s_clause 0x1 1109; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1110; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 1111; GFX11-NEXT: s_waitcnt vmcnt(0) 1112; GFX11-NEXT: v_rcp_f32_e32 v3, v2 1113; GFX11-NEXT: s_waitcnt_depctr 0xfff 1114; GFX11-NEXT: v_mul_f32_e32 v3, v1, v3 1115; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1116; GFX11-NEXT: v_trunc_f32_e32 v3, v3 1117; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1 1118; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1119; GFX11-NEXT: s_endpgm 1120; 1121; GFX1150-LABEL: unsafe_frem_f32: 1122; GFX1150: ; %bb.0: 1123; GFX1150-NEXT: s_clause 0x1 1124; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1125; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1126; GFX1150-NEXT: v_mov_b32_e32 v0, 0 1127; GFX1150-NEXT: s_waitcnt lgkmcnt(0) 1128; GFX1150-NEXT: s_clause 0x1 1129; GFX1150-NEXT: global_load_b32 v1, v0, s[2:3] 1130; GFX1150-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 1131; 
GFX1150-NEXT: s_waitcnt vmcnt(0) 1132; GFX1150-NEXT: v_rcp_f32_e32 v3, v2 1133; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1134; GFX1150-NEXT: v_mul_f32_e32 v3, v1, v3 1135; GFX1150-NEXT: v_trunc_f32_e32 v3, v3 1136; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1137; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 1138; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2 1139; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1] 1140; GFX1150-NEXT: s_endpgm 1141 ptr addrspace(1) %in2) #1 { 1142 %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 1143 %r0 = load float, ptr addrspace(1) %in1, align 4 1144 %r1 = load float, ptr addrspace(1) %gep2, align 4 1145 %r2 = frem afn float %r0, %r1 1146 store float %r2, ptr addrspace(1) %out, align 4 1147 ret void 1148} 1149 1150define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, 1151; SI-LABEL: frem_f64: 1152; SI: ; %bb.0: 1153; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1154; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1155; SI-NEXT: s_mov_b32 s7, 0xf000 1156; SI-NEXT: s_mov_b32 s6, -1 1157; SI-NEXT: s_waitcnt lgkmcnt(0) 1158; SI-NEXT: s_mov_b32 s4, s0 1159; SI-NEXT: s_mov_b32 s5, s1 1160; SI-NEXT: s_mov_b32 s0, s2 1161; SI-NEXT: s_mov_b32 s1, s3 1162; SI-NEXT: s_mov_b32 s2, s6 1163; SI-NEXT: s_mov_b32 s3, s7 1164; SI-NEXT: s_mov_b32 s10, s6 1165; SI-NEXT: s_mov_b32 s11, s7 1166; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 1167; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 1168; SI-NEXT: s_waitcnt vmcnt(0) 1169; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] 1170; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1171; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1172; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1173; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1174; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1175; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1] 1176; 
SI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 1177; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9] 1178; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 1179; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v9 1180; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc 1181; SI-NEXT: s_nop 1 1182; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11] 1183; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 1184; SI-NEXT: v_readfirstlane_b32 s2, v5 1185; SI-NEXT: s_bfe_u32 s0, s2, 0xb0014 1186; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01 1187; SI-NEXT: s_mov_b32 s1, 0xfffff 1188; SI-NEXT: s_mov_b32 s0, s6 1189; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 1190; SI-NEXT: v_not_b32_e32 v6, s0 1191; SI-NEXT: v_and_b32_e32 v6, v4, v6 1192; SI-NEXT: v_not_b32_e32 v7, s1 1193; SI-NEXT: v_and_b32_e32 v5, v5, v7 1194; SI-NEXT: s_and_b32 s0, s2, 0x80000000 1195; SI-NEXT: s_cmp_lt_i32 s3, 0 1196; SI-NEXT: s_cselect_b64 vcc, -1, 0 1197; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 1198; SI-NEXT: v_mov_b32_e32 v7, s0 1199; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc 1200; SI-NEXT: s_cmp_gt_i32 s3, 51 1201; SI-NEXT: s_cselect_b64 vcc, -1, 0 1202; SI-NEXT: v_mov_b32_e32 v7, s2 1203; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc 1204; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 1205; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1206; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1207; SI-NEXT: s_endpgm 1208; 1209; CI-LABEL: frem_f64: 1210; CI: ; %bb.0: 1211; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1212; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1213; CI-NEXT: s_mov_b32 s11, 0xf000 1214; CI-NEXT: s_mov_b32 s10, -1 1215; CI-NEXT: s_mov_b32 s6, s10 1216; CI-NEXT: s_waitcnt lgkmcnt(0) 1217; CI-NEXT: s_mov_b32 s8, s0 1218; CI-NEXT: s_mov_b32 s9, s1 1219; CI-NEXT: s_mov_b32 s0, s2 1220; CI-NEXT: s_mov_b32 s1, s3 1221; CI-NEXT: s_mov_b32 s2, s10 1222; CI-NEXT: s_mov_b32 s3, s11 1223; CI-NEXT: s_mov_b32 s7, s11 1224; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 1225; CI-NEXT: buffer_load_dwordx2 v[2:3], 
off, s[4:7], 0 1226; CI-NEXT: s_waitcnt vmcnt(0) 1227; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] 1228; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1229; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1230; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1231; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1232; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1233; CI-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] 1234; CI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 1235; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] 1236; CI-NEXT: s_nop 1 1237; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] 1238; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 1239; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1240; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1241; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 1242; CI-NEXT: s_endpgm 1243; 1244; VI-LABEL: frem_f64: 1245; VI: ; %bb.0: 1246; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1247; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1248; VI-NEXT: s_waitcnt lgkmcnt(0) 1249; VI-NEXT: v_mov_b32_e32 v2, s2 1250; VI-NEXT: v_mov_b32_e32 v3, s3 1251; VI-NEXT: v_mov_b32_e32 v4, s4 1252; VI-NEXT: v_mov_b32_e32 v5, s5 1253; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1254; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 1255; VI-NEXT: v_mov_b32_e32 v0, s0 1256; VI-NEXT: v_mov_b32_e32 v1, s1 1257; VI-NEXT: s_waitcnt vmcnt(0) 1258; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3] 1259; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] 1260; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 1261; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 1262; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 1263; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] 1264; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3] 1265; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] 1266; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] 1267; VI-NEXT: s_nop 1 1268; VI-NEXT: 
v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] 1269; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3] 1270; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 1271; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] 1272; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1273; VI-NEXT: s_endpgm 1274; 1275; GFX9-LABEL: frem_f64: 1276; GFX9: ; %bb.0: 1277; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1278; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1279; GFX9-NEXT: v_mov_b32_e32 v12, 0 1280; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1281; GFX9-NEXT: global_load_dwordx2 v[0:1], v12, s[2:3] 1282; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[6:7] 1283; GFX9-NEXT: s_waitcnt vmcnt(0) 1284; GFX9-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], v[0:1] 1285; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1286; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1287; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1288; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1289; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1290; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] 1291; GFX9-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 1292; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] 1293; GFX9-NEXT: s_nop 1 1294; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] 1295; GFX9-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 1296; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1297; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1298; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] 1299; GFX9-NEXT: s_endpgm 1300; 1301; GFX10-LABEL: frem_f64: 1302; GFX10: ; %bb.0: 1303; GFX10-NEXT: s_clause 0x1 1304; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1305; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1306; GFX10-NEXT: v_mov_b32_e32 v12, 0 1307; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1308; GFX10-NEXT: s_clause 0x1 1309; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[2:3] 1310; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[6:7] 1311; GFX10-NEXT: s_waitcnt 
vmcnt(0) 1312; GFX10-NEXT: v_div_scale_f64 v[4:5], s2, v[2:3], v[2:3], v[0:1] 1313; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1314; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1315; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1316; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1317; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1318; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] 1319; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 1320; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] 1321; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] 1322; GFX10-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 1323; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1324; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1325; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] 1326; GFX10-NEXT: s_endpgm 1327; 1328; GFX11-LABEL: frem_f64: 1329; GFX11: ; %bb.0: 1330; GFX11-NEXT: s_clause 0x1 1331; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1332; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1333; GFX11-NEXT: v_mov_b32_e32 v12, 0 1334; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1335; GFX11-NEXT: s_clause 0x1 1336; GFX11-NEXT: global_load_b64 v[0:1], v12, s[2:3] 1337; GFX11-NEXT: global_load_b64 v[2:3], v12, s[4:5] 1338; GFX11-NEXT: s_waitcnt vmcnt(0) 1339; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] 1340; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 1341; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1342; GFX11-NEXT: s_waitcnt_depctr 0xfff 1343; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1344; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1345; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1346; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1347; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1348; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] 1349; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1350; GFX11-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 1351; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] 1352; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1353; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] 1354; GFX11-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 1355; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1356; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1357; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1358; GFX11-NEXT: global_store_b64 v12, v[0:1], s[0:1] 1359; GFX11-NEXT: s_endpgm 1360; 1361; GFX1150-LABEL: frem_f64: 1362; GFX1150: ; %bb.0: 1363; GFX1150-NEXT: s_clause 0x1 1364; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1365; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1366; GFX1150-NEXT: v_mov_b32_e32 v12, 0 1367; GFX1150-NEXT: s_waitcnt lgkmcnt(0) 1368; GFX1150-NEXT: s_clause 0x1 1369; GFX1150-NEXT: global_load_b64 v[0:1], v12, s[2:3] 1370; GFX1150-NEXT: global_load_b64 v[2:3], v12, s[4:5] 1371; GFX1150-NEXT: s_waitcnt vmcnt(0) 1372; GFX1150-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] 1373; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) 1374; GFX1150-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1375; GFX1150-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1376; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1377; GFX1150-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1378; GFX1150-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1379; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 1380; GFX1150-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1381; GFX1150-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] 1382; GFX1150-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] 1383; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
1384; GFX1150-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] 1385; GFX1150-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] 1386; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1387; GFX1150-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] 1388; GFX1150-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1389; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) 1390; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1391; GFX1150-NEXT: global_store_b64 v12, v[0:1], s[0:1] 1392; GFX1150-NEXT: s_endpgm 1393 ptr addrspace(1) %in2) #0 { 1394 %r0 = load double, ptr addrspace(1) %in1, align 8 1395 %r1 = load double, ptr addrspace(1) %in2, align 8 1396 %r2 = frem double %r0, %r1 1397 store double %r2, ptr addrspace(1) %out, align 8 1398 ret void 1399} 1400 1401define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, 1402; SI-LABEL: fast_frem_f64: 1403; SI: ; %bb.0: 1404; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1405; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1406; SI-NEXT: s_mov_b32 s3, 0xf000 1407; SI-NEXT: s_mov_b32 s2, -1 1408; SI-NEXT: s_waitcnt lgkmcnt(0) 1409; SI-NEXT: s_mov_b32 s0, s8 1410; SI-NEXT: s_mov_b32 s1, s9 1411; SI-NEXT: s_mov_b32 s8, s10 1412; SI-NEXT: s_mov_b32 s9, s11 1413; SI-NEXT: s_mov_b32 s10, s2 1414; SI-NEXT: s_mov_b32 s11, s3 1415; SI-NEXT: s_mov_b32 s6, s2 1416; SI-NEXT: s_mov_b32 s7, s3 1417; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1418; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 1419; SI-NEXT: s_waitcnt vmcnt(0) 1420; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1421; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1422; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1423; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1424; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1425; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1426; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1427; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1428; SI-NEXT: 
v_readfirstlane_b32 s6, v5 1429; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014 1430; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01 1431; SI-NEXT: s_mov_b32 s5, 0xfffff 1432; SI-NEXT: s_mov_b32 s4, s2 1433; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 1434; SI-NEXT: v_not_b32_e32 v6, s4 1435; SI-NEXT: v_and_b32_e32 v6, v4, v6 1436; SI-NEXT: v_not_b32_e32 v7, s5 1437; SI-NEXT: v_and_b32_e32 v5, v5, v7 1438; SI-NEXT: s_and_b32 s4, s6, 0x80000000 1439; SI-NEXT: s_cmp_lt_i32 s7, 0 1440; SI-NEXT: s_cselect_b64 vcc, -1, 0 1441; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 1442; SI-NEXT: v_mov_b32_e32 v7, s4 1443; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc 1444; SI-NEXT: s_cmp_gt_i32 s7, 51 1445; SI-NEXT: s_cselect_b64 vcc, -1, 0 1446; SI-NEXT: v_mov_b32_e32 v7, s6 1447; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc 1448; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 1449; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1450; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1451; SI-NEXT: s_endpgm 1452; 1453; CI-LABEL: fast_frem_f64: 1454; CI: ; %bb.0: 1455; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1456; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1457; CI-NEXT: s_mov_b32 s11, 0xf000 1458; CI-NEXT: s_mov_b32 s10, -1 1459; CI-NEXT: s_mov_b32 s6, s10 1460; CI-NEXT: s_waitcnt lgkmcnt(0) 1461; CI-NEXT: s_mov_b32 s8, s0 1462; CI-NEXT: s_mov_b32 s9, s1 1463; CI-NEXT: s_mov_b32 s0, s2 1464; CI-NEXT: s_mov_b32 s1, s3 1465; CI-NEXT: s_mov_b32 s2, s10 1466; CI-NEXT: s_mov_b32 s3, s11 1467; CI-NEXT: s_mov_b32 s7, s11 1468; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 1469; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 1470; CI-NEXT: s_waitcnt vmcnt(0) 1471; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1472; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1473; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1474; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1475; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1476; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1477; CI-NEXT: v_fma_f64 v[8:9], 
-v[2:3], v[6:7], v[0:1] 1478; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1479; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1480; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1481; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 1482; CI-NEXT: s_endpgm 1483; 1484; VI-LABEL: fast_frem_f64: 1485; VI: ; %bb.0: 1486; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1487; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1488; VI-NEXT: s_waitcnt lgkmcnt(0) 1489; VI-NEXT: v_mov_b32_e32 v2, s2 1490; VI-NEXT: v_mov_b32_e32 v3, s3 1491; VI-NEXT: v_mov_b32_e32 v4, s4 1492; VI-NEXT: v_mov_b32_e32 v5, s5 1493; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1494; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 1495; VI-NEXT: v_mov_b32_e32 v0, s0 1496; VI-NEXT: v_mov_b32_e32 v1, s1 1497; VI-NEXT: s_waitcnt vmcnt(0) 1498; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1499; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1500; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 1501; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1502; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 1503; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] 1504; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3] 1505; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] 1506; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 1507; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] 1508; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1509; VI-NEXT: s_endpgm 1510; 1511; GFX9-LABEL: fast_frem_f64: 1512; GFX9: ; %bb.0: 1513; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1514; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1515; GFX9-NEXT: v_mov_b32_e32 v10, 0 1516; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1517; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[2:3] 1518; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[6:7] 1519; GFX9-NEXT: s_waitcnt vmcnt(0) 1520; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1521; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1522; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1523; GFX9-NEXT: v_fma_f64 v[6:7], 
-v[2:3], v[4:5], 1.0 1524; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1525; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1526; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1527; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1528; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1529; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1530; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] 1531; GFX9-NEXT: s_endpgm 1532; 1533; GFX10-LABEL: fast_frem_f64: 1534; GFX10: ; %bb.0: 1535; GFX10-NEXT: s_clause 0x1 1536; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1537; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1538; GFX10-NEXT: v_mov_b32_e32 v10, 0 1539; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1540; GFX10-NEXT: s_clause 0x1 1541; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[2:3] 1542; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[6:7] 1543; GFX10-NEXT: s_waitcnt vmcnt(0) 1544; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1545; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1546; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1547; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1548; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1549; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1550; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1551; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1552; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1553; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1554; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] 1555; GFX10-NEXT: s_endpgm 1556; 1557; GFX11-LABEL: fast_frem_f64: 1558; GFX11: ; %bb.0: 1559; GFX11-NEXT: s_clause 0x1 1560; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1561; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1562; GFX11-NEXT: v_mov_b32_e32 v10, 0 1563; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1564; GFX11-NEXT: s_clause 0x1 1565; GFX11-NEXT: global_load_b64 v[0:1], v10, s[2:3] 1566; GFX11-NEXT: global_load_b64 v[2:3], v10, s[4:5] 1567; GFX11-NEXT: s_waitcnt vmcnt(0) 1568; 
GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1569; GFX11-NEXT: s_waitcnt_depctr 0xfff 1570; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1571; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1572; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1573; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1574; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1575; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1576; GFX11-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1577; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1578; GFX11-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1579; GFX11-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1580; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1581; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1582; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1583; GFX11-NEXT: global_store_b64 v10, v[0:1], s[0:1] 1584; GFX11-NEXT: s_endpgm 1585; 1586; GFX1150-LABEL: fast_frem_f64: 1587; GFX1150: ; %bb.0: 1588; GFX1150-NEXT: s_clause 0x1 1589; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1590; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1591; GFX1150-NEXT: v_mov_b32_e32 v10, 0 1592; GFX1150-NEXT: s_waitcnt lgkmcnt(0) 1593; GFX1150-NEXT: s_clause 0x1 1594; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[2:3] 1595; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[4:5] 1596; GFX1150-NEXT: s_waitcnt vmcnt(0) 1597; GFX1150-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1598; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1599; GFX1150-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1600; GFX1150-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1601; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1602; GFX1150-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1603; GFX1150-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1604; GFX1150-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1605; GFX1150-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1606; GFX1150-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1607; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1608; GFX1150-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1609; GFX1150-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1610; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) 1611; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1612; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[0:1] 1613; GFX1150-NEXT: s_endpgm 1614 ptr addrspace(1) %in2) #0 { 1615 %r0 = load double, ptr addrspace(1) %in1, align 8 1616 %r1 = load double, ptr addrspace(1) %in2, align 8 1617 %r2 = frem fast double %r0, %r1 1618 store double %r2, ptr addrspace(1) %out, align 8 1619 ret void 1620} 1621 1622define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, 1623; SI-LABEL: unsafe_frem_f64: 1624; SI: ; %bb.0: 1625; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1626; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1627; SI-NEXT: s_mov_b32 s3, 0xf000 1628; SI-NEXT: s_mov_b32 s2, -1 1629; SI-NEXT: s_waitcnt lgkmcnt(0) 1630; SI-NEXT: s_mov_b32 s0, s8 1631; SI-NEXT: s_mov_b32 s1, s9 1632; SI-NEXT: s_mov_b32 s8, s10 1633; SI-NEXT: s_mov_b32 s9, s11 1634; SI-NEXT: s_mov_b32 s10, s2 1635; SI-NEXT: s_mov_b32 s11, s3 1636; SI-NEXT: s_mov_b32 s6, s2 1637; SI-NEXT: s_mov_b32 s7, s3 1638; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1639; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 1640; SI-NEXT: s_waitcnt vmcnt(0) 1641; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1642; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1643; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1644; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1645; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1646; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1647; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1648; SI-NEXT: 
v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1649; SI-NEXT: v_readfirstlane_b32 s6, v5 1650; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014 1651; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01 1652; SI-NEXT: s_mov_b32 s5, 0xfffff 1653; SI-NEXT: s_mov_b32 s4, s2 1654; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 1655; SI-NEXT: v_not_b32_e32 v6, s4 1656; SI-NEXT: v_and_b32_e32 v6, v4, v6 1657; SI-NEXT: v_not_b32_e32 v7, s5 1658; SI-NEXT: v_and_b32_e32 v5, v5, v7 1659; SI-NEXT: s_and_b32 s4, s6, 0x80000000 1660; SI-NEXT: s_cmp_lt_i32 s7, 0 1661; SI-NEXT: s_cselect_b64 vcc, -1, 0 1662; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 1663; SI-NEXT: v_mov_b32_e32 v7, s4 1664; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc 1665; SI-NEXT: s_cmp_gt_i32 s7, 51 1666; SI-NEXT: s_cselect_b64 vcc, -1, 0 1667; SI-NEXT: v_mov_b32_e32 v7, s6 1668; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc 1669; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc 1670; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1671; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1672; SI-NEXT: s_endpgm 1673; 1674; CI-LABEL: unsafe_frem_f64: 1675; CI: ; %bb.0: 1676; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1677; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1678; CI-NEXT: s_mov_b32 s11, 0xf000 1679; CI-NEXT: s_mov_b32 s10, -1 1680; CI-NEXT: s_mov_b32 s6, s10 1681; CI-NEXT: s_waitcnt lgkmcnt(0) 1682; CI-NEXT: s_mov_b32 s8, s0 1683; CI-NEXT: s_mov_b32 s9, s1 1684; CI-NEXT: s_mov_b32 s0, s2 1685; CI-NEXT: s_mov_b32 s1, s3 1686; CI-NEXT: s_mov_b32 s2, s10 1687; CI-NEXT: s_mov_b32 s3, s11 1688; CI-NEXT: s_mov_b32 s7, s11 1689; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 1690; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 1691; CI-NEXT: s_waitcnt vmcnt(0) 1692; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1693; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1694; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1695; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1696; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1697; CI-NEXT: v_mul_f64 
v[6:7], v[0:1], v[4:5] 1698; CI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1699; CI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1700; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1701; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1702; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 1703; CI-NEXT: s_endpgm 1704; 1705; VI-LABEL: unsafe_frem_f64: 1706; VI: ; %bb.0: 1707; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1708; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1709; VI-NEXT: s_waitcnt lgkmcnt(0) 1710; VI-NEXT: v_mov_b32_e32 v2, s2 1711; VI-NEXT: v_mov_b32_e32 v3, s3 1712; VI-NEXT: v_mov_b32_e32 v4, s4 1713; VI-NEXT: v_mov_b32_e32 v5, s5 1714; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1715; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 1716; VI-NEXT: v_mov_b32_e32 v0, s0 1717; VI-NEXT: v_mov_b32_e32 v1, s1 1718; VI-NEXT: s_waitcnt vmcnt(0) 1719; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1720; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1721; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 1722; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1723; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] 1724; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] 1725; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3] 1726; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] 1727; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] 1728; VI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[4:5], v[2:3] 1729; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1730; VI-NEXT: s_endpgm 1731; 1732; GFX9-LABEL: unsafe_frem_f64: 1733; GFX9: ; %bb.0: 1734; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1735; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1736; GFX9-NEXT: v_mov_b32_e32 v10, 0 1737; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1738; GFX9-NEXT: global_load_dwordx2 v[0:1], v10, s[2:3] 1739; GFX9-NEXT: global_load_dwordx2 v[2:3], v10, s[6:7] 1740; GFX9-NEXT: s_waitcnt vmcnt(0) 1741; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1742; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1743; GFX9-NEXT: v_fma_f64 
v[4:5], v[6:7], v[4:5], v[4:5] 1744; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1745; GFX9-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1746; GFX9-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1747; GFX9-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1748; GFX9-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1749; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1750; GFX9-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1751; GFX9-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] 1752; GFX9-NEXT: s_endpgm 1753; 1754; GFX10-LABEL: unsafe_frem_f64: 1755; GFX10: ; %bb.0: 1756; GFX10-NEXT: s_clause 0x1 1757; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1758; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1759; GFX10-NEXT: v_mov_b32_e32 v10, 0 1760; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1761; GFX10-NEXT: s_clause 0x1 1762; GFX10-NEXT: global_load_dwordx2 v[0:1], v10, s[2:3] 1763; GFX10-NEXT: global_load_dwordx2 v[2:3], v10, s[6:7] 1764; GFX10-NEXT: s_waitcnt vmcnt(0) 1765; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1766; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1767; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1768; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1769; GFX10-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1770; GFX10-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1771; GFX10-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1772; GFX10-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1773; GFX10-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1774; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1775; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] 1776; GFX10-NEXT: s_endpgm 1777; 1778; GFX11-LABEL: unsafe_frem_f64: 1779; GFX11: ; %bb.0: 1780; GFX11-NEXT: s_clause 0x1 1781; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1782; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1783; GFX11-NEXT: v_mov_b32_e32 v10, 0 1784; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1785; GFX11-NEXT: s_clause 0x1 1786; GFX11-NEXT: global_load_b64 v[0:1], v10, s[2:3] 1787; GFX11-NEXT: 
global_load_b64 v[2:3], v10, s[4:5] 1788; GFX11-NEXT: s_waitcnt vmcnt(0) 1789; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1790; GFX11-NEXT: s_waitcnt_depctr 0xfff 1791; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1792; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1793; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1794; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1795; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1796; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1797; GFX11-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1798; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1799; GFX11-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1800; GFX11-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1801; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1802; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1803; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1804; GFX11-NEXT: global_store_b64 v10, v[0:1], s[0:1] 1805; GFX11-NEXT: s_endpgm 1806; 1807; GFX1150-LABEL: unsafe_frem_f64: 1808; GFX1150: ; %bb.0: 1809; GFX1150-NEXT: s_clause 0x1 1810; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1811; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1812; GFX1150-NEXT: v_mov_b32_e32 v10, 0 1813; GFX1150-NEXT: s_waitcnt lgkmcnt(0) 1814; GFX1150-NEXT: s_clause 0x1 1815; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[2:3] 1816; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[4:5] 1817; GFX1150-NEXT: s_waitcnt vmcnt(0) 1818; GFX1150-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1819; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1820; GFX1150-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1821; GFX1150-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1822; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1823; GFX1150-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 
1824; GFX1150-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] 1825; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1826; GFX1150-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] 1827; GFX1150-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] 1828; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1829; GFX1150-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] 1830; GFX1150-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1831; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) 1832; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] 1833; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[0:1] 1834; GFX1150-NEXT: s_endpgm 1835 ptr addrspace(1) %in2) #1 { 1836 %r0 = load double, ptr addrspace(1) %in1, align 8 1837 %r1 = load double, ptr addrspace(1) %in2, align 8 1838 %r2 = frem afn double %r0, %r1 1839 store double %r2, ptr addrspace(1) %out, align 8 1840 ret void 1841} 1842 1843define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, 1844; SI-LABEL: frem_v2f16: 1845; SI: ; %bb.0: 1846; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1847; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1848; SI-NEXT: s_mov_b32 s3, 0xf000 1849; SI-NEXT: s_mov_b32 s2, -1 1850; SI-NEXT: s_waitcnt lgkmcnt(0) 1851; SI-NEXT: s_mov_b32 s0, s8 1852; SI-NEXT: s_mov_b32 s1, s9 1853; SI-NEXT: s_mov_b32 s8, s10 1854; SI-NEXT: s_mov_b32 s9, s11 1855; SI-NEXT: s_mov_b32 s10, s2 1856; SI-NEXT: s_mov_b32 s11, s3 1857; SI-NEXT: s_mov_b32 s6, s2 1858; SI-NEXT: s_mov_b32 s7, s3 1859; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1860; SI-NEXT: s_waitcnt vmcnt(0) 1861; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 1862; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1863; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1864; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 1865; SI-NEXT: s_waitcnt vmcnt(0) 1866; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 1867; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1868; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1869; SI-NEXT: v_div_scale_f32 v4, vcc, 
v0, v2, v0 1870; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 1871; SI-NEXT: v_rcp_f32_e32 v6, v5 1872; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1873; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 1874; SI-NEXT: v_fma_f32 v6, v7, v6, v6 1875; SI-NEXT: v_mul_f32_e32 v7, v4, v6 1876; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 1877; SI-NEXT: v_fma_f32 v7, v8, v6, v7 1878; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 1879; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1880; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 1881; SI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 1882; SI-NEXT: v_trunc_f32_e32 v4, v4 1883; SI-NEXT: v_fma_f32 v0, -v4, v2, v0 1884; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1885; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1886; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1887; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 1888; SI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 1889; SI-NEXT: v_rcp_f32_e32 v5, v4 1890; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1891; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 1892; SI-NEXT: v_fma_f32 v5, v6, v5, v5 1893; SI-NEXT: v_mul_f32_e32 v6, v2, v5 1894; SI-NEXT: v_fma_f32 v7, -v4, v6, v2 1895; SI-NEXT: v_fma_f32 v6, v7, v5, v6 1896; SI-NEXT: v_fma_f32 v2, -v4, v6, v2 1897; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1898; SI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 1899; SI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 1900; SI-NEXT: v_trunc_f32_e32 v2, v2 1901; SI-NEXT: v_fma_f32 v1, -v2, v3, v1 1902; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1903; SI-NEXT: v_or_b32_e32 v0, v1, v0 1904; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1905; SI-NEXT: s_endpgm 1906; 1907; CI-LABEL: frem_v2f16: 1908; CI: ; %bb.0: 1909; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1910; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1911; CI-NEXT: s_mov_b32 s3, 0xf000 1912; CI-NEXT: s_mov_b32 s2, -1 1913; CI-NEXT: s_mov_b32 s6, s2 1914; CI-NEXT: s_waitcnt lgkmcnt(0) 1915; CI-NEXT: s_mov_b32 s0, s8 1916; CI-NEXT: s_mov_b32 s1, s9 1917; CI-NEXT: s_mov_b32 
s8, s10 1918; CI-NEXT: s_mov_b32 s9, s11 1919; CI-NEXT: s_mov_b32 s10, s2 1920; CI-NEXT: s_mov_b32 s11, s3 1921; CI-NEXT: s_mov_b32 s7, s3 1922; CI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1923; CI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16 1924; CI-NEXT: s_waitcnt vmcnt(1) 1925; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 1926; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1927; CI-NEXT: s_waitcnt vmcnt(0) 1928; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 1929; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1930; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 1931; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 1932; CI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 1933; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 1934; CI-NEXT: v_rcp_f32_e32 v6, v5 1935; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1936; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 1937; CI-NEXT: v_fma_f32 v6, v7, v6, v6 1938; CI-NEXT: v_mul_f32_e32 v7, v4, v6 1939; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 1940; CI-NEXT: v_fma_f32 v7, v8, v6, v7 1941; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 1942; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1943; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 1944; CI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 1945; CI-NEXT: v_trunc_f32_e32 v4, v4 1946; CI-NEXT: v_fma_f32 v0, -v4, v2, v0 1947; CI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 1948; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 1949; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1950; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 1951; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1952; CI-NEXT: v_rcp_f32_e32 v5, v4 1953; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1954; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 1955; CI-NEXT: v_fma_f32 v5, v6, v5, v5 1956; CI-NEXT: v_mul_f32_e32 v6, v2, v5 1957; CI-NEXT: v_fma_f32 v7, -v4, v6, v2 1958; CI-NEXT: v_fma_f32 v6, v7, v5, v6 1959; CI-NEXT: v_fma_f32 v2, -v4, v6, v2 1960; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1961; CI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 1962; CI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 
1963; CI-NEXT: v_trunc_f32_e32 v2, v2 1964; CI-NEXT: v_fma_f32 v1, -v2, v3, v1 1965; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 1966; CI-NEXT: v_or_b32_e32 v0, v1, v0 1967; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1968; CI-NEXT: s_endpgm 1969; 1970; VI-LABEL: frem_v2f16: 1971; VI: ; %bb.0: 1972; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1973; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1974; VI-NEXT: s_waitcnt lgkmcnt(0) 1975; VI-NEXT: v_mov_b32_e32 v0, s0 1976; VI-NEXT: s_add_u32 s0, s4, 16 1977; VI-NEXT: v_mov_b32_e32 v1, s1 1978; VI-NEXT: v_mov_b32_e32 v2, s2 1979; VI-NEXT: v_mov_b32_e32 v3, s3 1980; VI-NEXT: s_addc_u32 s1, s5, 0 1981; VI-NEXT: flat_load_dword v4, v[2:3] 1982; VI-NEXT: v_mov_b32_e32 v3, s1 1983; VI-NEXT: v_mov_b32_e32 v2, s0 1984; VI-NEXT: flat_load_dword v2, v[2:3] 1985; VI-NEXT: s_waitcnt vmcnt(1) 1986; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 1987; VI-NEXT: v_cvt_f32_f16_e32 v5, v3 1988; VI-NEXT: s_waitcnt vmcnt(0) 1989; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 1990; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 1991; VI-NEXT: v_rcp_f32_e32 v8, v7 1992; VI-NEXT: v_mul_f32_e32 v9, v5, v8 1993; VI-NEXT: v_mad_f32 v10, -v7, v9, v5 1994; VI-NEXT: v_mac_f32_e32 v9, v10, v8 1995; VI-NEXT: v_mad_f32 v5, -v7, v9, v5 1996; VI-NEXT: v_mul_f32_e32 v5, v5, v8 1997; VI-NEXT: v_and_b32_e32 v5, 0xff800000, v5 1998; VI-NEXT: v_add_f32_e32 v5, v5, v9 1999; VI-NEXT: v_cvt_f16_f32_e32 v5, v5 2000; VI-NEXT: v_div_fixup_f16 v5, v5, v6, v3 2001; VI-NEXT: v_trunc_f16_e32 v5, v5 2002; VI-NEXT: v_fma_f16 v3, -v5, v6, v3 2003; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 2004; VI-NEXT: v_cvt_f32_f16_e32 v5, v4 2005; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2006; VI-NEXT: v_rcp_f32_e32 v7, v6 2007; VI-NEXT: v_mul_f32_e32 v8, v5, v7 2008; VI-NEXT: v_mad_f32 v9, -v6, v8, v5 2009; VI-NEXT: v_mac_f32_e32 v8, v9, v7 2010; VI-NEXT: v_mad_f32 v5, -v6, v8, v5 2011; VI-NEXT: v_mul_f32_e32 v5, v5, v7 2012; VI-NEXT: v_and_b32_e32 v5, 0xff800000, v5 2013; VI-NEXT: v_add_f32_e32 v5, v5, v8 2014; VI-NEXT: 
v_cvt_f16_f32_e32 v5, v5 2015; VI-NEXT: v_div_fixup_f16 v5, v5, v2, v4 2016; VI-NEXT: v_trunc_f16_e32 v5, v5 2017; VI-NEXT: v_fma_f16 v2, -v5, v2, v4 2018; VI-NEXT: v_or_b32_e32 v2, v2, v3 2019; VI-NEXT: flat_store_dword v[0:1], v2 2020; VI-NEXT: s_endpgm 2021; 2022; GFX9-LABEL: frem_v2f16: 2023; GFX9: ; %bb.0: 2024; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2025; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2026; GFX9-NEXT: v_mov_b32_e32 v0, 0 2027; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2028; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 2029; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:16 2030; GFX9-NEXT: s_waitcnt vmcnt(1) 2031; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 2032; GFX9-NEXT: s_waitcnt vmcnt(0) 2033; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 2034; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 2035; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 2036; GFX9-NEXT: v_rcp_f32_e32 v4, v4 2037; GFX9-NEXT: v_rcp_f32_e32 v7, v7 2038; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 2039; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] 2040; GFX9-NEXT: v_mac_f32_e32 v3, v5, v4 2041; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] 2042; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 2043; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v4 2044; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 2045; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 2046; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 2047; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 2048; GFX9-NEXT: v_mul_f32_e32 v5, v5, v7 2049; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 2050; GFX9-NEXT: v_mad_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2051; GFX9-NEXT: v_trunc_f16_e32 v3, v3 2052; GFX9-NEXT: v_mac_f32_e32 v5, v8, v7 2053; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v1 2054; GFX9-NEXT: v_mad_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2055; GFX9-NEXT: v_mul_f32_e32 v1, v1, v7 2056; GFX9-NEXT: v_and_b32_e32 v1, 0xff800000, v1 2057; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 2058; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 2059; GFX9-NEXT: v_div_fixup_f16 v1, v1, v6, 
v4 2060; GFX9-NEXT: v_trunc_f16_e32 v1, v1 2061; GFX9-NEXT: v_fma_f16 v1, -v1, v6, v4 2062; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1 2063; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 2064; GFX9-NEXT: s_endpgm 2065; 2066; GFX10-LABEL: frem_v2f16: 2067; GFX10: ; %bb.0: 2068; GFX10-NEXT: s_clause 0x1 2069; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2070; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2071; GFX10-NEXT: v_mov_b32_e32 v0, 0 2072; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2073; GFX10-NEXT: s_clause 0x1 2074; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 2075; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:16 2076; GFX10-NEXT: s_waitcnt vmcnt(1) 2077; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 2078; GFX10-NEXT: s_waitcnt vmcnt(0) 2079; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 2080; GFX10-NEXT: v_rcp_f32_e32 v5, v4 2081; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5 2082; GFX10-NEXT: v_mad_f32 v7, -v4, v6, v3 2083; GFX10-NEXT: v_mac_f32_e32 v6, v7, v5 2084; GFX10-NEXT: v_mad_f32 v3, -v4, v6, v3 2085; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 2086; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3 2087; GFX10-NEXT: v_add_f32_e32 v3, v3, v6 2088; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 2089; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 2090; GFX10-NEXT: v_trunc_f16_e32 v3, v3 2091; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v1 2092; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 2093; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2094; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 2095; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 2096; GFX10-NEXT: v_rcp_f32_e32 v6, v5 2097; GFX10-NEXT: v_mul_f32_e32 v7, v4, v6 2098; GFX10-NEXT: v_mad_f32 v8, -v5, v7, v4 2099; GFX10-NEXT: v_mac_f32_e32 v7, v8, v6 2100; GFX10-NEXT: v_mad_f32 v4, -v5, v7, v4 2101; GFX10-NEXT: v_mul_f32_e32 v4, v4, v6 2102; GFX10-NEXT: v_and_b32_e32 v4, 0xff800000, v4 2103; GFX10-NEXT: v_add_f32_e32 v4, v4, v7 2104; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 2105; GFX10-NEXT: v_div_fixup_f16 v4, v4, v2, v1 2106; GFX10-NEXT: v_trunc_f16_e32 v4, v4 2107; GFX10-NEXT: 
v_fma_f16 v1, -v4, v2, v1 2108; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 2109; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 2110; GFX10-NEXT: s_endpgm 2111; 2112; GFX11-LABEL: frem_v2f16: 2113; GFX11: ; %bb.0: 2114; GFX11-NEXT: s_clause 0x1 2115; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2116; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2117; GFX11-NEXT: v_mov_b32_e32 v0, 0 2118; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2119; GFX11-NEXT: s_clause 0x1 2120; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2121; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 2122; GFX11-NEXT: s_waitcnt vmcnt(1) 2123; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 2124; GFX11-NEXT: s_waitcnt vmcnt(0) 2125; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 2126; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 2127; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 2128; GFX11-NEXT: v_rcp_f32_e32 v4, v4 2129; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v6 2130; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 2131; GFX11-NEXT: v_rcp_f32_e32 v7, v7 2132; GFX11-NEXT: s_waitcnt_depctr 0xfff 2133; GFX11-NEXT: v_mul_f32_e32 v3, v3, v4 2134; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] 2135; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2136; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v4 2137; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] 2138; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2139; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4 2140; GFX11-NEXT: v_and_b32_e32 v4, 0xff800000, v4 2141; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2142; GFX11-NEXT: v_add_f32_e32 v3, v4, v3 2143; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 2144; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 2145; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2146; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v4 2147; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, 
v1 2148; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2149; GFX11-NEXT: v_mul_f32_e32 v5, v5, v7 2150; GFX11-NEXT: v_trunc_f16_e32 v3, v3 2151; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2152; GFX11-NEXT: v_fma_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2153; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v1 2154; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 2155; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v7 2156; GFX11-NEXT: v_fma_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2157; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2158; GFX11-NEXT: v_mul_f32_e32 v1, v1, v7 2159; GFX11-NEXT: v_and_b32_e32 v1, 0xff800000, v1 2160; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2161; GFX11-NEXT: v_add_f32_e32 v1, v1, v5 2162; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 2163; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2164; GFX11-NEXT: v_div_fixup_f16 v1, v1, v6, v4 2165; GFX11-NEXT: v_trunc_f16_e32 v1, v1 2166; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2167; GFX11-NEXT: v_fma_f16 v1, -v1, v6, v4 2168; GFX11-NEXT: v_pack_b32_f16 v1, v3, v1 2169; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 2170; GFX11-NEXT: s_endpgm 2171; 2172; GFX1150-LABEL: frem_v2f16: 2173; GFX1150: ; %bb.0: 2174; GFX1150-NEXT: s_clause 0x1 2175; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2176; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2177; GFX1150-NEXT: v_mov_b32_e32 v0, 0 2178; GFX1150-NEXT: s_waitcnt lgkmcnt(0) 2179; GFX1150-NEXT: s_clause 0x1 2180; GFX1150-NEXT: global_load_b32 v1, v0, s[2:3] 2181; GFX1150-NEXT: global_load_b32 v2, v0, s[4:5] offset:16 2182; GFX1150-NEXT: s_waitcnt vmcnt(1) 2183; GFX1150-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2184; GFX1150-NEXT: s_waitcnt vmcnt(0) 2185; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v2 
2186; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2187; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v3 2188; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5 2189; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) 2190; GFX1150-NEXT: v_rcp_f32_e32 v6, v6 2191; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v6 2192; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2193; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2194; GFX1150-NEXT: v_fmac_f32_e32 v4, v7, v6 2195; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2196; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2197; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6 2198; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2199; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6 2200; GFX1150-NEXT: v_add_f32_e32 v4, v6, v4 2201; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2202; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4 2203; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v5, v3 2204; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2205; GFX1150-NEXT: v_trunc_f16_e32 v4, v4 2206; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4 2207; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 2208; GFX1150-NEXT: v_fmac_f16_e32 v3, v4, v5 2209; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2 2210; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v1 2211; GFX1150-NEXT: v_rcp_f32_e32 v5, v5 2212; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2213; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v5 2214; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] 2215; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2216; GFX1150-NEXT: v_fmac_f32_e32 v4, v6, v5 2217; GFX1150-NEXT: 
v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1] 2218; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2219; GFX1150-NEXT: v_mul_f32_e32 v5, v6, v5 2220; GFX1150-NEXT: v_and_b32_e32 v5, 0xff800000, v5 2221; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2222; GFX1150-NEXT: v_add_f32_e32 v4, v5, v4 2223; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4 2224; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2225; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v2, v1 2226; GFX1150-NEXT: v_trunc_f16_e32 v4, v4 2227; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2228; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4 2229; GFX1150-NEXT: v_fmac_f16_e32 v1, v4, v2 2230; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) 2231; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v3 2232; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1] 2233; GFX1150-NEXT: s_endpgm 2234 ptr addrspace(1) %in2) #0 { 2235 %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4 2236 %r0 = load <2 x half>, ptr addrspace(1) %in1, align 8 2237 %r1 = load <2 x half>, ptr addrspace(1) %gep2, align 8 2238 %r2 = frem <2 x half> %r0, %r1 2239 store <2 x half> %r2, ptr addrspace(1) %out, align 8 2240 ret void 2241} 2242 2243define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, 2244; SI-LABEL: frem_v4f16: 2245; SI: ; %bb.0: 2246; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 2247; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 2248; SI-NEXT: s_mov_b32 s3, 0xf000 2249; SI-NEXT: s_mov_b32 s2, -1 2250; SI-NEXT: s_waitcnt lgkmcnt(0) 2251; SI-NEXT: s_mov_b32 s0, s8 2252; SI-NEXT: s_mov_b32 s1, s9 2253; SI-NEXT: s_mov_b32 s8, s10 2254; SI-NEXT: s_mov_b32 s9, s11 2255; SI-NEXT: s_mov_b32 s10, s2 2256; SI-NEXT: s_mov_b32 s11, s3 2257; SI-NEXT: s_mov_b32 s6, s2 2258; SI-NEXT: s_mov_b32 s7, s3 2259; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 2260; SI-NEXT: s_waitcnt vmcnt(0) 
2261; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 2262; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2263; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 2264; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 2265; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 2266; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 2267; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 2268; SI-NEXT: s_waitcnt vmcnt(0) 2269; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 2270; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2271; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 2272; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 2273; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2274; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 2275; SI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 2276; SI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 2277; SI-NEXT: v_rcp_f32_e32 v10, v9 2278; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2279; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 2280; SI-NEXT: v_fma_f32 v10, v11, v10, v10 2281; SI-NEXT: v_mul_f32_e32 v11, v8, v10 2282; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 2283; SI-NEXT: v_fma_f32 v11, v12, v10, v11 2284; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 2285; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2286; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 2287; SI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 2288; SI-NEXT: v_trunc_f32_e32 v8, v8 2289; SI-NEXT: v_fma_f32 v1, -v8, v1, v5 2290; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 2291; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 2292; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2293; SI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 2294; SI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 2295; SI-NEXT: v_rcp_f32_e32 v9, v8 2296; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2297; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 2298; SI-NEXT: v_fma_f32 v9, v10, v9, v9 2299; SI-NEXT: v_mul_f32_e32 v10, v5, v9 2300; SI-NEXT: v_fma_f32 v11, -v8, v10, v5 2301; SI-NEXT: v_fma_f32 v10, v11, v9, v10 2302; SI-NEXT: v_fma_f32 v5, -v8, v10, v5 2303; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2304; SI-NEXT: v_div_fmas_f32 v5, v5, 
v9, v10 2305; SI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 2306; SI-NEXT: v_trunc_f32_e32 v5, v5 2307; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 2308; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 2309; SI-NEXT: v_or_b32_e32 v1, v4, v1 2310; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 2311; SI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 2312; SI-NEXT: v_rcp_f32_e32 v7, v5 2313; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2314; SI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 2315; SI-NEXT: v_fma_f32 v7, v8, v7, v7 2316; SI-NEXT: v_mul_f32_e32 v8, v4, v7 2317; SI-NEXT: v_fma_f32 v9, -v5, v8, v4 2318; SI-NEXT: v_fma_f32 v8, v9, v7, v8 2319; SI-NEXT: v_fma_f32 v4, -v5, v8, v4 2320; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2321; SI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 2322; SI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 2323; SI-NEXT: v_trunc_f32_e32 v4, v4 2324; SI-NEXT: v_fma_f32 v0, -v4, v0, v3 2325; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 2326; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2327; SI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 2328; SI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 2329; SI-NEXT: v_rcp_f32_e32 v5, v4 2330; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2331; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 2332; SI-NEXT: v_fma_f32 v5, v7, v5, v5 2333; SI-NEXT: v_mul_f32_e32 v7, v3, v5 2334; SI-NEXT: v_fma_f32 v8, -v4, v7, v3 2335; SI-NEXT: v_fma_f32 v7, v8, v5, v7 2336; SI-NEXT: v_fma_f32 v3, -v4, v7, v3 2337; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2338; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 2339; SI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 2340; SI-NEXT: v_trunc_f32_e32 v3, v3 2341; SI-NEXT: v_fma_f32 v2, -v3, v6, v2 2342; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 2343; SI-NEXT: v_or_b32_e32 v0, v2, v0 2344; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2345; SI-NEXT: s_endpgm 2346; 2347; CI-LABEL: frem_v4f16: 2348; CI: ; %bb.0: 2349; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 2350; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 2351; CI-NEXT: s_mov_b32 s3, 
0xf000 2352; CI-NEXT: s_mov_b32 s2, -1 2353; CI-NEXT: s_mov_b32 s6, s2 2354; CI-NEXT: s_waitcnt lgkmcnt(0) 2355; CI-NEXT: s_mov_b32 s0, s8 2356; CI-NEXT: s_mov_b32 s1, s9 2357; CI-NEXT: s_mov_b32 s8, s10 2358; CI-NEXT: s_mov_b32 s9, s11 2359; CI-NEXT: s_mov_b32 s10, s2 2360; CI-NEXT: s_mov_b32 s11, s3 2361; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 2362; CI-NEXT: s_mov_b32 s7, s3 2363; CI-NEXT: s_waitcnt vmcnt(0) 2364; CI-NEXT: v_cvt_f32_f16_e32 v2, v0 2365; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2366; CI-NEXT: v_cvt_f32_f16_e32 v3, v0 2367; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 2368; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 2369; CI-NEXT: v_cvt_f32_f16_e32 v5, v0 2370; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 2371; CI-NEXT: s_waitcnt vmcnt(0) 2372; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 2373; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2374; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 2375; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 2376; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2377; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 2378; CI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 2379; CI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 2380; CI-NEXT: v_rcp_f32_e32 v10, v9 2381; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2382; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 2383; CI-NEXT: v_fma_f32 v10, v11, v10, v10 2384; CI-NEXT: v_mul_f32_e32 v11, v8, v10 2385; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 2386; CI-NEXT: v_fma_f32 v11, v12, v10, v11 2387; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 2388; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2389; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 2390; CI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 2391; CI-NEXT: v_trunc_f32_e32 v8, v8 2392; CI-NEXT: v_fma_f32 v1, -v8, v1, v5 2393; CI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 2394; CI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 2395; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 2396; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 2397; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2398; CI-NEXT: 
v_rcp_f32_e32 v9, v8 2399; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2400; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 2401; CI-NEXT: v_fma_f32 v9, v10, v9, v9 2402; CI-NEXT: v_mul_f32_e32 v10, v5, v9 2403; CI-NEXT: v_fma_f32 v11, -v8, v10, v5 2404; CI-NEXT: v_fma_f32 v10, v11, v9, v10 2405; CI-NEXT: v_fma_f32 v5, -v8, v10, v5 2406; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2407; CI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 2408; CI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 2409; CI-NEXT: v_trunc_f32_e32 v5, v5 2410; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 2411; CI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 2412; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 2413; CI-NEXT: v_or_b32_e32 v1, v4, v1 2414; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 2415; CI-NEXT: v_rcp_f32_e32 v7, v5 2416; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2417; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 2418; CI-NEXT: v_fma_f32 v7, v8, v7, v7 2419; CI-NEXT: v_mul_f32_e32 v8, v4, v7 2420; CI-NEXT: v_fma_f32 v9, -v5, v8, v4 2421; CI-NEXT: v_fma_f32 v8, v9, v7, v8 2422; CI-NEXT: v_fma_f32 v4, -v5, v8, v4 2423; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2424; CI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 2425; CI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 2426; CI-NEXT: v_trunc_f32_e32 v4, v4 2427; CI-NEXT: v_fma_f32 v0, -v4, v0, v3 2428; CI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 2429; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 2430; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2431; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2432; CI-NEXT: v_rcp_f32_e32 v5, v4 2433; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2434; CI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 2435; CI-NEXT: v_fma_f32 v5, v7, v5, v5 2436; CI-NEXT: v_mul_f32_e32 v7, v3, v5 2437; CI-NEXT: v_fma_f32 v8, -v4, v7, v3 2438; CI-NEXT: v_fma_f32 v7, v8, v5, v7 2439; CI-NEXT: v_fma_f32 v3, -v4, v7, v3 2440; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2441; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 2442; CI-NEXT: v_div_fixup_f32 v3, 
v3, v6, v2 2443; CI-NEXT: v_trunc_f32_e32 v3, v3 2444; CI-NEXT: v_fma_f32 v2, -v3, v6, v2 2445; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 2446; CI-NEXT: v_or_b32_e32 v0, v2, v0 2447; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2448; CI-NEXT: s_endpgm 2449; 2450; VI-LABEL: frem_v4f16: 2451; VI: ; %bb.0: 2452; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2453; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 2454; VI-NEXT: s_waitcnt lgkmcnt(0) 2455; VI-NEXT: v_mov_b32_e32 v0, s0 2456; VI-NEXT: s_add_u32 s0, s4, 32 2457; VI-NEXT: v_mov_b32_e32 v1, s1 2458; VI-NEXT: s_addc_u32 s1, s5, 0 2459; VI-NEXT: v_mov_b32_e32 v5, s1 2460; VI-NEXT: v_mov_b32_e32 v4, s0 2461; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 2462; VI-NEXT: v_mov_b32_e32 v2, s2 2463; VI-NEXT: v_mov_b32_e32 v3, s3 2464; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 2465; VI-NEXT: s_waitcnt vmcnt(1) 2466; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 2467; VI-NEXT: v_cvt_f32_f16_e32 v9, v8 2468; VI-NEXT: s_waitcnt vmcnt(0) 2469; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 2470; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 2471; VI-NEXT: v_rcp_f32_e32 v10, v9 2472; VI-NEXT: v_mul_f32_e32 v11, v7, v10 2473; VI-NEXT: v_mad_f32 v12, -v9, v11, v7 2474; VI-NEXT: v_mac_f32_e32 v11, v12, v10 2475; VI-NEXT: v_mad_f32 v7, -v9, v11, v7 2476; VI-NEXT: v_mul_f32_e32 v7, v7, v10 2477; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v7 2478; VI-NEXT: v_add_f32_e32 v7, v7, v11 2479; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 2480; VI-NEXT: v_div_fixup_f16 v7, v7, v8, v6 2481; VI-NEXT: v_trunc_f16_e32 v7, v7 2482; VI-NEXT: v_fma_f16 v6, -v7, v8, v6 2483; VI-NEXT: v_cvt_f32_f16_e32 v8, v5 2484; VI-NEXT: v_cvt_f32_f16_e32 v7, v3 2485; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 2486; VI-NEXT: v_rcp_f32_e32 v9, v8 2487; VI-NEXT: v_mul_f32_e32 v10, v7, v9 2488; VI-NEXT: v_mad_f32 v11, -v8, v10, v7 2489; VI-NEXT: v_mac_f32_e32 v10, v11, v9 2490; VI-NEXT: v_mad_f32 v7, -v8, v10, v7 2491; VI-NEXT: v_mul_f32_e32 v7, v7, v9 2492; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v7 2493; VI-NEXT: 
v_add_f32_e32 v7, v7, v10 2494; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 2495; VI-NEXT: v_div_fixup_f16 v7, v7, v5, v3 2496; VI-NEXT: v_trunc_f16_e32 v7, v7 2497; VI-NEXT: v_fma_f16 v3, -v7, v5, v3 2498; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 2499; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 2500; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 2501; VI-NEXT: v_or_b32_e32 v3, v3, v6 2502; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 2503; VI-NEXT: v_rcp_f32_e32 v9, v8 2504; VI-NEXT: v_mul_f32_e32 v10, v6, v9 2505; VI-NEXT: v_mad_f32 v11, -v8, v10, v6 2506; VI-NEXT: v_mac_f32_e32 v10, v11, v9 2507; VI-NEXT: v_mad_f32 v6, -v8, v10, v6 2508; VI-NEXT: v_mul_f32_e32 v6, v6, v9 2509; VI-NEXT: v_and_b32_e32 v6, 0xff800000, v6 2510; VI-NEXT: v_add_f32_e32 v6, v6, v10 2511; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 2512; VI-NEXT: v_div_fixup_f16 v6, v6, v7, v5 2513; VI-NEXT: v_trunc_f16_e32 v6, v6 2514; VI-NEXT: v_fma_f16 v5, -v6, v7, v5 2515; VI-NEXT: v_cvt_f32_f16_e32 v7, v4 2516; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 2517; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 2518; VI-NEXT: v_rcp_f32_e32 v8, v7 2519; VI-NEXT: v_mul_f32_e32 v9, v6, v8 2520; VI-NEXT: v_mad_f32 v10, -v7, v9, v6 2521; VI-NEXT: v_mac_f32_e32 v9, v10, v8 2522; VI-NEXT: v_mad_f32 v6, -v7, v9, v6 2523; VI-NEXT: v_mul_f32_e32 v6, v6, v8 2524; VI-NEXT: v_and_b32_e32 v6, 0xff800000, v6 2525; VI-NEXT: v_add_f32_e32 v6, v6, v9 2526; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 2527; VI-NEXT: v_div_fixup_f16 v6, v6, v4, v2 2528; VI-NEXT: v_trunc_f16_e32 v6, v6 2529; VI-NEXT: v_fma_f16 v2, -v6, v4, v2 2530; VI-NEXT: v_or_b32_e32 v2, v2, v5 2531; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 2532; VI-NEXT: s_endpgm 2533; 2534; GFX9-LABEL: frem_v4f16: 2535; GFX9: ; %bb.0: 2536; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2537; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2538; GFX9-NEXT: v_mov_b32_e32 v4, 0 2539; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2540; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] 2541; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] offset:32 2542; 
GFX9-NEXT: s_waitcnt vmcnt(1) 2543; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v1 2544; GFX9-NEXT: s_waitcnt vmcnt(0) 2545; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3 2546; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v3 2547; GFX9-NEXT: v_cvt_f32_f16_e32 v9, v8 2548; GFX9-NEXT: v_rcp_f32_e32 v6, v6 2549; GFX9-NEXT: v_rcp_f32_e32 v9, v9 2550; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6 2551; GFX9-NEXT: v_mad_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] 2552; GFX9-NEXT: v_mac_f32_e32 v5, v7, v6 2553; GFX9-NEXT: v_mad_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] 2554; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 2555; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v6 2556; GFX9-NEXT: v_add_f32_e32 v5, v6, v5 2557; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 2558; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 2559; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 2560; GFX9-NEXT: v_mul_f32_e32 v7, v7, v9 2561; GFX9-NEXT: v_div_fixup_f16 v5, v5, v3, v1 2562; GFX9-NEXT: v_mad_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2563; GFX9-NEXT: v_trunc_f16_e32 v5, v5 2564; GFX9-NEXT: v_mac_f32_e32 v7, v10, v9 2565; GFX9-NEXT: v_fma_f16 v5, -v5, v3, v1 2566; GFX9-NEXT: v_mad_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2567; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9 2568; GFX9-NEXT: v_and_b32_e32 v1, 0xff800000, v1 2569; GFX9-NEXT: v_add_f32_e32 v1, v1, v7 2570; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 2571; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 2572; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 2573; GFX9-NEXT: v_div_fixup_f16 v1, v1, v8, v6 2574; GFX9-NEXT: v_trunc_f16_e32 v1, v1 2575; GFX9-NEXT: v_fma_f16 v1, -v1, v8, v6 2576; GFX9-NEXT: v_pack_b32_f16 v1, v5, v1 2577; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2 2578; GFX9-NEXT: v_cvt_f32_f16_e32 v8, v7 2579; GFX9-NEXT: v_rcp_f32_e32 v5, v5 2580; GFX9-NEXT: v_rcp_f32_e32 v8, v8 2581; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5 2582; GFX9-NEXT: v_mad_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1] 2583; GFX9-NEXT: v_mac_f32_e32 v3, v6, v5 2584; GFX9-NEXT: v_mad_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1] 2585; GFX9-NEXT: 
v_mul_f32_e32 v5, v6, v5 2586; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v5 2587; GFX9-NEXT: v_add_f32_e32 v3, v5, v3 2588; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 2589; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v5 2590; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 2591; GFX9-NEXT: v_mul_f32_e32 v6, v6, v8 2592; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v0 2593; GFX9-NEXT: v_mad_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2594; GFX9-NEXT: v_trunc_f16_e32 v3, v3 2595; GFX9-NEXT: v_mac_f32_e32 v6, v9, v8 2596; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v0 2597; GFX9-NEXT: v_mad_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2598; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8 2599; GFX9-NEXT: v_and_b32_e32 v0, 0xff800000, v0 2600; GFX9-NEXT: v_add_f32_e32 v0, v0, v6 2601; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 2602; GFX9-NEXT: v_div_fixup_f16 v0, v0, v7, v5 2603; GFX9-NEXT: v_trunc_f16_e32 v0, v0 2604; GFX9-NEXT: v_fma_f16 v0, -v0, v7, v5 2605; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0 2606; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 2607; GFX9-NEXT: s_endpgm 2608; 2609; GFX10-LABEL: frem_v4f16: 2610; GFX10: ; %bb.0: 2611; GFX10-NEXT: s_clause 0x1 2612; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2613; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2614; GFX10-NEXT: v_mov_b32_e32 v4, 0 2615; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2616; GFX10-NEXT: s_clause 0x1 2617; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] 2618; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] offset:32 2619; GFX10-NEXT: s_waitcnt vmcnt(1) 2620; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 2621; GFX10-NEXT: s_waitcnt vmcnt(0) 2622; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3 2623; GFX10-NEXT: v_rcp_f32_e32 v7, v6 2624; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7 2625; GFX10-NEXT: v_mad_f32 v9, -v6, v8, v5 2626; GFX10-NEXT: v_mac_f32_e32 v8, v9, v7 2627; GFX10-NEXT: v_mad_f32 v5, -v6, v8, v5 2628; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7 2629; GFX10-NEXT: v_and_b32_e32 v5, 0xff800000, v5 2630; GFX10-NEXT: v_add_f32_e32 v5, v5, v8 
2631; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 2632; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1 2633; GFX10-NEXT: v_trunc_f16_e32 v5, v5 2634; GFX10-NEXT: v_fma_f16 v5, -v5, v3, v1 2635; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 2636; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2637; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3 2638; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1 2639; GFX10-NEXT: v_rcp_f32_e32 v8, v7 2640; GFX10-NEXT: v_mul_f32_e32 v9, v6, v8 2641; GFX10-NEXT: v_mad_f32 v10, -v7, v9, v6 2642; GFX10-NEXT: v_mac_f32_e32 v9, v10, v8 2643; GFX10-NEXT: v_mad_f32 v6, -v7, v9, v6 2644; GFX10-NEXT: v_mul_f32_e32 v6, v6, v8 2645; GFX10-NEXT: v_and_b32_e32 v6, 0xff800000, v6 2646; GFX10-NEXT: v_add_f32_e32 v6, v6, v9 2647; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v6 2648; GFX10-NEXT: v_div_fixup_f16 v6, v6, v3, v1 2649; GFX10-NEXT: v_trunc_f16_e32 v6, v6 2650; GFX10-NEXT: v_fma_f16 v1, -v6, v3, v1 2651; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 2652; GFX10-NEXT: v_pack_b32_f16 v1, v5, v1 2653; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 2654; GFX10-NEXT: v_rcp_f32_e32 v6, v5 2655; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6 2656; GFX10-NEXT: v_mad_f32 v8, -v5, v7, v3 2657; GFX10-NEXT: v_mac_f32_e32 v7, v8, v6 2658; GFX10-NEXT: v_mad_f32 v3, -v5, v7, v3 2659; GFX10-NEXT: v_mul_f32_e32 v3, v3, v6 2660; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3 2661; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 2662; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 2663; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0 2664; GFX10-NEXT: v_trunc_f16_e32 v3, v3 2665; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v0 2666; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 2667; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2668; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2 2669; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 2670; GFX10-NEXT: v_rcp_f32_e32 v7, v6 2671; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7 2672; GFX10-NEXT: v_mad_f32 v9, -v6, v8, v5 2673; GFX10-NEXT: v_mac_f32_e32 v8, v9, v7 2674; GFX10-NEXT: v_mad_f32 v5, -v6, v8, v5 2675; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7 2676; GFX10-NEXT: 
v_and_b32_e32 v5, 0xff800000, v5 2677; GFX10-NEXT: v_add_f32_e32 v5, v5, v8 2678; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 2679; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0 2680; GFX10-NEXT: v_trunc_f16_e32 v5, v5 2681; GFX10-NEXT: v_fma_f16 v0, -v5, v2, v0 2682; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0 2683; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 2684; GFX10-NEXT: s_endpgm 2685; 2686; GFX11-LABEL: frem_v4f16: 2687; GFX11: ; %bb.0: 2688; GFX11-NEXT: s_clause 0x1 2689; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2690; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2691; GFX11-NEXT: v_mov_b32_e32 v4, 0 2692; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2693; GFX11-NEXT: s_clause 0x1 2694; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] 2695; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 2696; GFX11-NEXT: s_waitcnt vmcnt(1) 2697; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v1 2698; GFX11-NEXT: s_waitcnt vmcnt(0) 2699; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v3 2700; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v3 2701; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 2702; GFX11-NEXT: v_rcp_f32_e32 v6, v6 2703; GFX11-NEXT: v_cvt_f32_f16_e32 v9, v8 2704; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 2705; GFX11-NEXT: v_rcp_f32_e32 v9, v9 2706; GFX11-NEXT: s_waitcnt_depctr 0xfff 2707; GFX11-NEXT: v_mul_f32_e32 v5, v5, v6 2708; GFX11-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] 2709; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2710; GFX11-NEXT: v_fmac_f32_e32 v5, v7, v6 2711; GFX11-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] 2712; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2713; GFX11-NEXT: v_mul_f32_e32 v6, v7, v6 2714; GFX11-NEXT: v_and_b32_e32 v6, 0xff800000, v6 2715; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2716; GFX11-NEXT: v_add_f32_e32 v5, v6, v5 2717; GFX11-NEXT: 
v_lshrrev_b32_e32 v6, 16, v1 2718; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v5 2719; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2720; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v6 2721; GFX11-NEXT: v_div_fixup_f16 v5, v5, v3, v1 2722; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2723; GFX11-NEXT: v_mul_f32_e32 v7, v7, v9 2724; GFX11-NEXT: v_trunc_f16_e32 v5, v5 2725; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2726; GFX11-NEXT: v_fma_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2727; GFX11-NEXT: v_fma_f16 v5, -v5, v3, v1 2728; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 2729; GFX11-NEXT: v_fmac_f32_e32 v7, v10, v9 2730; GFX11-NEXT: v_fma_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2731; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 2732; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 2733; GFX11-NEXT: v_mul_f32_e32 v1, v1, v9 2734; GFX11-NEXT: v_and_b32_e32 v1, 0xff800000, v1 2735; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2736; GFX11-NEXT: v_add_f32_e32 v1, v1, v7 2737; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v2 2738; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 2739; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2740; GFX11-NEXT: v_div_fixup_f16 v1, v1, v8, v6 2741; GFX11-NEXT: v_trunc_f16_e32 v1, v1 2742; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2743; GFX11-NEXT: v_fma_f16 v1, -v1, v8, v6 2744; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v7 2745; GFX11-NEXT: v_pack_b32_f16 v1, v5, v1 2746; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2 2747; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 2748; GFX11-NEXT: v_rcp_f32_e32 v8, v8 2749; GFX11-NEXT: v_rcp_f32_e32 v5, v5 2750; GFX11-NEXT: s_waitcnt_depctr 0xfff 2751; GFX11-NEXT: v_mul_f32_e32 v3, v3, 
v5 2752; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2753; GFX11-NEXT: v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1] 2754; GFX11-NEXT: v_fmac_f32_e32 v3, v6, v5 2755; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2756; GFX11-NEXT: v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1] 2757; GFX11-NEXT: v_mul_f32_e32 v5, v6, v5 2758; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2759; GFX11-NEXT: v_and_b32_e32 v5, 0xff800000, v5 2760; GFX11-NEXT: v_add_f32_e32 v3, v5, v3 2761; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 2762; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2763; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 2764; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v5 2765; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2766; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v0 2767; GFX11-NEXT: v_mul_f32_e32 v6, v6, v8 2768; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2769; GFX11-NEXT: v_trunc_f16_e32 v3, v3 2770; GFX11-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2771; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2772; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v0 2773; GFX11-NEXT: v_fmac_f32_e32 v6, v9, v8 2774; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2775; GFX11-NEXT: v_fma_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2776; GFX11-NEXT: v_mul_f32_e32 v0, v0, v8 2777; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2778; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0 2779; GFX11-NEXT: v_add_f32_e32 v0, v0, v6 2780; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2781; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 2782; GFX11-NEXT: v_div_fixup_f16 v0, v0, v7, v5 2783; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2784; GFX11-NEXT: v_trunc_f16_e32 v0, v0 2785; GFX11-NEXT: v_fma_f16 v0, -v0, v7, v5 2786; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2787; GFX11-NEXT: v_pack_b32_f16 v0, v3, v0 2788; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] 2789; GFX11-NEXT: s_endpgm 2790; 2791; GFX1150-LABEL: frem_v4f16: 2792; GFX1150: ; %bb.0: 2793; GFX1150-NEXT: s_clause 0x1 2794; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2795; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2796; GFX1150-NEXT: v_mov_b32_e32 v4, 0 2797; GFX1150-NEXT: s_waitcnt lgkmcnt(0) 2798; GFX1150-NEXT: s_clause 0x1 2799; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[2:3] 2800; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 2801; GFX1150-NEXT: s_waitcnt vmcnt(1) 2802; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v0 2803; GFX1150-NEXT: s_waitcnt vmcnt(0) 2804; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v2 2805; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2806; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5 2807; GFX1150-NEXT: v_cvt_f32_f16_e32 v8, v7 2808; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) 2809; GFX1150-NEXT: v_rcp_f32_e32 v8, v8 2810; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v8 2811; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2812; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2813; GFX1150-NEXT: v_fmac_f32_e32 v6, v9, v8 2814; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2815; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2816; GFX1150-NEXT: v_mul_f32_e32 v8, v9, v8 2817; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2818; GFX1150-NEXT: v_and_b32_e32 v8, 0xff800000, v8 2819; GFX1150-NEXT: v_add_f32_e32 v6, v8, v6 2820; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) 2821; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6 2822; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v7, v5 2823; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2824; GFX1150-NEXT: v_trunc_f16_e32 v6, v6 2825; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6 2826; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 2827; GFX1150-NEXT: v_fmac_f16_e32 v5, v6, v7 2828; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v2 2829; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v0 2830; GFX1150-NEXT: v_rcp_f32_e32 v7, v7 2831; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2832; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v7 2833; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] 2834; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2835; GFX1150-NEXT: v_fmac_f32_e32 v6, v8, v7 2836; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1] 2837; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2838; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7 2839; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7 2840; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2841; GFX1150-NEXT: v_add_f32_e32 v6, v7, v6 2842; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6 2843; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2844; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v2, v0 2845; GFX1150-NEXT: v_trunc_f16_e32 v6, v6 2846; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2847; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6 2848; GFX1150-NEXT: v_fma_f16 v0, v6, v2, v0 2849; GFX1150-NEXT: v_lshrrev_b32_e32 v6, 16, v3 2850; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v1 2851; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 2852; GFX1150-NEXT: v_pack_b32_f16 v0, v0, v5 2853; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, 
v6 2854; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2855; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2 2856; GFX1150-NEXT: v_rcp_f32_e32 v7, v7 2857; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2858; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v7 2859; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2860; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2861; GFX1150-NEXT: v_fmac_f32_e32 v5, v8, v7 2862; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] 2863; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2864; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7 2865; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7 2866; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2867; GFX1150-NEXT: v_add_f32_e32 v5, v7, v5 2868; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5 2869; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2870; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v6, v2 2871; GFX1150-NEXT: v_trunc_f16_e32 v5, v5 2872; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2873; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5 2874; GFX1150-NEXT: v_fmac_f16_e32 v2, v5, v6 2875; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v3 2876; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v1 2877; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) 2878; GFX1150-NEXT: v_rcp_f32_e32 v6, v6 2879; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v6 2880; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2881; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] 2882; GFX1150-NEXT: v_fmac_f32_e32 v5, v7, v6 2883; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2884; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1] 
2885; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6 2886; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2887; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6 2888; GFX1150-NEXT: v_add_f32_e32 v5, v6, v5 2889; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2890; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5 2891; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v3, v1 2892; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2893; GFX1150-NEXT: v_trunc_f16_e32 v5, v5 2894; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5 2895; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2896; GFX1150-NEXT: v_fmac_f16_e32 v1, v5, v3 2897; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v2 2898; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[0:1] 2899; GFX1150-NEXT: s_endpgm 2900 ptr addrspace(1) %in2) #0 { 2901 %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4 2902 %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16 2903 %r1 = load <4 x half>, ptr addrspace(1) %gep2, align 16 2904 %r2 = frem <4 x half> %r0, %r1 2905 store <4 x half> %r2, ptr addrspace(1) %out, align 16 2906 ret void 2907} 2908 2909define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, 2910; SI-LABEL: frem_v2f32: 2911; SI: ; %bb.0: 2912; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 2913; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 2914; SI-NEXT: s_mov_b32 s3, 0xf000 2915; SI-NEXT: s_mov_b32 s2, -1 2916; SI-NEXT: s_waitcnt lgkmcnt(0) 2917; SI-NEXT: s_mov_b32 s0, s8 2918; SI-NEXT: s_mov_b32 s1, s9 2919; SI-NEXT: s_mov_b32 s8, s10 2920; SI-NEXT: s_mov_b32 s9, s11 2921; SI-NEXT: s_mov_b32 s10, s2 2922; SI-NEXT: s_mov_b32 s11, s3 2923; SI-NEXT: s_mov_b32 s6, s2 2924; SI-NEXT: s_mov_b32 s7, s3 2925; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 2926; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 2927; SI-NEXT: s_waitcnt vmcnt(0) 2928; SI-NEXT: 
v_div_scale_f32 v4, vcc, v1, v3, v1 2929; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 2930; SI-NEXT: v_rcp_f32_e32 v6, v5 2931; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2932; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 2933; SI-NEXT: v_fma_f32 v6, v7, v6, v6 2934; SI-NEXT: v_mul_f32_e32 v7, v4, v6 2935; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 2936; SI-NEXT: v_fma_f32 v7, v8, v6, v7 2937; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 2938; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2939; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 2940; SI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 2941; SI-NEXT: v_trunc_f32_e32 v4, v4 2942; SI-NEXT: v_fma_f32 v1, -v4, v3, v1 2943; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 2944; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 2945; SI-NEXT: v_rcp_f32_e32 v5, v4 2946; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2947; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 2948; SI-NEXT: v_fma_f32 v5, v6, v5, v5 2949; SI-NEXT: v_mul_f32_e32 v6, v3, v5 2950; SI-NEXT: v_fma_f32 v7, -v4, v6, v3 2951; SI-NEXT: v_fma_f32 v6, v7, v5, v6 2952; SI-NEXT: v_fma_f32 v3, -v4, v6, v3 2953; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2954; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 2955; SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 2956; SI-NEXT: v_trunc_f32_e32 v3, v3 2957; SI-NEXT: v_fma_f32 v0, -v3, v2, v0 2958; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2959; SI-NEXT: s_endpgm 2960; 2961; CI-LABEL: frem_v2f32: 2962; CI: ; %bb.0: 2963; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 2964; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 2965; CI-NEXT: s_mov_b32 s3, 0xf000 2966; CI-NEXT: s_mov_b32 s2, -1 2967; CI-NEXT: s_mov_b32 s6, s2 2968; CI-NEXT: s_waitcnt lgkmcnt(0) 2969; CI-NEXT: s_mov_b32 s0, s8 2970; CI-NEXT: s_mov_b32 s1, s9 2971; CI-NEXT: s_mov_b32 s8, s10 2972; CI-NEXT: s_mov_b32 s9, s11 2973; CI-NEXT: s_mov_b32 s10, s2 2974; CI-NEXT: s_mov_b32 s11, s3 2975; CI-NEXT: s_mov_b32 s7, s3 2976; CI-NEXT: buffer_load_dwordx2 v[0:1], off, 
s[8:11], 0 2977; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 2978; CI-NEXT: s_waitcnt vmcnt(0) 2979; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 2980; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 2981; CI-NEXT: v_rcp_f32_e32 v6, v5 2982; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2983; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 2984; CI-NEXT: v_fma_f32 v6, v7, v6, v6 2985; CI-NEXT: v_mul_f32_e32 v7, v4, v6 2986; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 2987; CI-NEXT: v_fma_f32 v7, v8, v6, v7 2988; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 2989; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 2990; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 2991; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 2992; CI-NEXT: v_trunc_f32_e32 v4, v4 2993; CI-NEXT: v_fma_f32 v1, -v4, v3, v1 2994; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 2995; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 2996; CI-NEXT: v_rcp_f32_e32 v5, v4 2997; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 2998; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 2999; CI-NEXT: v_fma_f32 v5, v6, v5, v5 3000; CI-NEXT: v_mul_f32_e32 v6, v3, v5 3001; CI-NEXT: v_fma_f32 v7, -v4, v6, v3 3002; CI-NEXT: v_fma_f32 v6, v7, v5, v6 3003; CI-NEXT: v_fma_f32 v3, -v4, v6, v3 3004; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3005; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 3006; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 3007; CI-NEXT: v_trunc_f32_e32 v3, v3 3008; CI-NEXT: v_fma_f32 v0, -v3, v2, v0 3009; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3010; CI-NEXT: s_endpgm 3011; 3012; VI-LABEL: frem_v2f32: 3013; VI: ; %bb.0: 3014; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3015; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 3016; VI-NEXT: s_waitcnt lgkmcnt(0) 3017; VI-NEXT: v_mov_b32_e32 v0, s0 3018; VI-NEXT: s_add_u32 s0, s4, 32 3019; VI-NEXT: v_mov_b32_e32 v1, s1 3020; VI-NEXT: s_addc_u32 s1, s5, 0 3021; VI-NEXT: v_mov_b32_e32 v5, s1 3022; VI-NEXT: v_mov_b32_e32 v2, s2 3023; VI-NEXT: v_mov_b32_e32 v3, s3 
3024; VI-NEXT: v_mov_b32_e32 v4, s0 3025; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 3026; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] 3027; VI-NEXT: s_waitcnt vmcnt(0) 3028; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3 3029; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3 3030; VI-NEXT: v_rcp_f32_e32 v8, v7 3031; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3032; VI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 3033; VI-NEXT: v_fma_f32 v8, v9, v8, v8 3034; VI-NEXT: v_mul_f32_e32 v9, v6, v8 3035; VI-NEXT: v_fma_f32 v10, -v7, v9, v6 3036; VI-NEXT: v_fma_f32 v9, v10, v8, v9 3037; VI-NEXT: v_fma_f32 v6, -v7, v9, v6 3038; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3039; VI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 3040; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v3 3041; VI-NEXT: v_trunc_f32_e32 v6, v6 3042; VI-NEXT: v_fma_f32 v3, -v6, v5, v3 3043; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v2 3044; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v4, v2 3045; VI-NEXT: v_rcp_f32_e32 v7, v6 3046; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3047; VI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 3048; VI-NEXT: v_fma_f32 v7, v8, v7, v7 3049; VI-NEXT: v_mul_f32_e32 v8, v5, v7 3050; VI-NEXT: v_fma_f32 v9, -v6, v8, v5 3051; VI-NEXT: v_fma_f32 v8, v9, v7, v8 3052; VI-NEXT: v_fma_f32 v5, -v6, v8, v5 3053; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3054; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 3055; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v2 3056; VI-NEXT: v_trunc_f32_e32 v5, v5 3057; VI-NEXT: v_fma_f32 v2, -v5, v4, v2 3058; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 3059; VI-NEXT: s_endpgm 3060; 3061; GFX9-LABEL: frem_v2f32: 3062; GFX9: ; %bb.0: 3063; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3064; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 3065; GFX9-NEXT: v_mov_b32_e32 v4, 0 3066; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3067; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] 3068; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] offset:32 3069; GFX9-NEXT: s_waitcnt vmcnt(0) 
3070; GFX9-NEXT: v_div_scale_f32 v6, s[2:3], v3, v3, v1 3071; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 3072; GFX9-NEXT: v_rcp_f32_e32 v7, v6 3073; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3074; GFX9-NEXT: v_fma_f32 v8, -v6, v7, 1.0 3075; GFX9-NEXT: v_fma_f32 v7, v8, v7, v7 3076; GFX9-NEXT: v_mul_f32_e32 v8, v5, v7 3077; GFX9-NEXT: v_fma_f32 v9, -v6, v8, v5 3078; GFX9-NEXT: v_fma_f32 v8, v9, v7, v8 3079; GFX9-NEXT: v_fma_f32 v5, -v6, v8, v5 3080; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3081; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v8 3082; GFX9-NEXT: v_div_fixup_f32 v5, v5, v3, v1 3083; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3084; GFX9-NEXT: v_fma_f32 v1, -v5, v3, v1 3085; GFX9-NEXT: v_div_scale_f32 v5, s[2:3], v2, v2, v0 3086; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 3087; GFX9-NEXT: v_rcp_f32_e32 v6, v5 3088; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3089; GFX9-NEXT: v_fma_f32 v7, -v5, v6, 1.0 3090; GFX9-NEXT: v_fma_f32 v6, v7, v6, v6 3091; GFX9-NEXT: v_mul_f32_e32 v7, v3, v6 3092; GFX9-NEXT: v_fma_f32 v8, -v5, v7, v3 3093; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7 3094; GFX9-NEXT: v_fma_f32 v3, -v5, v7, v3 3095; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3096; GFX9-NEXT: v_div_fmas_f32 v3, v3, v6, v7 3097; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v0 3098; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3099; GFX9-NEXT: v_fma_f32 v0, -v3, v2, v0 3100; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 3101; GFX9-NEXT: s_endpgm 3102; 3103; GFX10-LABEL: frem_v2f32: 3104; GFX10: ; %bb.0: 3105; GFX10-NEXT: s_clause 0x1 3106; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3107; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 3108; GFX10-NEXT: v_mov_b32_e32 v4, 0 3109; GFX10-NEXT: s_waitcnt lgkmcnt(0) 3110; GFX10-NEXT: s_clause 0x1 3111; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] 3112; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] offset:32 3113; GFX10-NEXT: s_waitcnt vmcnt(0) 3114; GFX10-NEXT: 
v_div_scale_f32 v6, s2, v3, v3, v1 3115; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 3116; GFX10-NEXT: v_rcp_f32_e32 v7, v6 3117; GFX10-NEXT: s_denorm_mode 15 3118; GFX10-NEXT: v_fma_f32 v8, -v6, v7, 1.0 3119; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v7 3120; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7 3121; GFX10-NEXT: v_fma_f32 v9, -v6, v8, v5 3122; GFX10-NEXT: v_fmac_f32_e32 v8, v9, v7 3123; GFX10-NEXT: v_fma_f32 v5, -v6, v8, v5 3124; GFX10-NEXT: s_denorm_mode 12 3125; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8 3126; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1 3127; GFX10-NEXT: v_trunc_f32_e32 v5, v5 3128; GFX10-NEXT: v_fma_f32 v1, -v5, v3, v1 3129; GFX10-NEXT: v_div_scale_f32 v5, s2, v2, v2, v0 3130; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 3131; GFX10-NEXT: v_rcp_f32_e32 v6, v5 3132; GFX10-NEXT: s_denorm_mode 15 3133; GFX10-NEXT: v_fma_f32 v7, -v5, v6, 1.0 3134; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v6 3135; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6 3136; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v3 3137; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v6 3138; GFX10-NEXT: v_fma_f32 v3, -v5, v7, v3 3139; GFX10-NEXT: s_denorm_mode 12 3140; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7 3141; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0 3142; GFX10-NEXT: v_trunc_f32_e32 v3, v3 3143; GFX10-NEXT: v_fma_f32 v0, -v3, v2, v0 3144; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 3145; GFX10-NEXT: s_endpgm 3146; 3147; GFX11-LABEL: frem_v2f32: 3148; GFX11: ; %bb.0: 3149; GFX11-NEXT: s_clause 0x1 3150; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3151; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 3152; GFX11-NEXT: v_mov_b32_e32 v4, 0 3153; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3154; GFX11-NEXT: s_clause 0x1 3155; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] 3156; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 3157; GFX11-NEXT: s_waitcnt vmcnt(0) 3158; GFX11-NEXT: v_div_scale_f32 v6, null, v3, v3, v1 3159; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 3160; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 3161; GFX11-NEXT: v_rcp_f32_e32 v7, v6 3162; GFX11-NEXT: s_denorm_mode 15 3163; GFX11-NEXT: s_waitcnt_depctr 0xfff 3164; GFX11-NEXT: v_fma_f32 v8, -v6, v7, 1.0 3165; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v7 3166; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3167; GFX11-NEXT: v_mul_f32_e32 v8, v5, v7 3168; GFX11-NEXT: v_fma_f32 v9, -v6, v8, v5 3169; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3170; GFX11-NEXT: v_fmac_f32_e32 v8, v9, v7 3171; GFX11-NEXT: v_fma_f32 v5, -v6, v8, v5 3172; GFX11-NEXT: s_denorm_mode 12 3173; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3174; GFX11-NEXT: v_div_fmas_f32 v5, v5, v7, v8 3175; GFX11-NEXT: v_div_fixup_f32 v5, v5, v3, v1 3176; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3177; GFX11-NEXT: v_trunc_f32_e32 v5, v5 3178; GFX11-NEXT: v_fma_f32 v1, -v5, v3, v1 3179; GFX11-NEXT: v_div_scale_f32 v5, null, v2, v2, v0 3180; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 3181; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 3182; GFX11-NEXT: v_rcp_f32_e32 v6, v5 3183; GFX11-NEXT: s_denorm_mode 15 3184; GFX11-NEXT: s_waitcnt_depctr 0xfff 3185; GFX11-NEXT: v_fma_f32 v7, -v5, v6, 1.0 3186; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v6 3187; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3188; GFX11-NEXT: v_mul_f32_e32 v7, v3, v6 3189; GFX11-NEXT: v_fma_f32 v8, -v5, v7, v3 3190; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3191; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v6 3192; GFX11-NEXT: v_fma_f32 v3, -v5, v7, v3 3193; GFX11-NEXT: s_denorm_mode 12 3194; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3195; GFX11-NEXT: v_div_fmas_f32 v3, v3, v6, v7 3196; GFX11-NEXT: v_div_fixup_f32 v3, v3, v2, v0 3197; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3198; GFX11-NEXT: v_trunc_f32_e32 v3, v3 3199; GFX11-NEXT: v_fma_f32 v0, -v3, v2, v0 3200; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] 3201; GFX11-NEXT: s_endpgm 3202; 3203; GFX1150-LABEL: frem_v2f32: 3204; GFX1150: ; %bb.0: 3205; GFX1150-NEXT: s_clause 0x1 3206; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3207; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 3208; GFX1150-NEXT: v_mov_b32_e32 v4, 0 3209; GFX1150-NEXT: s_waitcnt lgkmcnt(0) 3210; GFX1150-NEXT: s_clause 0x1 3211; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[2:3] 3212; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32 3213; GFX1150-NEXT: s_waitcnt vmcnt(0) 3214; GFX1150-NEXT: v_div_scale_f32 v6, null, v3, v3, v1 3215; GFX1150-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 3216; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) 3217; GFX1150-NEXT: v_rcp_f32_e32 v7, v6 3218; GFX1150-NEXT: s_denorm_mode 15 3219; GFX1150-NEXT: v_fma_f32 v8, -v6, v7, 1.0 3220; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3221; GFX1150-NEXT: v_fmac_f32_e32 v7, v8, v7 3222; GFX1150-NEXT: v_mul_f32_e32 v8, v5, v7 3223; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3224; GFX1150-NEXT: v_fma_f32 v9, -v6, v8, v5 3225; GFX1150-NEXT: v_fmac_f32_e32 v8, v9, v7 3226; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 3227; GFX1150-NEXT: v_fma_f32 v5, -v6, v8, v5 3228; GFX1150-NEXT: s_denorm_mode 12 3229; GFX1150-NEXT: v_div_fmas_f32 v5, v5, v7, v8 3230; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3231; GFX1150-NEXT: v_div_fixup_f32 v5, v5, v3, v1 3232; GFX1150-NEXT: v_trunc_f32_e32 v5, v5 3233; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3234; GFX1150-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 3235; GFX1150-NEXT: 
v_fma_f32 v1, v5, v3, v1 3236; GFX1150-NEXT: v_div_scale_f32 v5, null, v2, v2, v0 3237; GFX1150-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 3238; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) 3239; GFX1150-NEXT: v_rcp_f32_e32 v6, v5 3240; GFX1150-NEXT: s_denorm_mode 15 3241; GFX1150-NEXT: v_fma_f32 v7, -v5, v6, 1.0 3242; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3243; GFX1150-NEXT: v_fmac_f32_e32 v6, v7, v6 3244; GFX1150-NEXT: v_mul_f32_e32 v7, v3, v6 3245; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3246; GFX1150-NEXT: v_fma_f32 v8, -v5, v7, v3 3247; GFX1150-NEXT: v_fmac_f32_e32 v7, v8, v6 3248; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 3249; GFX1150-NEXT: v_fma_f32 v3, -v5, v7, v3 3250; GFX1150-NEXT: s_denorm_mode 12 3251; GFX1150-NEXT: v_div_fmas_f32 v3, v3, v6, v7 3252; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3253; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v2, v0 3254; GFX1150-NEXT: v_trunc_f32_e32 v3, v3 3255; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3256; GFX1150-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 3257; GFX1150-NEXT: v_fmac_f32_e32 v0, v3, v2 3258; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[0:1] 3259; GFX1150-NEXT: s_endpgm 3260 ptr addrspace(1) %in2) #0 { 3261 %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4 3262 %r0 = load <2 x float>, ptr addrspace(1) %in1, align 8 3263 %r1 = load <2 x float>, ptr addrspace(1) %gep2, align 8 3264 %r2 = frem <2 x float> %r0, %r1 3265 store <2 x float> %r2, ptr addrspace(1) %out, align 8 3266 ret void 3267} 3268 3269define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, 3270; SI-LABEL: frem_v4f32: 3271; SI: ; %bb.0: 3272; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 3273; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 3274; 
SI-NEXT: s_mov_b32 s3, 0xf000 3275; SI-NEXT: s_mov_b32 s2, -1 3276; SI-NEXT: s_waitcnt lgkmcnt(0) 3277; SI-NEXT: s_mov_b32 s0, s8 3278; SI-NEXT: s_mov_b32 s1, s9 3279; SI-NEXT: s_mov_b32 s8, s10 3280; SI-NEXT: s_mov_b32 s9, s11 3281; SI-NEXT: s_mov_b32 s10, s2 3282; SI-NEXT: s_mov_b32 s11, s3 3283; SI-NEXT: s_mov_b32 s6, s2 3284; SI-NEXT: s_mov_b32 s7, s3 3285; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 3286; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:64 3287; SI-NEXT: s_waitcnt vmcnt(0) 3288; SI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 3289; SI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 3290; SI-NEXT: v_rcp_f32_e32 v10, v9 3291; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3292; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 3293; SI-NEXT: v_fma_f32 v10, v11, v10, v10 3294; SI-NEXT: v_mul_f32_e32 v11, v8, v10 3295; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 3296; SI-NEXT: v_fma_f32 v11, v12, v10, v11 3297; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 3298; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3299; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 3300; SI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 3301; SI-NEXT: v_trunc_f32_e32 v8, v8 3302; SI-NEXT: v_fma_f32 v3, -v8, v7, v3 3303; SI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 3304; SI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 3305; SI-NEXT: v_rcp_f32_e32 v9, v8 3306; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3307; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 3308; SI-NEXT: v_fma_f32 v9, v10, v9, v9 3309; SI-NEXT: v_mul_f32_e32 v10, v7, v9 3310; SI-NEXT: v_fma_f32 v11, -v8, v10, v7 3311; SI-NEXT: v_fma_f32 v10, v11, v9, v10 3312; SI-NEXT: v_fma_f32 v7, -v8, v10, v7 3313; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3314; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 3315; SI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 3316; SI-NEXT: v_trunc_f32_e32 v7, v7 3317; SI-NEXT: v_fma_f32 v2, -v7, v6, v2 3318; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 3319; SI-NEXT: v_div_scale_f32 v7, s[4:5], 
v5, v5, v1 3320; SI-NEXT: v_rcp_f32_e32 v8, v7 3321; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3322; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 3323; SI-NEXT: v_fma_f32 v8, v9, v8, v8 3324; SI-NEXT: v_mul_f32_e32 v9, v6, v8 3325; SI-NEXT: v_fma_f32 v10, -v7, v9, v6 3326; SI-NEXT: v_fma_f32 v9, v10, v8, v9 3327; SI-NEXT: v_fma_f32 v6, -v7, v9, v6 3328; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3329; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 3330; SI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 3331; SI-NEXT: v_trunc_f32_e32 v6, v6 3332; SI-NEXT: v_fma_f32 v1, -v6, v5, v1 3333; SI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 3334; SI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 3335; SI-NEXT: v_rcp_f32_e32 v7, v6 3336; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3337; SI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 3338; SI-NEXT: v_fma_f32 v7, v8, v7, v7 3339; SI-NEXT: v_mul_f32_e32 v8, v5, v7 3340; SI-NEXT: v_fma_f32 v9, -v6, v8, v5 3341; SI-NEXT: v_fma_f32 v8, v9, v7, v8 3342; SI-NEXT: v_fma_f32 v5, -v6, v8, v5 3343; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3344; SI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 3345; SI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 3346; SI-NEXT: v_trunc_f32_e32 v5, v5 3347; SI-NEXT: v_fma_f32 v0, -v5, v4, v0 3348; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3349; SI-NEXT: s_endpgm 3350; 3351; CI-LABEL: frem_v4f32: 3352; CI: ; %bb.0: 3353; CI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 3354; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 3355; CI-NEXT: s_mov_b32 s3, 0xf000 3356; CI-NEXT: s_mov_b32 s2, -1 3357; CI-NEXT: s_mov_b32 s6, s2 3358; CI-NEXT: s_waitcnt lgkmcnt(0) 3359; CI-NEXT: s_mov_b32 s0, s8 3360; CI-NEXT: s_mov_b32 s1, s9 3361; CI-NEXT: s_mov_b32 s8, s10 3362; CI-NEXT: s_mov_b32 s9, s11 3363; CI-NEXT: s_mov_b32 s10, s2 3364; CI-NEXT: s_mov_b32 s11, s3 3365; CI-NEXT: s_mov_b32 s7, s3 3366; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 3367; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:64 3368; 
CI-NEXT: s_waitcnt vmcnt(0) 3369; CI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 3370; CI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 3371; CI-NEXT: v_rcp_f32_e32 v10, v9 3372; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3373; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 3374; CI-NEXT: v_fma_f32 v10, v11, v10, v10 3375; CI-NEXT: v_mul_f32_e32 v11, v8, v10 3376; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 3377; CI-NEXT: v_fma_f32 v11, v12, v10, v11 3378; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 3379; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3380; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 3381; CI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 3382; CI-NEXT: v_trunc_f32_e32 v8, v8 3383; CI-NEXT: v_fma_f32 v3, -v8, v7, v3 3384; CI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 3385; CI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 3386; CI-NEXT: v_rcp_f32_e32 v9, v8 3387; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3388; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 3389; CI-NEXT: v_fma_f32 v9, v10, v9, v9 3390; CI-NEXT: v_mul_f32_e32 v10, v7, v9 3391; CI-NEXT: v_fma_f32 v11, -v8, v10, v7 3392; CI-NEXT: v_fma_f32 v10, v11, v9, v10 3393; CI-NEXT: v_fma_f32 v7, -v8, v10, v7 3394; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3395; CI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 3396; CI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 3397; CI-NEXT: v_trunc_f32_e32 v7, v7 3398; CI-NEXT: v_fma_f32 v2, -v7, v6, v2 3399; CI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 3400; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 3401; CI-NEXT: v_rcp_f32_e32 v8, v7 3402; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3403; CI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 3404; CI-NEXT: v_fma_f32 v8, v9, v8, v8 3405; CI-NEXT: v_mul_f32_e32 v9, v6, v8 3406; CI-NEXT: v_fma_f32 v10, -v7, v9, v6 3407; CI-NEXT: v_fma_f32 v9, v10, v8, v9 3408; CI-NEXT: v_fma_f32 v6, -v7, v9, v6 3409; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3410; CI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 3411; CI-NEXT: 
v_div_fixup_f32 v6, v6, v5, v1 3412; CI-NEXT: v_trunc_f32_e32 v6, v6 3413; CI-NEXT: v_fma_f32 v1, -v6, v5, v1 3414; CI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 3415; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 3416; CI-NEXT: v_rcp_f32_e32 v7, v6 3417; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3418; CI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 3419; CI-NEXT: v_fma_f32 v7, v8, v7, v7 3420; CI-NEXT: v_mul_f32_e32 v8, v5, v7 3421; CI-NEXT: v_fma_f32 v9, -v6, v8, v5 3422; CI-NEXT: v_fma_f32 v8, v9, v7, v8 3423; CI-NEXT: v_fma_f32 v5, -v6, v8, v5 3424; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3425; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 3426; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 3427; CI-NEXT: v_trunc_f32_e32 v5, v5 3428; CI-NEXT: v_fma_f32 v0, -v5, v4, v0 3429; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 3430; CI-NEXT: s_endpgm 3431; 3432; VI-LABEL: frem_v4f32: 3433; VI: ; %bb.0: 3434; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3435; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 3436; VI-NEXT: s_waitcnt lgkmcnt(0) 3437; VI-NEXT: v_mov_b32_e32 v8, s0 3438; VI-NEXT: s_add_u32 s0, s4, 64 3439; VI-NEXT: v_mov_b32_e32 v9, s1 3440; VI-NEXT: s_addc_u32 s1, s5, 0 3441; VI-NEXT: v_mov_b32_e32 v5, s1 3442; VI-NEXT: v_mov_b32_e32 v0, s2 3443; VI-NEXT: v_mov_b32_e32 v1, s3 3444; VI-NEXT: v_mov_b32_e32 v4, s0 3445; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 3446; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 3447; VI-NEXT: s_waitcnt vmcnt(0) 3448; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3 3449; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3 3450; VI-NEXT: v_rcp_f32_e32 v12, v11 3451; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3452; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 3453; VI-NEXT: v_fma_f32 v12, v13, v12, v12 3454; VI-NEXT: v_mul_f32_e32 v13, v10, v12 3455; VI-NEXT: v_fma_f32 v14, -v11, v13, v10 3456; VI-NEXT: v_fma_f32 v13, v14, v12, v13 3457; VI-NEXT: v_fma_f32 v10, -v11, v13, v10 3458; VI-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 4, 2), 0 3459; VI-NEXT: v_div_fmas_f32 v10, v10, v12, v13 3460; VI-NEXT: v_div_fixup_f32 v10, v10, v7, v3 3461; VI-NEXT: v_trunc_f32_e32 v10, v10 3462; VI-NEXT: v_fma_f32 v3, -v10, v7, v3 3463; VI-NEXT: v_div_scale_f32 v10, s[0:1], v6, v6, v2 3464; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 3465; VI-NEXT: v_rcp_f32_e32 v11, v10 3466; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3467; VI-NEXT: v_fma_f32 v12, -v10, v11, 1.0 3468; VI-NEXT: v_fma_f32 v11, v12, v11, v11 3469; VI-NEXT: v_mul_f32_e32 v12, v7, v11 3470; VI-NEXT: v_fma_f32 v13, -v10, v12, v7 3471; VI-NEXT: v_fma_f32 v12, v13, v11, v12 3472; VI-NEXT: v_fma_f32 v7, -v10, v12, v7 3473; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3474; VI-NEXT: v_div_fmas_f32 v7, v7, v11, v12 3475; VI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 3476; VI-NEXT: v_trunc_f32_e32 v7, v7 3477; VI-NEXT: v_fma_f32 v2, -v7, v6, v2 3478; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 3479; VI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 3480; VI-NEXT: v_rcp_f32_e32 v10, v7 3481; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3482; VI-NEXT: v_fma_f32 v11, -v7, v10, 1.0 3483; VI-NEXT: v_fma_f32 v10, v11, v10, v10 3484; VI-NEXT: v_mul_f32_e32 v11, v6, v10 3485; VI-NEXT: v_fma_f32 v12, -v7, v11, v6 3486; VI-NEXT: v_fma_f32 v11, v12, v10, v11 3487; VI-NEXT: v_fma_f32 v6, -v7, v11, v6 3488; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3489; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 3490; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 3491; VI-NEXT: v_trunc_f32_e32 v6, v6 3492; VI-NEXT: v_fma_f32 v1, -v6, v5, v1 3493; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 3494; VI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 3495; VI-NEXT: v_rcp_f32_e32 v7, v6 3496; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3497; VI-NEXT: v_fma_f32 v10, -v6, v7, 1.0 3498; VI-NEXT: v_fma_f32 v7, v10, v7, v7 3499; VI-NEXT: v_mul_f32_e32 v10, v5, v7 3500; VI-NEXT: v_fma_f32 v11, -v6, v10, v5 3501; VI-NEXT: 
v_fma_f32 v10, v11, v7, v10 3502; VI-NEXT: v_fma_f32 v5, -v6, v10, v5 3503; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3504; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v10 3505; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 3506; VI-NEXT: v_trunc_f32_e32 v5, v5 3507; VI-NEXT: v_fma_f32 v0, -v5, v4, v0 3508; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 3509; VI-NEXT: s_endpgm 3510; 3511; GFX9-LABEL: frem_v4f32: 3512; GFX9: ; %bb.0: 3513; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3514; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 3515; GFX9-NEXT: v_mov_b32_e32 v8, 0 3516; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3517; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] 3518; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:64 3519; GFX9-NEXT: s_waitcnt vmcnt(0) 3520; GFX9-NEXT: v_div_scale_f32 v10, s[2:3], v7, v7, v3 3521; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 3522; GFX9-NEXT: v_rcp_f32_e32 v11, v10 3523; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3524; GFX9-NEXT: v_fma_f32 v12, -v10, v11, 1.0 3525; GFX9-NEXT: v_fma_f32 v11, v12, v11, v11 3526; GFX9-NEXT: v_mul_f32_e32 v12, v9, v11 3527; GFX9-NEXT: v_fma_f32 v13, -v10, v12, v9 3528; GFX9-NEXT: v_fma_f32 v12, v13, v11, v12 3529; GFX9-NEXT: v_fma_f32 v9, -v10, v12, v9 3530; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3531; GFX9-NEXT: v_div_fmas_f32 v9, v9, v11, v12 3532; GFX9-NEXT: v_div_fixup_f32 v9, v9, v7, v3 3533; GFX9-NEXT: v_trunc_f32_e32 v9, v9 3534; GFX9-NEXT: v_fma_f32 v3, -v9, v7, v3 3535; GFX9-NEXT: v_div_scale_f32 v9, s[2:3], v6, v6, v2 3536; GFX9-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 3537; GFX9-NEXT: v_rcp_f32_e32 v10, v9 3538; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3539; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0 3540; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10 3541; GFX9-NEXT: v_mul_f32_e32 v11, v7, v10 3542; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v7 3543; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11 3544; GFX9-NEXT: v_fma_f32 v7, -v9, v11, v7 3545; 
GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3546; GFX9-NEXT: v_div_fmas_f32 v7, v7, v10, v11 3547; GFX9-NEXT: v_div_fixup_f32 v7, v7, v6, v2 3548; GFX9-NEXT: v_trunc_f32_e32 v7, v7 3549; GFX9-NEXT: v_fma_f32 v2, -v7, v6, v2 3550; GFX9-NEXT: v_div_scale_f32 v7, s[2:3], v5, v5, v1 3551; GFX9-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 3552; GFX9-NEXT: v_rcp_f32_e32 v9, v7 3553; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3554; GFX9-NEXT: v_fma_f32 v10, -v7, v9, 1.0 3555; GFX9-NEXT: v_fma_f32 v9, v10, v9, v9 3556; GFX9-NEXT: v_mul_f32_e32 v10, v6, v9 3557; GFX9-NEXT: v_fma_f32 v11, -v7, v10, v6 3558; GFX9-NEXT: v_fma_f32 v10, v11, v9, v10 3559; GFX9-NEXT: v_fma_f32 v6, -v7, v10, v6 3560; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3561; GFX9-NEXT: v_div_fmas_f32 v6, v6, v9, v10 3562; GFX9-NEXT: v_div_fixup_f32 v6, v6, v5, v1 3563; GFX9-NEXT: v_trunc_f32_e32 v6, v6 3564; GFX9-NEXT: v_fma_f32 v1, -v6, v5, v1 3565; GFX9-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, v0 3566; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 3567; GFX9-NEXT: v_rcp_f32_e32 v7, v6 3568; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 3569; GFX9-NEXT: v_fma_f32 v9, -v6, v7, 1.0 3570; GFX9-NEXT: v_fma_f32 v7, v9, v7, v7 3571; GFX9-NEXT: v_mul_f32_e32 v9, v5, v7 3572; GFX9-NEXT: v_fma_f32 v10, -v6, v9, v5 3573; GFX9-NEXT: v_fma_f32 v9, v10, v7, v9 3574; GFX9-NEXT: v_fma_f32 v5, -v6, v9, v5 3575; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 3576; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v9 3577; GFX9-NEXT: v_div_fixup_f32 v5, v5, v4, v0 3578; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3579; GFX9-NEXT: v_fma_f32 v0, -v5, v4, v0 3580; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] 3581; GFX9-NEXT: s_endpgm 3582; 3583; GFX10-LABEL: frem_v4f32: 3584; GFX10: ; %bb.0: 3585; GFX10-NEXT: s_clause 0x1 3586; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3587; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 3588; GFX10-NEXT: v_mov_b32_e32 v8, 0 3589; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) 3590; GFX10-NEXT: s_clause 0x1 3591; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] 3592; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7] offset:64 3593; GFX10-NEXT: s_waitcnt vmcnt(0) 3594; GFX10-NEXT: v_div_scale_f32 v10, s2, v7, v7, v3 3595; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 3596; GFX10-NEXT: v_rcp_f32_e32 v11, v10 3597; GFX10-NEXT: s_denorm_mode 15 3598; GFX10-NEXT: v_fma_f32 v12, -v10, v11, 1.0 3599; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v11 3600; GFX10-NEXT: v_mul_f32_e32 v12, v9, v11 3601; GFX10-NEXT: v_fma_f32 v13, -v10, v12, v9 3602; GFX10-NEXT: v_fmac_f32_e32 v12, v13, v11 3603; GFX10-NEXT: v_fma_f32 v9, -v10, v12, v9 3604; GFX10-NEXT: s_denorm_mode 12 3605; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12 3606; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3 3607; GFX10-NEXT: v_trunc_f32_e32 v9, v9 3608; GFX10-NEXT: v_fma_f32 v3, -v9, v7, v3 3609; GFX10-NEXT: v_div_scale_f32 v9, s2, v6, v6, v2 3610; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 3611; GFX10-NEXT: v_rcp_f32_e32 v10, v9 3612; GFX10-NEXT: s_denorm_mode 15 3613; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0 3614; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v10 3615; GFX10-NEXT: v_mul_f32_e32 v11, v7, v10 3616; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v7 3617; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v10 3618; GFX10-NEXT: v_fma_f32 v7, -v9, v11, v7 3619; GFX10-NEXT: s_denorm_mode 12 3620; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11 3621; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2 3622; GFX10-NEXT: v_trunc_f32_e32 v7, v7 3623; GFX10-NEXT: v_fma_f32 v2, -v7, v6, v2 3624; GFX10-NEXT: v_div_scale_f32 v7, s2, v5, v5, v1 3625; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 3626; GFX10-NEXT: v_rcp_f32_e32 v9, v7 3627; GFX10-NEXT: s_denorm_mode 15 3628; GFX10-NEXT: v_fma_f32 v10, -v7, v9, 1.0 3629; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v9 3630; GFX10-NEXT: v_mul_f32_e32 v10, v6, v9 3631; GFX10-NEXT: v_fma_f32 v11, -v7, v10, v6 3632; GFX10-NEXT: v_fmac_f32_e32 v10, v11, 
v9 3633; GFX10-NEXT: v_fma_f32 v6, -v7, v10, v6 3634; GFX10-NEXT: s_denorm_mode 12 3635; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10 3636; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1 3637; GFX10-NEXT: v_trunc_f32_e32 v6, v6 3638; GFX10-NEXT: v_fma_f32 v1, -v6, v5, v1 3639; GFX10-NEXT: v_div_scale_f32 v6, s2, v4, v4, v0 3640; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 3641; GFX10-NEXT: v_rcp_f32_e32 v7, v6 3642; GFX10-NEXT: s_denorm_mode 15 3643; GFX10-NEXT: v_fma_f32 v9, -v6, v7, 1.0 3644; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v7 3645; GFX10-NEXT: v_mul_f32_e32 v9, v5, v7 3646; GFX10-NEXT: v_fma_f32 v10, -v6, v9, v5 3647; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v7 3648; GFX10-NEXT: v_fma_f32 v5, -v6, v9, v5 3649; GFX10-NEXT: s_denorm_mode 12 3650; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9 3651; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0 3652; GFX10-NEXT: v_trunc_f32_e32 v5, v5 3653; GFX10-NEXT: v_fma_f32 v0, -v5, v4, v0 3654; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] 3655; GFX10-NEXT: s_endpgm 3656; 3657; GFX11-LABEL: frem_v4f32: 3658; GFX11: ; %bb.0: 3659; GFX11-NEXT: s_clause 0x1 3660; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3661; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 3662; GFX11-NEXT: v_mov_b32_e32 v8, 0 3663; GFX11-NEXT: s_waitcnt lgkmcnt(0) 3664; GFX11-NEXT: s_clause 0x1 3665; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3] 3666; GFX11-NEXT: global_load_b128 v[4:7], v8, s[4:5] offset:64 3667; GFX11-NEXT: s_waitcnt vmcnt(0) 3668; GFX11-NEXT: v_div_scale_f32 v10, null, v7, v7, v3 3669; GFX11-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 3670; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 3671; GFX11-NEXT: v_rcp_f32_e32 v11, v10 3672; GFX11-NEXT: s_denorm_mode 15 3673; GFX11-NEXT: s_waitcnt_depctr 0xfff 3674; GFX11-NEXT: v_fma_f32 v12, -v10, v11, 1.0 3675; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v11 3676; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3677; GFX11-NEXT: 
v_mul_f32_e32 v12, v9, v11 3678; GFX11-NEXT: v_fma_f32 v13, -v10, v12, v9 3679; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3680; GFX11-NEXT: v_fmac_f32_e32 v12, v13, v11 3681; GFX11-NEXT: v_fma_f32 v9, -v10, v12, v9 3682; GFX11-NEXT: s_denorm_mode 12 3683; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3684; GFX11-NEXT: v_div_fmas_f32 v9, v9, v11, v12 3685; GFX11-NEXT: v_div_fixup_f32 v9, v9, v7, v3 3686; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3687; GFX11-NEXT: v_trunc_f32_e32 v9, v9 3688; GFX11-NEXT: v_fma_f32 v3, -v9, v7, v3 3689; GFX11-NEXT: v_div_scale_f32 v9, null, v6, v6, v2 3690; GFX11-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 3691; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 3692; GFX11-NEXT: v_rcp_f32_e32 v10, v9 3693; GFX11-NEXT: s_denorm_mode 15 3694; GFX11-NEXT: s_waitcnt_depctr 0xfff 3695; GFX11-NEXT: v_fma_f32 v11, -v9, v10, 1.0 3696; GFX11-NEXT: v_fmac_f32_e32 v10, v11, v10 3697; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3698; GFX11-NEXT: v_mul_f32_e32 v11, v7, v10 3699; GFX11-NEXT: v_fma_f32 v12, -v9, v11, v7 3700; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3701; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v10 3702; GFX11-NEXT: v_fma_f32 v7, -v9, v11, v7 3703; GFX11-NEXT: s_denorm_mode 12 3704; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3705; GFX11-NEXT: v_div_fmas_f32 v7, v7, v10, v11 3706; GFX11-NEXT: v_div_fixup_f32 v7, v7, v6, v2 3707; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3708; GFX11-NEXT: v_trunc_f32_e32 v7, v7 3709; GFX11-NEXT: v_fma_f32 v2, -v7, v6, v2 3710; GFX11-NEXT: v_div_scale_f32 v7, null, v5, v5, v1 3711; GFX11-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 3712; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_3) | instid1(VALU_DEP_1) 3713; GFX11-NEXT: v_rcp_f32_e32 v9, v7 3714; GFX11-NEXT: s_denorm_mode 15 3715; GFX11-NEXT: s_waitcnt_depctr 0xfff 3716; GFX11-NEXT: v_fma_f32 v10, -v7, v9, 1.0 3717; GFX11-NEXT: v_fmac_f32_e32 v9, v10, v9 3718; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3719; GFX11-NEXT: v_mul_f32_e32 v10, v6, v9 3720; GFX11-NEXT: v_fma_f32 v11, -v7, v10, v6 3721; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3722; GFX11-NEXT: v_fmac_f32_e32 v10, v11, v9 3723; GFX11-NEXT: v_fma_f32 v6, -v7, v10, v6 3724; GFX11-NEXT: s_denorm_mode 12 3725; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3726; GFX11-NEXT: v_div_fmas_f32 v6, v6, v9, v10 3727; GFX11-NEXT: v_div_fixup_f32 v6, v6, v5, v1 3728; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3729; GFX11-NEXT: v_trunc_f32_e32 v6, v6 3730; GFX11-NEXT: v_fma_f32 v1, -v6, v5, v1 3731; GFX11-NEXT: v_div_scale_f32 v6, null, v4, v4, v0 3732; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 3733; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) 3734; GFX11-NEXT: v_rcp_f32_e32 v7, v6 3735; GFX11-NEXT: s_denorm_mode 15 3736; GFX11-NEXT: s_waitcnt_depctr 0xfff 3737; GFX11-NEXT: v_fma_f32 v9, -v6, v7, 1.0 3738; GFX11-NEXT: v_fmac_f32_e32 v7, v9, v7 3739; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3740; GFX11-NEXT: v_mul_f32_e32 v9, v5, v7 3741; GFX11-NEXT: v_fma_f32 v10, -v6, v9, v5 3742; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3743; GFX11-NEXT: v_fmac_f32_e32 v9, v10, v7 3744; GFX11-NEXT: v_fma_f32 v5, -v6, v9, v5 3745; GFX11-NEXT: s_denorm_mode 12 3746; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3747; GFX11-NEXT: v_div_fmas_f32 v5, v5, v7, v9 3748; GFX11-NEXT: v_div_fixup_f32 v5, v5, v4, v0 3749; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3750; GFX11-NEXT: v_trunc_f32_e32 v5, v5 3751; GFX11-NEXT: v_fma_f32 v0, -v5, v4, v0 3752; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] 3753; GFX11-NEXT: s_endpgm 3754; 3755; GFX1150-LABEL: frem_v4f32: 3756; GFX1150: ; %bb.0: 3757; GFX1150-NEXT: s_clause 0x1 3758; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3759; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 3760; GFX1150-NEXT: v_mov_b32_e32 v8, 0 3761; GFX1150-NEXT: s_waitcnt lgkmcnt(0) 3762; GFX1150-NEXT: s_clause 0x1 3763; GFX1150-NEXT: global_load_b128 v[0:3], v8, s[2:3] 3764; GFX1150-NEXT: global_load_b128 v[4:7], v8, s[4:5] offset:64 3765; GFX1150-NEXT: s_waitcnt vmcnt(0) 3766; GFX1150-NEXT: v_div_scale_f32 v10, null, v7, v7, v3 3767; GFX1150-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 3768; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) 3769; GFX1150-NEXT: v_rcp_f32_e32 v11, v10 3770; GFX1150-NEXT: s_denorm_mode 15 3771; GFX1150-NEXT: v_fma_f32 v12, -v10, v11, 1.0 3772; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3773; GFX1150-NEXT: v_fmac_f32_e32 v11, v12, v11 3774; GFX1150-NEXT: v_mul_f32_e32 v12, v9, v11 3775; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3776; GFX1150-NEXT: v_fma_f32 v13, -v10, v12, v9 3777; GFX1150-NEXT: v_fmac_f32_e32 v12, v13, v11 3778; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 3779; GFX1150-NEXT: v_fma_f32 v9, -v10, v12, v9 3780; GFX1150-NEXT: s_denorm_mode 12 3781; GFX1150-NEXT: v_div_fmas_f32 v9, v9, v11, v12 3782; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3783; GFX1150-NEXT: v_div_fixup_f32 v9, v9, v7, v3 3784; GFX1150-NEXT: v_trunc_f32_e32 v9, v9 3785; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3786; GFX1150-NEXT: v_xor_b32_e32 v9, 
0x80000000, v9 3787; GFX1150-NEXT: v_fma_f32 v3, v9, v7, v3 3788; GFX1150-NEXT: v_div_scale_f32 v9, null, v6, v6, v2 3789; GFX1150-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 3790; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) 3791; GFX1150-NEXT: v_rcp_f32_e32 v10, v9 3792; GFX1150-NEXT: s_denorm_mode 15 3793; GFX1150-NEXT: v_fma_f32 v11, -v9, v10, 1.0 3794; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3795; GFX1150-NEXT: v_fmac_f32_e32 v10, v11, v10 3796; GFX1150-NEXT: v_mul_f32_e32 v11, v7, v10 3797; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3798; GFX1150-NEXT: v_fma_f32 v12, -v9, v11, v7 3799; GFX1150-NEXT: v_fmac_f32_e32 v11, v12, v10 3800; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 3801; GFX1150-NEXT: v_fma_f32 v7, -v9, v11, v7 3802; GFX1150-NEXT: s_denorm_mode 12 3803; GFX1150-NEXT: v_div_fmas_f32 v7, v7, v10, v11 3804; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3805; GFX1150-NEXT: v_div_fixup_f32 v7, v7, v6, v2 3806; GFX1150-NEXT: v_trunc_f32_e32 v7, v7 3807; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3808; GFX1150-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 3809; GFX1150-NEXT: v_fma_f32 v2, v7, v6, v2 3810; GFX1150-NEXT: v_div_scale_f32 v7, null, v5, v5, v1 3811; GFX1150-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 3812; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) 3813; GFX1150-NEXT: v_rcp_f32_e32 v9, v7 3814; GFX1150-NEXT: s_denorm_mode 15 3815; GFX1150-NEXT: v_fma_f32 v10, -v7, v9, 1.0 3816; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3817; GFX1150-NEXT: v_fmac_f32_e32 v9, v10, v9 3818; GFX1150-NEXT: v_mul_f32_e32 v10, v6, v9 3819; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3820; 
GFX1150-NEXT: v_fma_f32 v11, -v7, v10, v6 3821; GFX1150-NEXT: v_fmac_f32_e32 v10, v11, v9 3822; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 3823; GFX1150-NEXT: v_fma_f32 v6, -v7, v10, v6 3824; GFX1150-NEXT: s_denorm_mode 12 3825; GFX1150-NEXT: v_div_fmas_f32 v6, v6, v9, v10 3826; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3827; GFX1150-NEXT: v_div_fixup_f32 v6, v6, v5, v1 3828; GFX1150-NEXT: v_trunc_f32_e32 v6, v6 3829; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3830; GFX1150-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 3831; GFX1150-NEXT: v_fma_f32 v1, v6, v5, v1 3832; GFX1150-NEXT: v_div_scale_f32 v6, null, v4, v4, v0 3833; GFX1150-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 3834; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) 3835; GFX1150-NEXT: v_rcp_f32_e32 v7, v6 3836; GFX1150-NEXT: s_denorm_mode 15 3837; GFX1150-NEXT: v_fma_f32 v9, -v6, v7, 1.0 3838; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3839; GFX1150-NEXT: v_fmac_f32_e32 v7, v9, v7 3840; GFX1150-NEXT: v_mul_f32_e32 v9, v5, v7 3841; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3842; GFX1150-NEXT: v_fma_f32 v10, -v6, v9, v5 3843; GFX1150-NEXT: v_fmac_f32_e32 v9, v10, v7 3844; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 3845; GFX1150-NEXT: v_fma_f32 v5, -v6, v9, v5 3846; GFX1150-NEXT: s_denorm_mode 12 3847; GFX1150-NEXT: v_div_fmas_f32 v5, v5, v7, v9 3848; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3849; GFX1150-NEXT: v_div_fixup_f32 v5, v5, v4, v0 3850; GFX1150-NEXT: v_trunc_f32_e32 v5, v5 3851; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 3852; GFX1150-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 3853; GFX1150-NEXT: v_fmac_f32_e32 v0, v5, v4 
; GFX1150-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
; GFX1150-NEXT:    s_endpgm
                      ptr addrspace(1) %in2) #0 {
  %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
  %r0 = load <4 x float>, ptr addrspace(1) %in1, align 16
  %r1 = load <4 x float>, ptr addrspace(1) %gep2, align 16
  %r2 = frem <4 x float> %r0, %r1
  store <4 x float> %r2, ptr addrspace(1) %out, align 16
  ret void
}

; frem on <2 x double>. The dividend is loaded from %in1 and the divisor from
; element 4 of %in2 (the checks show that load with offset 64 or an explicit
; add of 64), and the remainder is stored to %out.
; The expected code handles the two lanes one after the other. Each lane is a
; v_div_scale / v_rcp / v_fma / v_div_fmas / v_div_fixup division, a truncation
; of the quotient, and a final v_fma with the negated quotient. The checks for
; the plain amdgcn run (the first RUN line) contain no v_trunc_f64 and show a
; scalar and vector bit manipulation sequence in its place.
define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; SI-LABEL: frem_v2f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_mov_b32 s0, s2
; SI-NEXT:    s_mov_b32 s1, s3
; SI-NEXT:    s_mov_b32 s2, s6
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3]
; SI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; SI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; SI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; SI-NEXT:    v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3]
; SI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
; SI-NEXT:    v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13]
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v9
; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v3, v13
; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
; SI-NEXT:    s_nop 1
; SI-NEXT:    v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
; SI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; SI-NEXT:    v_readfirstlane_b32 s8, v9
; SI-NEXT:    s_bfe_u32 s0, s8, 0xb0014
; SI-NEXT:    s_add_i32 s9, s0, 0xfffffc01
; SI-NEXT:    s_mov_b32 s3, 0xfffff
; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s9
; SI-NEXT:    v_not_b32_e32 v10, s0
; SI-NEXT:    v_and_b32_e32 v10, v8, v10
; SI-NEXT:    v_not_b32_e32 v11, s1
; SI-NEXT:    v_and_b32_e32 v9, v9, v11
; SI-NEXT:    s_and_b32 s0, s8, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s9, 0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v11, s0
; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
; SI-NEXT:    s_cmp_gt_i32 s9, 51
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v11, s8
; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
; SI-NEXT:    v_cndmask_b32_e32 v8, v10, v8, vcc
; SI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; SI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
; SI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; SI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; SI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; SI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1]
; SI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
; SI-NEXT:    v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11]
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v11
; SI-NEXT:    s_xor_b64 vcc, s[0:1], vcc
; SI-NEXT:    s_nop 1
; SI-NEXT:    v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
; SI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; SI-NEXT:    v_readfirstlane_b32 s8, v7
; SI-NEXT:    s_bfe_u32 s0, s8, 0xb0014
; SI-NEXT:    s_add_i32 s9, s0, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s9
; SI-NEXT:    v_not_b32_e32 v8, s0
; SI-NEXT:    v_and_b32_e32 v8, v6, v8
; SI-NEXT:    v_not_b32_e32 v9, s1
; SI-NEXT:    v_and_b32_e32 v7, v7, v9
; SI-NEXT:    s_and_b32 s0, s8, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s9, 0
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
; SI-NEXT:    v_mov_b32_e32 v9, s0
; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
; SI-NEXT:    s_cmp_gt_i32 s9, 51
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_mov_b32_e32 v9, s8
; SI-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
; SI-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
; SI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: frem_v2f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s6, s2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s0, s8
; CI-NEXT:    s_mov_b32 s1, s9
; CI-NEXT:    s_mov_b32 s8, s10
; CI-NEXT:    s_mov_b32 s9, s11
; CI-NEXT:    s_mov_b32 s10, s2
; CI-NEXT:    s_mov_b32 s11, s3
; CI-NEXT:    s_mov_b32 s7, s3
; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:64
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3]
; CI-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
; CI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; CI-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; CI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; CI-NEXT:    v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
; CI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
; CI-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
; CI-NEXT:    s_nop 1
; CI-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
; CI-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; CI-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
; CI-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; CI-NEXT:    v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1]
; CI-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
; CI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; CI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; CI-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; CI-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; CI-NEXT:    v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
; CI-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
; CI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
; CI-NEXT:    s_nop 1
; CI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
; CI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; CI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
; CI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: frem_v2f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v8, s0
; VI-NEXT:    s_add_u32 s0, s4, 64
; VI-NEXT:    v_mov_b32_e32 v9, s1
; VI-NEXT:    s_addc_u32 s1, s5, 0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3]
; VI-NEXT:    v_rcp_f64_e32 v[12:13], v[10:11]
; VI-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
; VI-NEXT:    v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0
; VI-NEXT:    v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13]
; VI-NEXT:    v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3]
; VI-NEXT:    v_mul_f64 v[16:17], v[14:15], v[12:13]
; VI-NEXT:    v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15]
; VI-NEXT:    s_nop 1
; VI-NEXT:    v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17]
; VI-NEXT:    v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3]
; VI-NEXT:    v_trunc_f64_e32 v[10:11], v[10:11]
; VI-NEXT:    v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3]
; VI-NEXT:    v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
; VI-NEXT:    v_rcp_f64_e32 v[10:11], v[6:7]
; VI-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; VI-NEXT:    v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0
; VI-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; VI-NEXT:    v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1]
; VI-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
; VI-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[14:15], v[12:13]
; VI-NEXT:    s_nop 1
; VI-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15]
; VI-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; VI-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
; VI-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: frem_v2f64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v16, s[2:3]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:64
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_div_scale_f64 v[8:9], s[2:3], v[6:7], v[6:7], v[2:3]
; GFX9-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
; GFX9-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX9-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX9-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX9-NEXT:    v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3]
; GFX9-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
; GFX9-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
; GFX9-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; GFX9-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
; GFX9-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; GFX9-NEXT:    v_div_scale_f64 v[6:7], s[2:3], v[4:5], v[4:5], v[0:1]
; GFX9-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
; GFX9-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX9-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX9-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX9-NEXT:    v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1]
; GFX9-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
; GFX9-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
; GFX9-NEXT:    s_nop 1
; GFX9-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
; GFX9-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; GFX9-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
; GFX9-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; GFX9-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: frem_v2f64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_mov_b32_e32 v16, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v16, s[2:3]
; GFX10-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7] offset:64
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_div_scale_f64 v[8:9], s2, v[6:7], v[6:7], v[2:3]
; GFX10-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
; GFX10-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX10-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX10-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX10-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
; GFX10-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
; GFX10-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
; GFX10-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
; GFX10-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; GFX10-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
; GFX10-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; GFX10-NEXT:    v_div_scale_f64 v[6:7], s2, v[4:5], v[4:5], v[0:1]
; GFX10-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
; GFX10-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX10-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX10-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX10-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
; GFX10-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
; GFX10-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
; GFX10-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
; GFX10-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; GFX10-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
; GFX10-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; GFX10-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: frem_v2f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_mov_b32_e32 v16, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b128 v[0:3], v16, s[2:3]
; GFX11-NEXT:    global_load_b128 v[4:7], v16, s[4:5] offset:64
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX11-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX11-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX11-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
; GFX11-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
; GFX11-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
; GFX11-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; GFX11-NEXT:    v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX11-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX11-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX11-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
; GFX11-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
; GFX11-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
; GFX11-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX1150-LABEL: frem_v2f64:
; GFX1150:       ; %bb.0:
; GFX1150-NEXT:    s_clause 0x1
; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX1150-NEXT:    v_mov_b32_e32 v16, 0
; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1150-NEXT:    s_clause 0x1
; GFX1150-NEXT:    global_load_b128 v[0:3], v16, s[2:3]
; GFX1150-NEXT:    global_load_b128 v[4:7], v16, s[4:5] offset:64
; GFX1150-NEXT:    s_waitcnt vmcnt(0)
; GFX1150-NEXT:    v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1150-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
; GFX1150-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX1150-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1150-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
; GFX1150-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
; GFX1150-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
; GFX1150-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; GFX1150-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1150-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; GFX1150-NEXT:    v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1]
; GFX1150-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX1150-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
; GFX1150-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
; GFX1150-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
; GFX1150-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; GFX1150-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
; GFX1150-NEXT:    s_endpgm
                      ptr addrspace(1) %in2) #0 {
  %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
  %r0 = load <2 x double>, ptr addrspace(1) %in1, align 16
  %r1 = load <2 x double>, ptr addrspace(1) %gep2, align 16
  %r2 = frem <2 x double> %r0, %r1
  store <2 x double> %r2, ptr addrspace(1) %out, align 16
  ret void
}

; Attribute set #0 is the one referenced by the kernels visible here. It marks
; them nounwind, leaves unsafe-fp-math off, and sets the f32 denormal mode to
; preserve-sign. Set #1 differs only in turning unsafe-fp-math on.
attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }