; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s

; Code generation checks for the IR `frem` instruction with GlobalISel on
; amdgcn. The bonaire run is matched by the "CI" prefix and the tonga run by
; the "VI" prefix. The expected assembly below is generated; regenerate it with
; the script named in the NOTE line instead of editing it by hand.

; frem on half with no fast-math flags. The divisor is read from %in2 offset by
; 4 half elements.
define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
; CI-LABEL: frem_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CI-NEXT:    s_load_dword s3, s[4:5], 0x2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
; CI-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, v0
; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
; CI-NEXT:    v_rcp_f32_e32 v4, v2
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
; CI-NEXT:    v_mul_f32_e32 v5, v3, v4
; CI-NEXT:    v_fma_f32 v6, -v2, v5, v3
; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
; CI-NEXT:    v_trunc_f32_e32 v2, v2
; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: frem_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    s_load_dword s3, s[4:5], 0x8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_rcp_f32_e32 v3, v2
; VI-NEXT:    v_mul_f32_e32 v4, v0, v3
; VI-NEXT:    v_mad_f32 v5, -v2, v4, v0
; VI-NEXT:    v_mac_f32_e32 v4, v5, v3
; VI-NEXT:    v_mad_f32 v0, -v2, v4, v0
; VI-NEXT:    v_mul_f32_e32 v0, v0, v3
; VI-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
; VI-NEXT:    v_add_f32_e32 v0, v0, v4
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s2
; VI-NEXT:    v_trunc_f16_e32 v0, v0
; VI-NEXT:    v_fma_f16 v2, -v0, v1, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
  %r0 = load half, ptr addrspace(1) %in1, align 4
  %r1 = load half, ptr addrspace(1) %gep2, align 4
  %r2 = frem half %r0, %r1
  store half %r2, ptr addrspace(1) %out, align 4
  ret void
}

; Same as frem_f16, but the frem carries the `fast` flag.
define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
; CI-LABEL: fast_frem_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CI-NEXT:    s_load_dword s3, s[4:5], 0x2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    v_rcp_f32_e32 v2, v1
; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
; CI-NEXT:    v_trunc_f32_e32 v2, v2
; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: fast_frem_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    s_load_dword s3, s[4:5], 0x8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_rcp_f16_e32 v0, s3
; VI-NEXT:    v_mul_f16_e32 v0, s2, v0
; VI-NEXT:    v_trunc_f16_e32 v0, v0
; VI-NEXT:    v_fma_f16 v2, -v0, s3, v1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
  %r0 = load half, ptr addrspace(1) %in1, align 4
  %r1 = load half, ptr addrspace(1) %gep2, align 4
  %r2 = frem fast half %r0, %r1
  store half %r2, ptr addrspace(1) %out, align 4
  ret void
}

; Same IR body as frem_f16 (no flag on the frem), but the function uses
; attribute group #1 instead of #0. The expected code equals fast_frem_f16's.
; NOTE(review): #1 is defined outside this view - presumably it relaxes FP
; math; confirm against the attribute definitions.
define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 {
; CI-LABEL: unsafe_frem_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CI-NEXT:    s_load_dword s3, s[4:5], 0x2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    v_rcp_f32_e32 v2, v1
; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
; CI-NEXT:    v_trunc_f32_e32 v2, v2
; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: unsafe_frem_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    s_load_dword s3, s[4:5], 0x8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_rcp_f16_e32 v0, s3
; VI-NEXT:    v_mul_f16_e32 v0, s2, v0
; VI-NEXT:    v_trunc_f16_e32 v0, v0
; VI-NEXT:    v_fma_f16 v2, -v0, s3, v1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
  %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
  %r0 = load half, ptr addrspace(1) %in1, align 4
  %r1 = load half, ptr addrspace(1) %gep2, align 4
  %r2 = frem half %r0, %r1
  store half %r2, ptr addrspace(1) %out, align 4
  ret void
}

; frem on float with no fast-math flags. The divisor is read from %in2 offset
; by 4 float elements.
define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
; CI-LABEL: frem_f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dword s6, s[2:3], 0x0
; CI-NEXT:    s_load_dword s2, s[4:5], 0x4
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s2
; CI-NEXT:    v_div_scale_f32 v1, s[2:3], v0, v0, s6
; CI-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
; CI-NEXT:    v_rcp_f32_e32 v3, v1
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
; CI-NEXT:    v_fma_f32 v3, v4, v3, v3
; CI-NEXT:    v_mul_f32_e32 v4, v2, v3
; CI-NEXT:    v_fma_f32 v5, -v1, v4, v2
; CI-NEXT:    v_fma_f32 v4, v5, v3, v4
; CI-NEXT:    v_fma_f32 v1, -v1, v4, v2
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    v_div_fixup_f32 v1, v1, v0, s6
; CI-NEXT:    v_trunc_f32_e32 v1, v1
; CI-NEXT:    v_fma_f32 v0, -v1, v0, s6
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: frem_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s6, s[2:3], 0x0
; VI-NEXT:    s_load_dword s2, s[4:5], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_div_scale_f32 v1, s[2:3], v0, v0, s6
; VI-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
; VI-NEXT:    v_rcp_f32_e32 v3, v1
; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT:    v_div_fixup_f32 v1, v1, v0, s6
; VI-NEXT:    v_trunc_f32_e32 v1, v1
; VI-NEXT:    v_fma_f32 v2, -v1, v0, s6
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
  %r0 = load float, ptr addrspace(1) %in1, align 4
  %r1 = load float, ptr addrspace(1) %gep2, align 4
  %r2 = frem float %r0, %r1
  store float %r2, ptr addrspace(1) %out, align 4
  ret void
}

; Same as frem_f32, but the frem carries the `fast` flag.
define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
; CI-LABEL: fast_frem_f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CI-NEXT:    s_load_dword s3, s[4:5], 0x4
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s2
; CI-NEXT:    v_rcp_f32_e32 v0, s3
; CI-NEXT:    v_mul_f32_e32 v0, s2, v0
; CI-NEXT:    v_trunc_f32_e32 v0, v0
; CI-NEXT:    v_fma_f32 v0, -v0, s3, v1
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: fast_frem_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_rcp_f32_e32 v0, s3
; VI-NEXT:    v_mul_f32_e32 v0, s2, v0
; VI-NEXT:    v_trunc_f32_e32 v0, v0
; VI-NEXT:    v_fma_f32 v2, -v0, s3, v1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
  %r0 = load float, ptr addrspace(1) %in1, align 4
  %r1 = load float, ptr addrspace(1) %gep2, align 4
  %r2 = frem fast float %r0, %r1
  store float %r2, ptr addrspace(1) %out, align 4
  ret void
}

; Same IR body as frem_f32 (no flag on the frem), but the function uses
; attribute group #1 instead of #0. The expected code equals fast_frem_f32's.
; NOTE(review): #1 is defined outside this view - confirm what it enables.
define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 {
; CI-LABEL: unsafe_frem_f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CI-NEXT:    s_load_dword s3, s[4:5], 0x4
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s2
; CI-NEXT:    v_rcp_f32_e32 v0, s3
; CI-NEXT:    v_mul_f32_e32 v0, s2, v0
; CI-NEXT:    v_trunc_f32_e32 v0, v0
; CI-NEXT:    v_fma_f32 v0, -v0, s3, v1
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: unsafe_frem_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_rcp_f32_e32 v0, s3
; VI-NEXT:    v_mul_f32_e32 v0, s2, v0
; VI-NEXT:    v_trunc_f32_e32 v0, v0
; VI-NEXT:    v_fma_f32 v2, -v0, s3, v1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
  %r0 = load float, ptr addrspace(1) %in1, align 4
  %r1 = load float, ptr addrspace(1) %gep2, align 4
  %r2 = frem float %r0, %r1
  store float %r2, ptr addrspace(1) %out, align 4
  ret void
}

; frem on double with no fast-math flags. Both operands are loaded directly
; from %in1 / %in2 (no getelementptr offset in this test).
define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
; CI-LABEL: frem_f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s4
; CI-NEXT:    v_mov_b32_e32 v1, s5
; CI-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[2:3]
; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; CI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; CI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
; CI-NEXT:    v_mul_f64 v[6:7], v[8:9], v[4:5]
; CI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
; CI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
; CI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
; CI-NEXT:    v_trunc_f64_e32 v[2:3], v[2:3]
; CI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: frem_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[2:3]
; VI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
; VI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
; VI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; VI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
; VI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; VI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
; VI-NEXT:    v_mul_f64 v[6:7], v[8:9], v[4:5]
; VI-NEXT:    v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
; VI-NEXT:    v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
; VI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
; VI-NEXT:    v_trunc_f64_e32 v[2:3], v[2:3]
; VI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %r0 = load double, ptr addrspace(1) %in1, align 8
  %r1 = load double, ptr addrspace(1) %in2, align 8
  %r2 = frem double %r0, %r1
  store double %r2, ptr addrspace(1) %out, align 8
  ret void
}

; Same as frem_f64, but the frem carries the `fast` flag.
define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
; CI-LABEL: fast_frem_f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_rcp_f64_e32 v[0:1], s[4:5]
; CI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
; CI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; CI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
; CI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; CI-NEXT:    v_mov_b32_e32 v2, s2
; CI-NEXT:    v_mov_b32_e32 v3, s3
; CI-NEXT:    v_mul_f64 v[4:5], s[2:3], v[0:1]
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
; CI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; CI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
; CI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: fast_frem_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_rcp_f64_e32 v[0:1], s[4:5]
; VI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
; VI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; VI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
; VI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    v_mul_f64 v[4:5], s[2:3], v[0:1]
; VI-NEXT:    v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
; VI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; VI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
; VI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %r0 = load double, ptr addrspace(1) %in1, align 8
  %r1 = load double, ptr addrspace(1) %in2, align 8
  %r2 = frem fast double %r0, %r1
  store double %r2, ptr addrspace(1) %out, align 8
  ret void
}

; Same IR body as frem_f64 (no flag on the frem), but the function uses
; attribute group #1 instead of #0. The expected code equals fast_frem_f64's.
; NOTE(review): #1 is defined outside this view - confirm what it enables.
; The signature is kept on a single line, like the other tests in this file,
; so the generated check lines do not end up in the middle of the parameter
; list.
define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #1 {
; CI-LABEL: unsafe_frem_f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_rcp_f64_e32 v[0:1], s[4:5]
; CI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
; CI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; CI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
; CI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; CI-NEXT:    v_mov_b32_e32 v2, s2
; CI-NEXT:    v_mov_b32_e32 v3, s3
; CI-NEXT:    v_mul_f64 v[4:5], s[2:3], v[0:1]
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
; CI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; CI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
; CI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: unsafe_frem_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_rcp_f64_e32 v[0:1], s[4:5]
; VI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
; VI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; VI-NEXT:    v_fma_f64 v[2:3], -s[4:5], v[0:1], 1.0
; VI-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    v_mul_f64 v[4:5], s[2:3], v[0:1]
; VI-NEXT:    v_fma_f64 v[6:7], -s[4:5], v[4:5], v[2:3]
; VI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; VI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
; VI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[4:5], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %r0 = load double, ptr addrspace(1) %in1, align 8
  %r1 = load double, ptr addrspace(1) %in2, align 8
  %r2 = frem double %r0, %r1
  store double %r2, ptr addrspace(1) %out, align 8
  ret void
}

; frem on <2 x half> with no fast-math flags. The divisor is read from %in2
; offset by 4 <2 x half> elements.
define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
; CI-LABEL: frem_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
; CI-NEXT:    s_load_dword s3, s[4:5], 0x4
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
; CI-NEXT:    s_lshr_b32 s4, s2, 16
; CI-NEXT:    s_lshr_b32 s5, s3, 16
; CI-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, v0
; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
; CI-NEXT:    v_rcp_f32_e32 v4, v2
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
; CI-NEXT:    v_mul_f32_e32 v5, v3, v4
; CI-NEXT:    v_fma_f32 v6, -v2, v5, v3
; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
; CI-NEXT:    v_trunc_f32_e32 v2, v2
; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s4
; CI-NEXT:    v_cvt_f32_f16_e32 v2, s5
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, v1
; CI-NEXT:    v_div_scale_f32 v4, vcc, v1, v2, v1
; CI-NEXT:    v_rcp_f32_e32 v5, v3
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v6, -v3, v5, 1.0
; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
; CI-NEXT:    v_mul_f32_e32 v6, v4, v5
; CI-NEXT:    v_fma_f32 v7, -v3, v6, v4
; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
; CI-NEXT:    v_fma_f32 v3, -v3, v6, v4
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
; CI-NEXT:    v_trunc_f32_e32 v3, v3
; CI-NEXT:    v_fma_f32 v1, -v3, v2, v1
; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT:    v_or_b32_e32 v0, v0, v1
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: frem_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    s_load_dword s3, s[4:5], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
; VI-NEXT:    s_lshr_b32 s5, s3, 16
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    s_lshr_b32 s4, s2, 16
; VI-NEXT:    v_rcp_f32_e32 v3, v2
; VI-NEXT:    v_mul_f32_e32 v4, v0, v3
; VI-NEXT:    v_mad_f32 v5, -v2, v4, v0
; VI-NEXT:    v_mac_f32_e32 v4, v5, v3
; VI-NEXT:    v_mad_f32 v0, -v2, v4, v0
; VI-NEXT:    v_mul_f32_e32 v0, v0, v3
; VI-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
; VI-NEXT:    v_add_f32_e32 v0, v0, v4
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    v_cvt_f32_f16_e32 v3, s5
; VI-NEXT:    v_mov_b32_e32 v2, s5
; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s2
; VI-NEXT:    v_trunc_f16_e32 v0, v0
; VI-NEXT:    v_fma_f16 v0, -v0, v1, s2
; VI-NEXT:    v_cvt_f32_f16_e32 v1, s4
; VI-NEXT:    v_rcp_f32_e32 v4, v3
; VI-NEXT:    v_mul_f32_e32 v5, v1, v4
; VI-NEXT:    v_mad_f32 v6, -v3, v5, v1
; VI-NEXT:    v_mac_f32_e32 v5, v6, v4
; VI-NEXT:    v_mad_f32 v1, -v3, v5, v1
; VI-NEXT:    v_mul_f32_e32 v1, v1, v4
; VI-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
; VI-NEXT:    v_add_f32_e32 v1, v1, v5
; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; VI-NEXT:    v_div_fixup_f16 v1, v1, v2, s4
; VI-NEXT:    v_trunc_f16_e32 v1, v1
; VI-NEXT:    v_fma_f16 v1, -v1, v2, s4
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_e32 v2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
  %r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
  %r1 = load <2 x half>, ptr addrspace(1) %gep2, align 8
  %r2 = frem <2 x half> %r0, %r1
  store <2 x half> %r2, ptr addrspace(1) %out, align 8
  ret void
}

; frem on <4 x half> with no fast-math flags. The divisor is read from %in2
; offset by 4 <4 x half> elements.
define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
; CI-LABEL: frem_v4f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s4
; CI-NEXT:    s_lshr_b32 s8, s2, 16
; CI-NEXT:    s_lshr_b32 s9, s3, 16
; CI-NEXT:    s_lshr_b32 s10, s4, 16
; CI-NEXT:    v_div_scale_f32 v2, s[6:7], v1, v1, v0
; CI-NEXT:    s_lshr_b32 s11, s5, 16
; CI-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
; CI-NEXT:    v_rcp_f32_e32 v4, v2
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
; CI-NEXT:    v_mul_f32_e32 v5, v3, v4
; CI-NEXT:    v_fma_f32 v6, -v2, v5, v3
; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
; CI-NEXT:    v_trunc_f32_e32 v2, v2
; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s8
; CI-NEXT:    v_cvt_f32_f16_e32 v2, s10
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    v_div_scale_f32 v3, s[6:7], v2, v2, v1
; CI-NEXT:    v_div_scale_f32 v4, vcc, v1, v2, v1
; CI-NEXT:    v_rcp_f32_e32 v5, v3
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v6, -v3, v5, 1.0
; CI-NEXT:    v_fma_f32 v5, v6, v5, v5
; CI-NEXT:    v_mul_f32_e32 v6, v4, v5
; CI-NEXT:    v_fma_f32 v7, -v3, v6, v4
; CI-NEXT:    v_fma_f32 v6, v7, v5, v6
; CI-NEXT:    v_fma_f32 v3, -v3, v6, v4
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
; CI-NEXT:    v_trunc_f32_e32 v3, v3
; CI-NEXT:    v_fma_f32 v1, -v3, v2, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v2, s3
; CI-NEXT:    v_cvt_f32_f16_e32 v3, s5
; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CI-NEXT:    v_div_scale_f32 v4, s[2:3], v3, v3, v2
; CI-NEXT:    v_div_scale_f32 v5, vcc, v2, v3, v2
; CI-NEXT:    v_rcp_f32_e32 v6, v4
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v7, -v4, v6, 1.0
; CI-NEXT:    v_fma_f32 v6, v7, v6, v6
; CI-NEXT:    v_mul_f32_e32 v7, v5, v6
; CI-NEXT:    v_fma_f32 v8, -v4, v7, v5
; CI-NEXT:    v_fma_f32 v7, v8, v6, v7
; CI-NEXT:    v_fma_f32 v4, -v4, v7, v5
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
; CI-NEXT:    v_div_fixup_f32 v4, v4, v3, v2
; CI-NEXT:    v_trunc_f32_e32 v4, v4
; CI-NEXT:    v_fma_f32 v2, -v4, v3, v2
; CI-NEXT:    v_cvt_f32_f16_e32 v3, s9
; CI-NEXT:    v_cvt_f32_f16_e32 v4, s11
; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; CI-NEXT:    v_div_scale_f32 v5, s[2:3], v4, v4, v3
; CI-NEXT:    v_div_scale_f32 v6, vcc, v3, v4, v3
; CI-NEXT:    v_rcp_f32_e32 v7, v5
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v8, -v5, v7, 1.0
; CI-NEXT:    v_fma_f32 v7, v8, v7, v7
; CI-NEXT:    v_mul_f32_e32 v8, v6, v7
; CI-NEXT:    v_fma_f32 v9, -v5, v8, v6
; CI-NEXT:    v_fma_f32 v8, v9, v7, v8
; CI-NEXT:    v_fma_f32 v5, -v5, v8, v6
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT:    v_or_b32_e32 v0, v0, v1
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    v_div_fixup_f32 v5, v5, v4, v3
; CI-NEXT:    v_trunc_f32_e32 v5, v5
; CI-NEXT:    v_fma_f32 v3, -v5, v4, v3
; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; CI-NEXT:    v_or_b32_e32 v1, v2, v1
; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: frem_v4f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x20
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; VI-NEXT:    v_cvt_f32_f16_e32 v2, s4
; VI-NEXT:    s_lshr_b32 s8, s4, 16
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    s_lshr_b32 s6, s2, 16
; VI-NEXT:    v_rcp_f32_e32 v3, v2
; VI-NEXT:    s_lshr_b32 s9, s5, 16
; VI-NEXT:    s_lshr_b32 s7, s3, 16
; VI-NEXT:    v_mul_f32_e32 v4, v0, v3
; VI-NEXT:    v_mad_f32 v5, -v2, v4, v0
; VI-NEXT:    v_mac_f32_e32 v4, v5, v3
; VI-NEXT:    v_mad_f32 v0, -v2, v4, v0
; VI-NEXT:    v_mul_f32_e32 v0, v0, v3
; VI-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
; VI-NEXT:    v_add_f32_e32 v0, v0, v4
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    v_cvt_f32_f16_e32 v3, s8
; VI-NEXT:    v_mov_b32_e32 v2, s8
; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s2
; VI-NEXT:    v_trunc_f16_e32 v0, v0
; VI-NEXT:    v_fma_f16 v0, -v0, v1, s2
; VI-NEXT:    v_cvt_f32_f16_e32 v1, s6
; VI-NEXT:    v_rcp_f32_e32 v4, v3
; VI-NEXT:    v_mul_f32_e32 v5, v1, v4
; VI-NEXT:    v_mad_f32 v6, -v3, v5, v1
; VI-NEXT:    v_mac_f32_e32 v5, v6, v4
; VI-NEXT:    v_mad_f32 v1, -v3, v5, v1
; VI-NEXT:    v_mul_f32_e32 v1, v1, v4
; VI-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
; VI-NEXT:    v_add_f32_e32 v1, v1, v5
; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; VI-NEXT:    v_cvt_f32_f16_e32 v4, s5
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    v_div_fixup_f16 v1, v1, v2, s6
; VI-NEXT:    v_trunc_f16_e32 v1, v1
; VI-NEXT:    v_fma_f16 v1, -v1, v2, s6
; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
; VI-NEXT:    v_rcp_f32_e32 v5, v4
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    v_mul_f32_e32 v6, v2, v5
; VI-NEXT:    v_mad_f32 v7, -v4, v6, v2
; VI-NEXT:    v_mac_f32_e32 v6, v7, v5
; VI-NEXT:    v_mad_f32 v2, -v4, v6, v2
; VI-NEXT:    v_mul_f32_e32 v2, v2, v5
; VI-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
; VI-NEXT:    v_add_f32_e32 v2, v2, v6
; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; VI-NEXT:    v_cvt_f32_f16_e32 v5, s9
; VI-NEXT:    v_mov_b32_e32 v4, s9
; VI-NEXT:    v_div_fixup_f16 v2, v2, v3, s3
; VI-NEXT:    v_trunc_f16_e32 v2, v2
; VI-NEXT:    v_fma_f16 v2, -v2, v3, s3
; VI-NEXT:    v_cvt_f32_f16_e32 v3, s7
; VI-NEXT:    v_rcp_f32_e32 v6, v5
; VI-NEXT:    v_mul_f32_e32 v7, v3, v6
; VI-NEXT:    v_mad_f32 v8, -v5, v7, v3
; VI-NEXT:    v_mac_f32_e32 v7, v8, v6
; VI-NEXT:    v_mad_f32 v3, -v5, v7, v3
; VI-NEXT:    v_mul_f32_e32 v3, v3, v6
; VI-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
; VI-NEXT:    v_add_f32_e32 v3, v3, v7
; VI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; VI-NEXT:    v_div_fixup_f16 v3, v3, v4, s7
; VI-NEXT:    v_trunc_f16_e32 v3, v3
; VI-NEXT:    v_fma_f16 v3, -v3, v4, s7
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; VI-NEXT:    v_or_b32_e32 v1, v2, v1
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
  %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16
  %r1 = load <4 x half>, ptr addrspace(1) %gep2, align 16
  %r2 = frem <4 x half> %r0, %r1
  store <4 x half> %r2, ptr addrspace(1) %out, align 16
  ret void
}

; frem on <2 x float> with no fast-math flags. The divisor is read from %in2
; offset by 4 <2 x float> elements.
define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
; CI-LABEL: frem_v2f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s4
; CI-NEXT:    v_div_scale_f32 v1, s[6:7], v0, v0, s2
; CI-NEXT:    v_div_scale_f32 v2, vcc, s2, v0, s2
; CI-NEXT:    v_rcp_f32_e32 v3, v1
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
; CI-NEXT:    v_fma_f32 v3, v4, v3, v3
; CI-NEXT:    v_mul_f32_e32 v4, v2, v3
; CI-NEXT:    v_fma_f32 v5, -v1, v4, v2
; CI-NEXT:    v_fma_f32 v4, v5, v3, v4
; CI-NEXT:    v_fma_f32 v1, -v1, v4, v2
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
; CI-NEXT:    v_div_fixup_f32 v1, v1, v0, s2
; CI-NEXT:    v_trunc_f32_e32 v1, v1
; CI-NEXT:    v_fma_f32 v0, -v1, v0, s2
; CI-NEXT:    v_mov_b32_e32 v1, s5
; CI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, s3
; CI-NEXT:    v_div_scale_f32 v3, vcc, s3, v1, s3
; CI-NEXT:    v_rcp_f32_e32 v4, v2
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
; CI-NEXT:    v_mul_f32_e32 v5, v3, v4
; CI-NEXT:    v_fma_f32 v6, -v2, v5, v3
; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, s3
; CI-NEXT:    v_trunc_f32_e32 v2, v2
; CI-NEXT:    v_fma_f32 v1, -v2, v1, s3
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT:    s_endpgm
;
; VI-LABEL: frem_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x20
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_div_scale_f32 v1, s[6:7], v0, v0, s2
; VI-NEXT:    v_div_scale_f32 v2, vcc, s2, v0, s2
; VI-NEXT:    v_rcp_f32_e32 v3, v1
; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT:    v_fma_f32 v3, v4, v3, v3
; VI-NEXT:    v_mul_f32_e32 v4, v2, v3
; VI-NEXT:    v_fma_f32 v5, -v1, v4, v2
; VI-NEXT:    v_fma_f32 v4, v5, v3, v4
; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT:    v_div_fixup_f32 v1, v1, v0, s2
; VI-NEXT:    v_trunc_f32_e32 v1, v1
; VI-NEXT:    v_fma_f32 v0, -v1, v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, s3
; VI-NEXT:    v_div_scale_f32 v3, vcc, s3, v1, s3
; VI-NEXT:    v_rcp_f32_e32 v4, v2
; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
; VI-NEXT:    v_fma_f32 v4, v5, v4, v4
; VI-NEXT:    v_mul_f32_e32 v5, v3, v4
; VI-NEXT:    v_fma_f32 v6, -v2, v5, v3
; VI-NEXT:    v_fma_f32 v5, v6, v4, v5
; VI-NEXT:    v_fma_f32 v2, -v2, v5, v3
; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
; VI-NEXT:    v_div_fixup_f32 v2, v2, v1, s3
; VI-NEXT:    v_trunc_f32_e32 v2, v2
; VI-NEXT:    v_fma_f32 v1, -v2, v1, s3
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
  %r0 = load <2 x float>, ptr addrspace(1) %in1, align 8
  %r1 = load <2 x float>, ptr addrspace(1) %gep2, align 8
  %r2 = frem <2 x float> %r0, %r1
  store <2 x float> %r2, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
; CI-LABEL: frem_v4f32:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
; CI-NEXT:    s_load_dwordx4 s[8:11], s[8:9], 0x10
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s8
; CI-NEXT:    v_div_scale_f32 v1, s[2:3], v0, v0, s4
; CI-NEXT:    v_div_scale_f32 v2, vcc, s4, v0, s4
; CI-NEXT:    v_rcp_f32_e32 v3, v1
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
; CI-NEXT:    v_fma_f32 v3, v4, v3, v3
; CI-NEXT:    v_mul_f32_e32 v4, v2, v3
; CI-NEXT:    v_fma_f32 v5, -v1, v4, v2
; CI-NEXT:    v_fma_f32 v4, v5, v3, v4
; CI-NEXT:    v_fma_f32 v1, -v1, v4, v2
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
; CI-NEXT:    v_div_fixup_f32 v1, v1, v0, s4
; CI-NEXT:    v_trunc_f32_e32 v1, v1
; CI-NEXT:    v_fma_f32 v0, -v1, v0, s4
; CI-NEXT:    v_mov_b32_e32 v1, s9
; CI-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, s5
; CI-NEXT:    v_div_scale_f32 v3, vcc, s5, v1, s5
; CI-NEXT:    v_rcp_f32_e32 v4, v2
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
; CI-NEXT:    v_fma_f32 v4, v5, v4, v4
; CI-NEXT:    v_mul_f32_e32 v5, v3, v4
; CI-NEXT:    v_fma_f32 v6, -v2, v5, v3
; CI-NEXT:    v_fma_f32 v5, v6, v4, v5
; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
; CI-NEXT:    v_div_fixup_f32 v2,
v2, v1, s5 926; CI-NEXT: v_trunc_f32_e32 v2, v2 927; CI-NEXT: v_fma_f32 v1, -v2, v1, s5 928; CI-NEXT: v_mov_b32_e32 v2, s10 929; CI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, s6 930; CI-NEXT: v_div_scale_f32 v4, vcc, s6, v2, s6 931; CI-NEXT: v_rcp_f32_e32 v5, v3 932; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 933; CI-NEXT: v_fma_f32 v6, -v3, v5, 1.0 934; CI-NEXT: v_fma_f32 v5, v6, v5, v5 935; CI-NEXT: v_mul_f32_e32 v6, v4, v5 936; CI-NEXT: v_fma_f32 v7, -v3, v6, v4 937; CI-NEXT: v_fma_f32 v6, v7, v5, v6 938; CI-NEXT: v_fma_f32 v3, -v3, v6, v4 939; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 940; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 941; CI-NEXT: v_div_fixup_f32 v3, v3, v2, s6 942; CI-NEXT: v_trunc_f32_e32 v3, v3 943; CI-NEXT: v_fma_f32 v2, -v3, v2, s6 944; CI-NEXT: v_mov_b32_e32 v3, s11 945; CI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, s7 946; CI-NEXT: v_div_scale_f32 v5, vcc, s7, v3, s7 947; CI-NEXT: v_rcp_f32_e32 v6, v4 948; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 949; CI-NEXT: v_fma_f32 v7, -v4, v6, 1.0 950; CI-NEXT: v_fma_f32 v6, v7, v6, v6 951; CI-NEXT: v_mul_f32_e32 v7, v5, v6 952; CI-NEXT: v_fma_f32 v8, -v4, v7, v5 953; CI-NEXT: v_fma_f32 v7, v8, v6, v7 954; CI-NEXT: v_fma_f32 v4, -v4, v7, v5 955; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 956; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 957; CI-NEXT: s_mov_b32 s2, -1 958; CI-NEXT: s_mov_b32 s3, 0xf000 959; CI-NEXT: v_div_fixup_f32 v4, v4, v3, s7 960; CI-NEXT: v_trunc_f32_e32 v4, v4 961; CI-NEXT: v_fma_f32 v3, -v4, v3, s7 962; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 963; CI-NEXT: s_endpgm 964; 965; VI-LABEL: frem_v4f32: 966; VI: ; %bb.0: 967; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 968; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 969; VI-NEXT: s_waitcnt lgkmcnt(0) 970; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 971; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 972; VI-NEXT: s_waitcnt lgkmcnt(0) 973; VI-NEXT: v_mov_b32_e32 v0, s8 974; 
VI-NEXT: v_div_scale_f32 v1, s[2:3], v0, v0, s4 975; VI-NEXT: v_div_scale_f32 v2, vcc, s4, v0, s4 976; VI-NEXT: v_rcp_f32_e32 v3, v1 977; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 978; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 979; VI-NEXT: v_fma_f32 v3, v4, v3, v3 980; VI-NEXT: v_mul_f32_e32 v4, v2, v3 981; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 982; VI-NEXT: v_fma_f32 v4, v5, v3, v4 983; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 984; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 985; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 986; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s4 987; VI-NEXT: v_trunc_f32_e32 v1, v1 988; VI-NEXT: v_fma_f32 v0, -v1, v0, s4 989; VI-NEXT: v_mov_b32_e32 v1, s9 990; VI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, s5 991; VI-NEXT: v_div_scale_f32 v3, vcc, s5, v1, s5 992; VI-NEXT: v_rcp_f32_e32 v4, v2 993; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 994; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0 995; VI-NEXT: v_fma_f32 v4, v5, v4, v4 996; VI-NEXT: v_mul_f32_e32 v5, v3, v4 997; VI-NEXT: v_fma_f32 v6, -v2, v5, v3 998; VI-NEXT: v_fma_f32 v5, v6, v4, v5 999; VI-NEXT: v_fma_f32 v2, -v2, v5, v3 1000; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1001; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 1002; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s5 1003; VI-NEXT: v_trunc_f32_e32 v2, v2 1004; VI-NEXT: v_fma_f32 v1, -v2, v1, s5 1005; VI-NEXT: v_mov_b32_e32 v2, s10 1006; VI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, s6 1007; VI-NEXT: v_div_scale_f32 v4, vcc, s6, v2, s6 1008; VI-NEXT: v_rcp_f32_e32 v5, v3 1009; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1010; VI-NEXT: v_fma_f32 v6, -v3, v5, 1.0 1011; VI-NEXT: v_fma_f32 v5, v6, v5, v5 1012; VI-NEXT: v_mul_f32_e32 v6, v4, v5 1013; VI-NEXT: v_fma_f32 v7, -v3, v6, v4 1014; VI-NEXT: v_fma_f32 v6, v7, v5, v6 1015; VI-NEXT: v_fma_f32 v3, -v3, v6, v4 1016; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1017; VI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 1018; VI-NEXT: v_div_fixup_f32 v3, v3, v2, s6 
1019; VI-NEXT: v_trunc_f32_e32 v3, v3 1020; VI-NEXT: v_fma_f32 v2, -v3, v2, s6 1021; VI-NEXT: v_mov_b32_e32 v3, s11 1022; VI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, s7 1023; VI-NEXT: v_div_scale_f32 v5, vcc, s7, v3, s7 1024; VI-NEXT: v_rcp_f32_e32 v6, v4 1025; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1026; VI-NEXT: v_fma_f32 v7, -v4, v6, 1.0 1027; VI-NEXT: v_fma_f32 v6, v7, v6, v6 1028; VI-NEXT: v_mul_f32_e32 v7, v5, v6 1029; VI-NEXT: v_fma_f32 v8, -v4, v7, v5 1030; VI-NEXT: v_fma_f32 v7, v8, v6, v7 1031; VI-NEXT: v_fma_f32 v4, -v4, v7, v5 1032; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1033; VI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 1034; VI-NEXT: v_div_fixup_f32 v4, v4, v3, s7 1035; VI-NEXT: v_trunc_f32_e32 v4, v4 1036; VI-NEXT: v_fma_f32 v3, -v4, v3, s7 1037; VI-NEXT: v_mov_b32_e32 v5, s1 1038; VI-NEXT: v_mov_b32_e32 v4, s0 1039; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1040; VI-NEXT: s_endpgm 1041 %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4 1042 %r0 = load <4 x float>, ptr addrspace(1) %in1, align 16 1043 %r1 = load <4 x float>, ptr addrspace(1) %gep2, align 16 1044 %r2 = frem <4 x float> %r0, %r1 1045 store <4 x float> %r2, ptr addrspace(1) %out, align 16 1046 ret void 1047} 1048 1049define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { 1050; CI-LABEL: frem_v2f64: 1051; CI: ; %bb.0: 1052; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1053; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1054; CI-NEXT: s_waitcnt lgkmcnt(0) 1055; CI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 1056; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 1057; CI-NEXT: s_waitcnt lgkmcnt(0) 1058; CI-NEXT: v_mov_b32_e32 v0, s8 1059; CI-NEXT: v_mov_b32_e32 v1, s9 1060; CI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[4:5] 1061; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[4:5], v[0:1], s[4:5] 1062; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1063; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 
1064; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] 1065; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1066; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] 1067; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] 1068; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] 1069; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] 1070; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[4:5] 1071; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] 1072; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[4:5] 1073; CI-NEXT: v_mov_b32_e32 v2, s10 1074; CI-NEXT: v_mov_b32_e32 v3, s11 1075; CI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], s[6:7] 1076; CI-NEXT: v_div_scale_f64 v[10:11], vcc, s[6:7], v[2:3], s[6:7] 1077; CI-NEXT: s_mov_b32 s2, -1 1078; CI-NEXT: s_mov_b32 s3, 0xf000 1079; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1080; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1081; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1082; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1083; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1084; CI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] 1085; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] 1086; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] 1087; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[6:7] 1088; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1089; CI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[6:7] 1090; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1091; CI-NEXT: s_endpgm 1092; 1093; VI-LABEL: frem_v2f64: 1094; VI: ; %bb.0: 1095; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1096; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 1097; VI-NEXT: s_waitcnt lgkmcnt(0) 1098; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 1099; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 1100; VI-NEXT: s_waitcnt lgkmcnt(0) 1101; VI-NEXT: v_mov_b32_e32 v0, s8 1102; VI-NEXT: v_mov_b32_e32 v1, s9 1103; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[4:5] 1104; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[4:5], v[0:1], s[4:5] 
1105; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] 1106; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1107; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] 1108; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 1109; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] 1110; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] 1111; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] 1112; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] 1113; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[4:5] 1114; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] 1115; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[4:5] 1116; VI-NEXT: v_mov_b32_e32 v2, s10 1117; VI-NEXT: v_mov_b32_e32 v3, s11 1118; VI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], s[6:7] 1119; VI-NEXT: v_div_scale_f64 v[10:11], vcc, s[6:7], v[2:3], s[6:7] 1120; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] 1121; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1122; VI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1123; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 1124; VI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] 1125; VI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] 1126; VI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] 1127; VI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] 1128; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], s[6:7] 1129; VI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] 1130; VI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[2:3], s[6:7] 1131; VI-NEXT: v_mov_b32_e32 v5, s1 1132; VI-NEXT: v_mov_b32_e32 v4, s0 1133; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1134; VI-NEXT: s_endpgm 1135 %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4 1136 %r0 = load <2 x double>, ptr addrspace(1) %in1, align 16 1137 %r1 = load <2 x double>, ptr addrspace(1) %gep2, align 16 1138 %r2 = frem <2 x double> %r0, %r1 1139 store <2 x double> %r2, ptr addrspace(1) %out, align 16 1140 ret void 1141} 1142 1143attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 1144attributes #1 
= { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 1145