1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 2; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck --check-prefixes=GFX6 %s 3; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX8 %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX9 %s 5; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX11 %s 6; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600 %s 7 8define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 { 9; GFX6-LABEL: round_f32: 10; GFX6: ; %bb.0: 11; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 12; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 13; GFX6-NEXT: s_mov_b32 s3, 0xf000 14; GFX6-NEXT: s_mov_b32 s2, -1 15; GFX6-NEXT: s_waitcnt lgkmcnt(0) 16; GFX6-NEXT: v_trunc_f32_e32 v0, s6 17; GFX6-NEXT: v_sub_f32_e32 v1, s6, v0 18; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 19; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] 20; GFX6-NEXT: s_brev_b32 s4, -2 21; GFX6-NEXT: v_mov_b32_e32 v2, s6 22; GFX6-NEXT: v_bfi_b32 v1, s4, v1, v2 23; GFX6-NEXT: v_add_f32_e32 v0, v0, v1 24; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 25; GFX6-NEXT: s_endpgm 26; 27; GFX8-LABEL: round_f32: 28; GFX8: ; %bb.0: 29; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c 30; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 31; GFX8-NEXT: s_mov_b32 s3, 0xf000 32; GFX8-NEXT: s_mov_b32 s2, -1 33; GFX8-NEXT: s_waitcnt lgkmcnt(0) 34; GFX8-NEXT: v_trunc_f32_e32 v0, s6 35; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0 36; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 37; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] 38; GFX8-NEXT: s_brev_b32 s4, -2 39; GFX8-NEXT: v_mov_b32_e32 v2, s6 40; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 41; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 42; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 43; GFX8-NEXT: s_endpgm 44; 45; GFX9-LABEL: 
round_f32: 46; GFX9: ; %bb.0: 47; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c 48; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 49; GFX9-NEXT: s_mov_b32 s3, 0xf000 50; GFX9-NEXT: s_mov_b32 s2, -1 51; GFX9-NEXT: s_waitcnt lgkmcnt(0) 52; GFX9-NEXT: v_trunc_f32_e32 v0, s6 53; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0 54; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 55; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] 56; GFX9-NEXT: s_brev_b32 s4, -2 57; GFX9-NEXT: v_mov_b32_e32 v2, s6 58; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 59; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 60; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 61; GFX9-NEXT: s_endpgm 62; 63; GFX11-LABEL: round_f32: 64; GFX11: ; %bb.0: 65; GFX11-NEXT: s_clause 0x1 66; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 67; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 68; GFX11-NEXT: s_waitcnt lgkmcnt(0) 69; GFX11-NEXT: v_trunc_f32_e32 v0, s2 70; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 71; GFX11-NEXT: v_sub_f32_e32 v1, s2, v0 72; GFX11-NEXT: v_cmp_ge_f32_e64 s3, |v1|, 0.5 73; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 74; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s3 75; GFX11-NEXT: s_mov_b32 s3, 0x31016000 76; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s2 77; GFX11-NEXT: s_mov_b32 s2, -1 78; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 79; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 80; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 81; GFX11-NEXT: s_endpgm 82; 83; R600-LABEL: round_f32: 84; R600: ; %bb.0: 85; R600-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 86; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 87; R600-NEXT: CF_END 88; R600-NEXT: PAD 89; R600-NEXT: ALU clause starting at 4: 90; R600-NEXT: TRUNC * T0.W, KC0[2].Z, 91; R600-NEXT: ADD * T1.W, KC0[2].Z, -PV.W, 92; R600-NEXT: SETGE * T1.W, |PV.W|, 0.5, 93; R600-NEXT: BFI_INT * T1.W, literal.x, PV.W, KC0[2].Z, 94; R600-NEXT: 2147483647(nan), 0(0.000000e+00) 95; R600-NEXT: ADD T0.X, T0.W, PV.W, 96; 
R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 97; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 98 %result = call float @llvm.round.f32(float %x) #1 99 store float %result, ptr addrspace(1) %out 100 ret void 101} 102 103; The vector tests are difficult to verify by hand, since it can be hard to 104; predict how the scheduler will order the instructions. The scalar case above 105; is the readable reference; the vector checks are autogenerated by 106; utils/update_llc_test_checks.py and should be regenerated, not hand-edited. 107define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) #0 { 108; GFX6-LABEL: round_v2f32: 109; GFX6: ; %bb.0: 110; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 111; GFX6-NEXT: s_brev_b32 s8, -2 112; GFX6-NEXT: s_mov_b32 s7, 0xf000 113; GFX6-NEXT: s_mov_b32 s6, -1 114; GFX6-NEXT: s_waitcnt lgkmcnt(0) 115; GFX6-NEXT: v_trunc_f32_e32 v0, s3 116; GFX6-NEXT: v_sub_f32_e32 v1, s3, v0 117; GFX6-NEXT: s_mov_b32 s4, s0 118; GFX6-NEXT: s_mov_b32 s5, s1 119; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 120; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] 121; GFX6-NEXT: v_mov_b32_e32 v2, s3 122; GFX6-NEXT: v_bfi_b32 v1, s8, v1, v2 123; GFX6-NEXT: v_add_f32_e32 v1, v0, v1 124; GFX6-NEXT: v_trunc_f32_e32 v0, s2 125; GFX6-NEXT: v_sub_f32_e32 v2, s2, v0 126; GFX6-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5 127; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1] 128; GFX6-NEXT: v_mov_b32_e32 v3, s2 129; GFX6-NEXT: v_bfi_b32 v2, s8, v2, v3 130; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 131; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 132; GFX6-NEXT: s_endpgm 133; 134; GFX8-LABEL: round_v2f32: 135; GFX8: ; %bb.0: 136; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 137; GFX8-NEXT: s_brev_b32 s8, -2 138; GFX8-NEXT: s_mov_b32 s7, 0xf000 139; GFX8-NEXT: s_mov_b32 s6, -1 140; GFX8-NEXT: s_waitcnt lgkmcnt(0) 141; GFX8-NEXT: v_trunc_f32_e32 v0, s3 142; GFX8-NEXT: v_sub_f32_e32 v1, s3, v0 143; GFX8-NEXT: s_mov_b32 s4, s0 144; GFX8-NEXT: s_mov_b32 s5, s1 145; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 
0.5 146; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] 147; GFX8-NEXT: v_mov_b32_e32 v2, s3 148; GFX8-NEXT: v_bfi_b32 v1, s8, v1, v2 149; GFX8-NEXT: v_add_f32_e32 v1, v0, v1 150; GFX8-NEXT: v_trunc_f32_e32 v0, s2 151; GFX8-NEXT: v_sub_f32_e32 v2, s2, v0 152; GFX8-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5 153; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1] 154; GFX8-NEXT: v_mov_b32_e32 v3, s2 155; GFX8-NEXT: v_bfi_b32 v2, s8, v2, v3 156; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 157; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 158; GFX8-NEXT: s_endpgm 159; 160; GFX9-LABEL: round_v2f32: 161; GFX9: ; %bb.0: 162; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 163; GFX9-NEXT: s_brev_b32 s8, -2 164; GFX9-NEXT: s_mov_b32 s7, 0xf000 165; GFX9-NEXT: s_mov_b32 s6, -1 166; GFX9-NEXT: s_waitcnt lgkmcnt(0) 167; GFX9-NEXT: v_trunc_f32_e32 v0, s3 168; GFX9-NEXT: v_sub_f32_e32 v1, s3, v0 169; GFX9-NEXT: s_mov_b32 s4, s0 170; GFX9-NEXT: s_mov_b32 s5, s1 171; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 172; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] 173; GFX9-NEXT: v_mov_b32_e32 v2, s3 174; GFX9-NEXT: v_bfi_b32 v1, s8, v1, v2 175; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 176; GFX9-NEXT: v_trunc_f32_e32 v0, s2 177; GFX9-NEXT: v_sub_f32_e32 v2, s2, v0 178; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5 179; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1] 180; GFX9-NEXT: v_mov_b32_e32 v3, s2 181; GFX9-NEXT: v_bfi_b32 v2, s8, v2, v3 182; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 183; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 184; GFX9-NEXT: s_endpgm 185; 186; GFX11-LABEL: round_v2f32: 187; GFX11: ; %bb.0: 188; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 189; GFX11-NEXT: s_waitcnt lgkmcnt(0) 190; GFX11-NEXT: v_trunc_f32_e32 v0, s3 191; GFX11-NEXT: v_trunc_f32_e32 v2, s2 192; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 193; GFX11-NEXT: v_sub_f32_e32 v1, s3, v0 194; GFX11-NEXT: v_sub_f32_e32 v3, s2, v2 195; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 196; GFX11-NEXT: v_cmp_ge_f32_e64 s4, |v1|, 0.5 197; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s4 198; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 199; GFX11-NEXT: v_cmp_ge_f32_e64 s4, |v3|, 0.5 200; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s3 201; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 202; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s4 203; GFX11-NEXT: s_mov_b32 s3, 0x31016000 204; GFX11-NEXT: v_add_f32_e32 v1, v0, v1 205; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) 206; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, s2 207; GFX11-NEXT: s_mov_b32 s2, -1 208; GFX11-NEXT: v_add_f32_e32 v0, v2, v3 209; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 210; GFX11-NEXT: s_endpgm 211; 212; R600-LABEL: round_v2f32: 213; R600: ; %bb.0: 214; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] 215; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 216; R600-NEXT: CF_END 217; R600-NEXT: PAD 218; R600-NEXT: ALU clause starting at 4: 219; R600-NEXT: TRUNC * T0.W, KC0[3].X, 220; R600-NEXT: ADD T1.W, KC0[3].X, -PV.W, 221; R600-NEXT: TRUNC * T2.W, KC0[2].W, 222; R600-NEXT: ADD T3.W, KC0[2].W, -PS, 223; R600-NEXT: SETGE * T1.W, |PV.W|, 0.5, 224; R600-NEXT: BFI_INT T1.W, literal.x, PS, KC0[3].X, 225; R600-NEXT: SETGE * T3.W, |PV.W|, 0.5, 226; R600-NEXT: 2147483647(nan), 0(0.000000e+00) 227; R600-NEXT: ADD T0.Y, T0.W, PV.W, 228; R600-NEXT: BFI_INT * T0.W, literal.x, PS, KC0[2].W, 229; R600-NEXT: 2147483647(nan), 0(0.000000e+00) 230; R600-NEXT: ADD T0.X, T2.W, PV.W, 231; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 232; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 233 %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1 234 store <2 x float> %result, ptr addrspace(1) %out 235 ret void 236} 237 238define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #0 { 239; GFX6-LABEL: 
round_v4f32: 240; GFX6: ; %bb.0: 241; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd 242; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 243; GFX6-NEXT: s_brev_b32 s10, -2 244; GFX6-NEXT: s_mov_b32 s7, 0xf000 245; GFX6-NEXT: s_mov_b32 s6, -1 246; GFX6-NEXT: s_waitcnt lgkmcnt(0) 247; GFX6-NEXT: v_trunc_f32_e32 v0, s3 248; GFX6-NEXT: v_sub_f32_e32 v1, s3, v0 249; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 250; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] 251; GFX6-NEXT: v_mov_b32_e32 v2, s3 252; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v2 253; GFX6-NEXT: v_add_f32_e32 v3, v0, v1 254; GFX6-NEXT: v_trunc_f32_e32 v0, s2 255; GFX6-NEXT: v_sub_f32_e32 v1, s2, v0 256; GFX6-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 257; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] 258; GFX6-NEXT: v_mov_b32_e32 v2, s2 259; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v2 260; GFX6-NEXT: v_add_f32_e32 v2, v0, v1 261; GFX6-NEXT: v_trunc_f32_e32 v0, s1 262; GFX6-NEXT: v_sub_f32_e32 v1, s1, v0 263; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, 0.5 264; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[2:3] 265; GFX6-NEXT: v_mov_b32_e32 v4, s1 266; GFX6-NEXT: v_bfi_b32 v1, s10, v1, v4 267; GFX6-NEXT: v_add_f32_e32 v1, v0, v1 268; GFX6-NEXT: v_trunc_f32_e32 v0, s0 269; GFX6-NEXT: v_sub_f32_e32 v4, s0, v0 270; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, 0.5 271; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[2:3] 272; GFX6-NEXT: v_mov_b32_e32 v5, s0 273; GFX6-NEXT: v_bfi_b32 v4, s10, v4, v5 274; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 275; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 276; GFX6-NEXT: s_endpgm 277; 278; GFX8-LABEL: round_v4f32: 279; GFX8: ; %bb.0: 280; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 281; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 282; GFX8-NEXT: s_brev_b32 s10, -2 283; GFX8-NEXT: s_mov_b32 s7, 0xf000 284; GFX8-NEXT: s_mov_b32 s6, -1 285; GFX8-NEXT: s_waitcnt lgkmcnt(0) 286; GFX8-NEXT: v_trunc_f32_e32 v0, s3 287; GFX8-NEXT: v_sub_f32_e32 v1, s3, v0 288; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], 
|v1|, 0.5 289; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] 290; GFX8-NEXT: v_mov_b32_e32 v2, s3 291; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 292; GFX8-NEXT: v_add_f32_e32 v3, v0, v1 293; GFX8-NEXT: v_trunc_f32_e32 v0, s2 294; GFX8-NEXT: v_sub_f32_e32 v1, s2, v0 295; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5 296; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9] 297; GFX8-NEXT: v_mov_b32_e32 v2, s2 298; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2 299; GFX8-NEXT: v_add_f32_e32 v2, v0, v1 300; GFX8-NEXT: v_trunc_f32_e32 v0, s1 301; GFX8-NEXT: v_sub_f32_e32 v1, s1, v0 302; GFX8-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, 0.5 303; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[2:3] 304; GFX8-NEXT: v_mov_b32_e32 v4, s1 305; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v4 306; GFX8-NEXT: v_add_f32_e32 v1, v0, v1 307; GFX8-NEXT: v_trunc_f32_e32 v0, s0 308; GFX8-NEXT: v_sub_f32_e32 v4, s0, v0 309; GFX8-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, 0.5 310; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[2:3] 311; GFX8-NEXT: v_mov_b32_e32 v5, s0 312; GFX8-NEXT: v_bfi_b32 v4, s10, v4, v5 313; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 314; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 315; GFX8-NEXT: s_endpgm 316; 317; GFX9-LABEL: round_v4f32: 318; GFX9: ; %bb.0: 319; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 320; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24 321; GFX9-NEXT: s_brev_b32 s6, -2 322; GFX9-NEXT: s_mov_b32 s11, 0xf000 323; GFX9-NEXT: s_mov_b32 s10, -1 324; GFX9-NEXT: s_waitcnt lgkmcnt(0) 325; GFX9-NEXT: v_trunc_f32_e32 v0, s3 326; GFX9-NEXT: v_sub_f32_e32 v1, s3, v0 327; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 328; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] 329; GFX9-NEXT: v_mov_b32_e32 v2, s3 330; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v2 331; GFX9-NEXT: v_add_f32_e32 v3, v0, v1 332; GFX9-NEXT: v_trunc_f32_e32 v0, s2 333; GFX9-NEXT: v_sub_f32_e32 v1, s2, v0 334; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 335; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] 336; GFX9-NEXT: v_mov_b32_e32 
v2, s2 337; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v2 338; GFX9-NEXT: v_add_f32_e32 v2, v0, v1 339; GFX9-NEXT: v_trunc_f32_e32 v0, s1 340; GFX9-NEXT: v_sub_f32_e32 v1, s1, v0 341; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, 0.5 342; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[2:3] 343; GFX9-NEXT: v_mov_b32_e32 v4, s1 344; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v4 345; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 346; GFX9-NEXT: v_trunc_f32_e32 v0, s0 347; GFX9-NEXT: v_sub_f32_e32 v4, s0, v0 348; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, 0.5 349; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[2:3] 350; GFX9-NEXT: v_mov_b32_e32 v5, s0 351; GFX9-NEXT: v_bfi_b32 v4, s6, v4, v5 352; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 353; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 354; GFX9-NEXT: s_endpgm 355; 356; GFX11-LABEL: round_v4f32: 357; GFX11: ; %bb.0: 358; GFX11-NEXT: s_clause 0x1 359; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 360; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 361; GFX11-NEXT: s_mov_b32 s7, 0x31016000 362; GFX11-NEXT: s_waitcnt lgkmcnt(0) 363; GFX11-NEXT: v_trunc_f32_e32 v0, s3 364; GFX11-NEXT: v_trunc_f32_e32 v1, s2 365; GFX11-NEXT: v_trunc_f32_e32 v4, s1 366; GFX11-NEXT: v_trunc_f32_e32 v5, s0 367; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 368; GFX11-NEXT: v_dual_sub_f32 v2, s3, v0 :: v_dual_sub_f32 v3, s2, v1 369; GFX11-NEXT: v_dual_sub_f32 v6, s1, v4 :: v_dual_sub_f32 v7, s0, v5 370; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 371; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v2|, 0.5 372; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s6 373; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 374; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v3|, 0.5 375; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s3 376; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) 377; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s6 378; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v6|, 0.5 
379; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, v3, s2 380; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 381; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, s6 382; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v7|, 0.5 383; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v8 384; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 385; GFX11-NEXT: v_bfi_b32 v6, 0x7fffffff, v6, s1 386; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s6 387; GFX11-NEXT: s_mov_b32 s6, -1 388; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 389; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s0 390; GFX11-NEXT: v_dual_add_f32 v1, v4, v6 :: v_dual_add_f32 v0, v5, v7 391; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 392; GFX11-NEXT: s_endpgm 393; 394; R600-LABEL: round_v4f32: 395; R600: ; %bb.0: 396; R600-NEXT: ALU 25, @4, KC0[CB0:0-32], KC1[] 397; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1 398; R600-NEXT: CF_END 399; R600-NEXT: PAD 400; R600-NEXT: ALU clause starting at 4: 401; R600-NEXT: TRUNC * T0.W, KC0[4].X, 402; R600-NEXT: ADD T1.W, KC0[4].X, -PV.W, 403; R600-NEXT: TRUNC * T2.W, KC0[3].W, 404; R600-NEXT: TRUNC T0.Z, KC0[3].Z, 405; R600-NEXT: ADD T3.W, KC0[3].W, -PS, 406; R600-NEXT: SETGE * T1.W, |PV.W|, 0.5, 407; R600-NEXT: BFI_INT T0.Y, literal.x, PS, KC0[4].X, 408; R600-NEXT: SETGE T1.Z, |PV.W|, 0.5, 409; R600-NEXT: ADD * T1.W, KC0[3].Z, -PV.Z, 410; R600-NEXT: 2147483647(nan), 0(0.000000e+00) 411; R600-NEXT: TRUNC * T3.W, KC0[3].Y, 412; R600-NEXT: ADD T1.Y, KC0[3].Y, -PV.W, 413; R600-NEXT: SETGE T2.Z, |T1.W|, 0.5, 414; R600-NEXT: BFI_INT T1.W, literal.x, T1.Z, KC0[3].W, 415; R600-NEXT: ADD * T4.W, T0.W, T0.Y, 416; R600-NEXT: 2147483647(nan), 0(0.000000e+00) 417; R600-NEXT: ADD T4.Z, T2.W, PV.W, 418; R600-NEXT: BFI_INT T0.W, literal.x, PV.Z, KC0[3].Z, 419; R600-NEXT: SETGE * T1.W, |PV.Y|, 0.5, 420; R600-NEXT: 2147483647(nan), 0(0.000000e+00) 421; R600-NEXT: ADD T4.Y, T0.Z, 
PV.W, 422; R600-NEXT: BFI_INT * T0.W, literal.x, PS, KC0[3].Y, 423; R600-NEXT: 2147483647(nan), 0(0.000000e+00) 424; R600-NEXT: ADD T4.X, T3.W, PV.W, 425; R600-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 426; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) 427 %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1 428 store <4 x float> %result, ptr addrspace(1) %out 429 ret void 430} 431 432define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #0 { 433; GFX6-LABEL: round_v8f32: 434; GFX6: ; %bb.0: 435; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11 436; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 437; GFX6-NEXT: s_brev_b32 s6, -2 438; GFX6-NEXT: s_mov_b32 s3, 0xf000 439; GFX6-NEXT: s_mov_b32 s2, -1 440; GFX6-NEXT: s_waitcnt lgkmcnt(0) 441; GFX6-NEXT: v_trunc_f32_e32 v0, s11 442; GFX6-NEXT: v_sub_f32_e32 v1, s11, v0 443; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 444; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] 445; GFX6-NEXT: v_mov_b32_e32 v2, s11 446; GFX6-NEXT: v_bfi_b32 v1, s6, v1, v2 447; GFX6-NEXT: v_add_f32_e32 v3, v0, v1 448; GFX6-NEXT: v_trunc_f32_e32 v0, s10 449; GFX6-NEXT: v_sub_f32_e32 v1, s10, v0 450; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 451; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] 452; GFX6-NEXT: v_mov_b32_e32 v2, s10 453; GFX6-NEXT: v_bfi_b32 v1, s6, v1, v2 454; GFX6-NEXT: v_add_f32_e32 v2, v0, v1 455; GFX6-NEXT: v_trunc_f32_e32 v0, s9 456; GFX6-NEXT: v_sub_f32_e32 v1, s9, v0 457; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 458; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] 459; GFX6-NEXT: v_mov_b32_e32 v4, s9 460; GFX6-NEXT: v_bfi_b32 v1, s6, v1, v4 461; GFX6-NEXT: v_add_f32_e32 v1, v0, v1 462; GFX6-NEXT: v_trunc_f32_e32 v0, s8 463; GFX6-NEXT: v_sub_f32_e32 v4, s8, v0 464; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, 0.5 465; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[4:5] 466; GFX6-NEXT: v_mov_b32_e32 v5, s8 467; GFX6-NEXT: v_bfi_b32 v4, s6, v4, v5 468; GFX6-NEXT: v_add_f32_e32 v0, v0, v4 469; 
GFX6-NEXT: v_trunc_f32_e32 v4, s15 470; GFX6-NEXT: v_sub_f32_e32 v5, s15, v4 471; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 472; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] 473; GFX6-NEXT: v_mov_b32_e32 v6, s15 474; GFX6-NEXT: v_bfi_b32 v5, s6, v5, v6 475; GFX6-NEXT: v_add_f32_e32 v7, v4, v5 476; GFX6-NEXT: v_trunc_f32_e32 v4, s14 477; GFX6-NEXT: v_sub_f32_e32 v5, s14, v4 478; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 479; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] 480; GFX6-NEXT: v_mov_b32_e32 v6, s14 481; GFX6-NEXT: v_bfi_b32 v5, s6, v5, v6 482; GFX6-NEXT: v_add_f32_e32 v6, v4, v5 483; GFX6-NEXT: v_trunc_f32_e32 v4, s13 484; GFX6-NEXT: v_sub_f32_e32 v5, s13, v4 485; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 486; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] 487; GFX6-NEXT: v_mov_b32_e32 v8, s13 488; GFX6-NEXT: v_bfi_b32 v5, s6, v5, v8 489; GFX6-NEXT: v_add_f32_e32 v5, v4, v5 490; GFX6-NEXT: v_trunc_f32_e32 v4, s12 491; GFX6-NEXT: v_sub_f32_e32 v8, s12, v4 492; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 493; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] 494; GFX6-NEXT: v_mov_b32_e32 v9, s12 495; GFX6-NEXT: v_bfi_b32 v8, s6, v8, v9 496; GFX6-NEXT: v_add_f32_e32 v4, v4, v8 497; GFX6-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 498; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 499; GFX6-NEXT: s_endpgm 500; 501; GFX8-LABEL: round_v8f32: 502; GFX8: ; %bb.0: 503; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 504; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 505; GFX8-NEXT: s_brev_b32 s6, -2 506; GFX8-NEXT: s_mov_b32 s3, 0xf000 507; GFX8-NEXT: s_mov_b32 s2, -1 508; GFX8-NEXT: s_waitcnt lgkmcnt(0) 509; GFX8-NEXT: v_trunc_f32_e32 v0, s11 510; GFX8-NEXT: v_sub_f32_e32 v1, s11, v0 511; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 512; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] 513; GFX8-NEXT: v_mov_b32_e32 v2, s11 514; GFX8-NEXT: v_bfi_b32 v1, s6, v1, v2 515; GFX8-NEXT: v_add_f32_e32 v3, v0, v1 516; GFX8-NEXT: 
v_trunc_f32_e32 v0, s10 517; GFX8-NEXT: v_sub_f32_e32 v1, s10, v0 518; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 519; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] 520; GFX8-NEXT: v_mov_b32_e32 v2, s10 521; GFX8-NEXT: v_bfi_b32 v1, s6, v1, v2 522; GFX8-NEXT: v_add_f32_e32 v2, v0, v1 523; GFX8-NEXT: v_trunc_f32_e32 v0, s9 524; GFX8-NEXT: v_sub_f32_e32 v1, s9, v0 525; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 526; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] 527; GFX8-NEXT: v_mov_b32_e32 v4, s9 528; GFX8-NEXT: v_bfi_b32 v1, s6, v1, v4 529; GFX8-NEXT: v_add_f32_e32 v1, v0, v1 530; GFX8-NEXT: v_trunc_f32_e32 v0, s8 531; GFX8-NEXT: v_sub_f32_e32 v4, s8, v0 532; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, 0.5 533; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[4:5] 534; GFX8-NEXT: v_mov_b32_e32 v5, s8 535; GFX8-NEXT: v_bfi_b32 v4, s6, v4, v5 536; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 537; GFX8-NEXT: v_trunc_f32_e32 v4, s15 538; GFX8-NEXT: v_sub_f32_e32 v5, s15, v4 539; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 540; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] 541; GFX8-NEXT: v_mov_b32_e32 v6, s15 542; GFX8-NEXT: v_bfi_b32 v5, s6, v5, v6 543; GFX8-NEXT: v_add_f32_e32 v7, v4, v5 544; GFX8-NEXT: v_trunc_f32_e32 v4, s14 545; GFX8-NEXT: v_sub_f32_e32 v5, s14, v4 546; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 547; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] 548; GFX8-NEXT: v_mov_b32_e32 v6, s14 549; GFX8-NEXT: v_bfi_b32 v5, s6, v5, v6 550; GFX8-NEXT: v_add_f32_e32 v6, v4, v5 551; GFX8-NEXT: v_trunc_f32_e32 v4, s13 552; GFX8-NEXT: v_sub_f32_e32 v5, s13, v4 553; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 554; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] 555; GFX8-NEXT: v_mov_b32_e32 v8, s13 556; GFX8-NEXT: v_bfi_b32 v5, s6, v5, v8 557; GFX8-NEXT: v_add_f32_e32 v5, v4, v5 558; GFX8-NEXT: v_trunc_f32_e32 v4, s12 559; GFX8-NEXT: v_sub_f32_e32 v8, s12, v4 560; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 561; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] 
562; GFX8-NEXT: v_mov_b32_e32 v9, s12 563; GFX8-NEXT: v_bfi_b32 v8, s6, v8, v9 564; GFX8-NEXT: v_add_f32_e32 v4, v4, v8 565; GFX8-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 566; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 567; GFX8-NEXT: s_endpgm 568; 569; GFX9-LABEL: round_v8f32: 570; GFX9: ; %bb.0: 571; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 572; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 573; GFX9-NEXT: s_brev_b32 s6, -2 574; GFX9-NEXT: s_mov_b32 s3, 0xf000 575; GFX9-NEXT: s_mov_b32 s2, -1 576; GFX9-NEXT: s_waitcnt lgkmcnt(0) 577; GFX9-NEXT: v_trunc_f32_e32 v0, s11 578; GFX9-NEXT: v_sub_f32_e32 v1, s11, v0 579; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 580; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] 581; GFX9-NEXT: v_mov_b32_e32 v2, s11 582; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v2 583; GFX9-NEXT: v_add_f32_e32 v3, v0, v1 584; GFX9-NEXT: v_trunc_f32_e32 v0, s10 585; GFX9-NEXT: v_sub_f32_e32 v1, s10, v0 586; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 587; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] 588; GFX9-NEXT: v_mov_b32_e32 v2, s10 589; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v2 590; GFX9-NEXT: v_add_f32_e32 v2, v0, v1 591; GFX9-NEXT: v_trunc_f32_e32 v0, s9 592; GFX9-NEXT: v_sub_f32_e32 v1, s9, v0 593; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 594; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] 595; GFX9-NEXT: v_mov_b32_e32 v4, s9 596; GFX9-NEXT: v_bfi_b32 v1, s6, v1, v4 597; GFX9-NEXT: v_add_f32_e32 v1, v0, v1 598; GFX9-NEXT: v_trunc_f32_e32 v0, s8 599; GFX9-NEXT: v_sub_f32_e32 v4, s8, v0 600; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, 0.5 601; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[4:5] 602; GFX9-NEXT: v_mov_b32_e32 v5, s8 603; GFX9-NEXT: v_bfi_b32 v4, s6, v4, v5 604; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 605; GFX9-NEXT: v_trunc_f32_e32 v4, s15 606; GFX9-NEXT: v_sub_f32_e32 v5, s15, v4 607; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 608; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] 609; GFX9-NEXT: 
v_mov_b32_e32 v6, s15 610; GFX9-NEXT: v_bfi_b32 v5, s6, v5, v6 611; GFX9-NEXT: v_add_f32_e32 v7, v4, v5 612; GFX9-NEXT: v_trunc_f32_e32 v4, s14 613; GFX9-NEXT: v_sub_f32_e32 v5, s14, v4 614; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 615; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] 616; GFX9-NEXT: v_mov_b32_e32 v6, s14 617; GFX9-NEXT: v_bfi_b32 v5, s6, v5, v6 618; GFX9-NEXT: v_add_f32_e32 v6, v4, v5 619; GFX9-NEXT: v_trunc_f32_e32 v4, s13 620; GFX9-NEXT: v_sub_f32_e32 v5, s13, v4 621; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5 622; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5] 623; GFX9-NEXT: v_mov_b32_e32 v8, s13 624; GFX9-NEXT: v_bfi_b32 v5, s6, v5, v8 625; GFX9-NEXT: v_add_f32_e32 v5, v4, v5 626; GFX9-NEXT: v_trunc_f32_e32 v4, s12 627; GFX9-NEXT: v_sub_f32_e32 v8, s12, v4 628; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5 629; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5] 630; GFX9-NEXT: v_mov_b32_e32 v9, s12 631; GFX9-NEXT: v_bfi_b32 v8, s6, v8, v9 632; GFX9-NEXT: v_add_f32_e32 v4, v4, v8 633; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 634; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 635; GFX9-NEXT: s_endpgm 636; 637; GFX11-LABEL: round_v8f32: 638; GFX11: ; %bb.0: 639; GFX11-NEXT: s_clause 0x1 640; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 641; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 642; GFX11-NEXT: s_mov_b32 s3, 0x31016000 643; GFX11-NEXT: s_waitcnt lgkmcnt(0) 644; GFX11-NEXT: v_trunc_f32_e32 v0, s11 645; GFX11-NEXT: v_trunc_f32_e32 v1, s10 646; GFX11-NEXT: v_trunc_f32_e32 v4, s9 647; GFX11-NEXT: v_trunc_f32_e32 v8, s8 648; GFX11-NEXT: v_trunc_f32_e32 v5, s15 649; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 650; GFX11-NEXT: v_dual_sub_f32 v2, s11, v0 :: v_dual_sub_f32 v3, s10, v1 651; GFX11-NEXT: v_sub_f32_e32 v7, s9, v4 652; GFX11-NEXT: v_trunc_f32_e32 v9, s13 653; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 654; GFX11-NEXT: 
v_sub_f32_e32 v12, s15, v5 655; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v2|, 0.5 656; GFX11-NEXT: v_sub_f32_e32 v11, s8, v8 657; GFX11-NEXT: v_trunc_f32_e32 v6, s14 658; GFX11-NEXT: v_sub_f32_e32 v14, s13, v9 659; GFX11-NEXT: v_trunc_f32_e32 v10, s12 660; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2 661; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v3|, 0.5 662; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 663; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s11 664; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s2 665; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v7|, 0.5 666; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 667; GFX11-NEXT: v_bfi_b32 v16, 0x7fffffff, v3, s10 668; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s2 669; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v11|, 0.5 670; GFX11-NEXT: v_sub_f32_e32 v13, s14, v6 671; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 672; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v16 673; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s9 674; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) 675; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1.0, s2 676; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v12|, 0.5 677; GFX11-NEXT: v_add_f32_e32 v1, v4, v7 678; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 679; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, v11, s8 680; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, 1.0, s2 681; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v13|, 0.5 682; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 683; GFX11-NEXT: v_bfi_b32 v12, 0x7fffffff, v12, s15 684; GFX11-NEXT: v_cndmask_b32_e64 v13, 0, 1.0, s2 685; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v14|, 0.5 686; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 687; GFX11-NEXT: v_add_f32_e32 v7, v5, v12 688; GFX11-NEXT: v_bfi_b32 v13, 0x7fffffff, v13, s14 689; GFX11-NEXT: 
v_sub_f32_e32 v15, s12, v10 690; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 691; GFX11-NEXT: v_cndmask_b32_e64 v14, 0, 1.0, s2 692; GFX11-NEXT: v_add_f32_e32 v6, v6, v13 693; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 694; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v15|, 0.5 695; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v14, s13 696; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 697; GFX11-NEXT: v_cndmask_b32_e64 v15, 0, 1.0, s2 698; GFX11-NEXT: v_dual_add_f32 v5, v9, v0 :: v_dual_add_f32 v0, v8, v11 699; GFX11-NEXT: s_mov_b32 s2, -1 700; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 701; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, v15, s12 702; GFX11-NEXT: v_add_f32_e32 v4, v10, v4 703; GFX11-NEXT: s_clause 0x1 704; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16 705; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 706; GFX11-NEXT: s_endpgm 707; 708; R600-LABEL: round_v8f32: 709; R600: ; %bb.0: 710; R600-NEXT: ALU 50, @4, KC0[CB0:0-32], KC1[] 711; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0 712; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 1 713; R600-NEXT: CF_END 714; R600-NEXT: ALU clause starting at 4: 715; R600-NEXT: TRUNC * T0.W, KC0[6].X, 716; R600-NEXT: ADD T0.Z, KC0[6].X, -PV.W, 717; R600-NEXT: TRUNC * T1.W, KC0[5].X, 718; R600-NEXT: TRUNC * T2.W, KC0[4].W, 719; R600-NEXT: ADD T1.Z, KC0[4].W, -PV.W, 720; R600-NEXT: ADD T3.W, KC0[5].X, -T1.W, 721; R600-NEXT: SETGE * T4.W, |T0.Z|, 0.5, 722; R600-NEXT: BFI_INT T0.Y, literal.x, PS, KC0[6].X, 723; R600-NEXT: SETGE T0.Z, |PV.W|, 0.5, 724; R600-NEXT: SETGE T3.W, |PV.Z|, 0.5, 725; R600-NEXT: TRUNC * T4.W, KC0[5].Y, 726; R600-NEXT: 2147483647(nan), 0(0.000000e+00) 727; R600-NEXT: ADD T1.Y, KC0[5].Y, -PS, 728; R600-NEXT: BFI_INT T1.Z, literal.x, PV.W, KC0[4].W, 729; R600-NEXT: BFI_INT T3.W, literal.x, PV.Z, KC0[5].X, 730; R600-NEXT: TRUNC * 
T5.W, KC0[4].Z,
; R600-NEXT:    2147483647(nan), 0(0.000000e+00)
; R600-NEXT:    TRUNC T0.Z, KC0[4].Y,
; R600-NEXT:    TRUNC * T6.W, KC0[5].W,
; R600-NEXT:    ADD * T7.W, KC0[4].Z, -T5.W,
; R600-NEXT:    TRUNC T0.X, KC0[5].Z,
; R600-NEXT:    SETGE T2.Y, |PV.W|, 0.5,
; R600-NEXT:    ADD T2.Z, KC0[5].W, -T6.W, BS:VEC_102/SCL_221
; R600-NEXT:    ADD T7.W, KC0[4].Y, -T0.Z,
; R600-NEXT:    ADD * T3.W, T1.W, T3.W,
; R600-NEXT:    SETGE T1.X, |PV.W|, 0.5,
; R600-NEXT:    SETGE T4.Y, |PV.Z|, 0.5,
; R600-NEXT:    ADD T3.Z, T2.W, T1.Z,
; R600-NEXT:    BFI_INT T1.W, literal.x, PV.Y, KC0[4].Z,
; R600-NEXT:    ADD * T2.W, KC0[5].Z, -PV.X,
; R600-NEXT:    2147483647(nan), 0(0.000000e+00)
; R600-NEXT:    SETGE T2.X, |PS|, 0.5,
; R600-NEXT:    ADD T3.Y, T5.W, PV.W,
; R600-NEXT:    BFI_INT T1.Z, literal.x, PV.Y, KC0[5].W,
; R600-NEXT:    BFI_INT T1.W, literal.x, PV.X, KC0[4].Y,
; R600-NEXT:    ADD * T0.W, T0.W, T0.Y,
; R600-NEXT:    2147483647(nan), 0(0.000000e+00)
; R600-NEXT:    ADD T3.X, T0.Z, PV.W,
; R600-NEXT:    ADD T0.Z, T6.W, PV.Z,
; R600-NEXT:    BFI_INT T1.W, literal.x, PV.X, KC0[5].Z,
; R600-NEXT:    SETGE * T2.W, |T1.Y|, 0.5,
; R600-NEXT:    2147483647(nan), 0(0.000000e+00)
; R600-NEXT:    LSHR T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    ADD T0.Y, T0.X, PV.W,
; R600-NEXT:    BFI_INT * T1.W, literal.y, PS, KC0[5].Y,
; R600-NEXT:    2(2.802597e-45), 2147483647(nan)
; R600-NEXT:    ADD T0.X, T4.W, PV.W,
; R600-NEXT:    ADD_INT * T1.W, KC0[2].Y, literal.x,
; R600-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; R600-NEXT:    LSHR * T2.X, PV.W, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1
  store <8 x float> %result, ptr addrspace(1) %out
  ret void
}

; Scalar half case. The low 16 bits of the i32 kernel argument are reinterpreted
; as a half, rounded with llvm.round.f16, and the half result is stored to %out.
; The GFX6 and R600 checks widen the value to f32 (v_cvt_f32_f16 /
; FLT16_TO_FLT32), round there and narrow the result again; the GFX8, GFX9 and
; GFX11 checks stay in f16 throughout.
; In every check body the rounding shows up as
;   trunc(x) + copysign(|x - trunc(x)| >= 0.5 ? 1.0 : 0.0, x)
; where the copysign step is the BFI whose mask covers every bit except the sign
; (0x7fff for f16, 0x7fffffff for f32).
define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 {
; GFX6-LABEL: round_f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0xb
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, s0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT:    v_trunc_f32_e32 v1, v0
; GFX6-NEXT:    v_sub_f32_e32 v2, v0, v1
; GFX6-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, 0.5
; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[2:3]
; GFX6-NEXT:    s_brev_b32 s2, -2
; GFX6-NEXT:    v_bfi_b32 v0, s2, v2, v0
; GFX6-NEXT:    v_add_f32_e32 v0, v1, v0
; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: round_f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT:    v_mov_b32_e32 v0, 0x3c00
; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_trunc_f16_e32 v1, s6
; GFX8-NEXT:    v_sub_f16_e32 v2, s6, v1
; GFX8-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
; GFX8-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX8-NEXT:    v_mov_b32_e32 v2, s6
; GFX8-NEXT:    v_bfi_b32 v0, s4, v0, v2
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    v_add_f16_e32 v0, v1, v0
; GFX8-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: round_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_trunc_f16_e32 v1, s6
; GFX9-NEXT:    v_sub_f16_e32 v2, s6, v1
; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX9-NEXT:    v_mov_b32_e32 v2, s6
; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v2
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_add_f16_e32 v0, v1, v0
; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: round_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_trunc_f16_e32 v0, s2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_sub_f16_e32 v1, s2, v0
; GFX11-NEXT:    v_cmp_ge_f16_e64 s3, |v1|, 0.5
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x3c00, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    v_bfi_b32 v1, 0x7fff, v1, s2
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
;
; R600-LABEL: round_f16:
; R600:       ; %bb.0:
; R600-NEXT:    ALU 17, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    FLT16_TO_FLT32 * T0.W, KC0[2].Z,
; R600-NEXT:    TRUNC * T1.W, PV.W,
; R600-NEXT:    ADD * T2.W, T0.W, -PV.W,
; R600-NEXT:    SETGE * T2.W, |PV.W|, 0.5,
; R600-NEXT:    BFI_INT T0.W, literal.x, PV.W, T0.W,
; R600-NEXT:    AND_INT * T2.W, KC0[2].Y, literal.y,
; R600-NEXT:    2147483647(nan), 3(4.203895e-45)
; R600-NEXT:    ADD * T0.W, T1.W, PV.W,
; R600-NEXT:    FLT32_TO_FLT16 T0.W, PV.W,
; R600-NEXT:    LSHL * T1.W, T2.W, literal.x,
; R600-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; R600-NEXT:    LSHL T0.X, PV.W, PS,
; R600-NEXT:    LSHL * T0.W, literal.x, PS,
; R600-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; R600-NEXT:    MOV T0.Y, 0.0,
; R600-NEXT:    MOV * T0.Z, 0.0,
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %x.bits = trunc i32 %x.arg to i16
  %x.half = bitcast i16 %x.bits to half
  %rounded = call half @llvm.round.f16(half %x.half) #1
  store half %rounded, ptr addrspace(1) %out
  ret void
}

; Should be scalarized: the checks below treat the two halves of the <2 x half>
; independently. The upper element is extracted with a 16-bit right shift, each
; element goes through its own trunc / sub / compare / select / BFI / add
; sequence, and the two results are recombined into a single dword before the
; store.
; NOTE(review): this call site carries no `#1`, unlike the f16 and v8f32 call
; sites above; the declaration of llvm.round.v2f16 below still has it.
define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
; GFX6-LABEL: round_v2f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s0, s[4:5], 0xb
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_lshr_b32 s1, s0, 16
; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, s1
; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, s0
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT:    s_brev_b32 s4, -2
; GFX6-NEXT:    v_trunc_f32_e32 v3, v1
; GFX6-NEXT:    v_sub_f32_e32 v5, v1, v3
; GFX6-NEXT:    v_trunc_f32_e32 v2, v0
; GFX6-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v5|, 0.5
; GFX6-NEXT:    v_sub_f32_e32 v4, v0, v2
; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[2:3]
; GFX6-NEXT:    v_bfi_b32 v1, s4, v5, v1
; GFX6-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v4|, 0.5
; GFX6-NEXT:    v_add_f32_e32 v1, v3, v1
; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[2:3]
; GFX6-NEXT:    v_bfi_b32 v0, s4, v3, v0
; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT:    v_add_f32_e32 v0, v2, v0
; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: round_v2f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT:    v_mov_b32_e32 v0, 0x3c00
; GFX8-NEXT:    s_movk_i32 s5, 0x7fff
; GFX8-NEXT:    s_mov_b32 s3, 0xf000
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_lshr_b32 s4, s6, 16
; GFX8-NEXT:    v_trunc_f16_e32 v1, s4
; GFX8-NEXT:    v_sub_f16_e32 v2, s4, v1
; GFX8-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v0, vcc
; GFX8-NEXT:    v_mov_b32_e32 v3, s4
; GFX8-NEXT:    v_bfi_b32 v2, s5, v2, v3
; GFX8-NEXT:    v_add_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_trunc_f16_e32 v2, s6
; GFX8-NEXT:    v_sub_f16_e32 v3, s6, v2
; GFX8-NEXT:    v_cmp_ge_f16_e64 vcc, |v3|, 0.5
; GFX8-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX8-NEXT:    v_mov_b32_e32 v3, s6
; GFX8-NEXT:    v_bfi_b32 v0, s5, v0, v3
; GFX8-NEXT:    v_add_f16_e32 v0, v2, v0
; GFX8-NEXT:    s_mov_b32 s2, -1
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: round_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
; GFX9-NEXT:    s_movk_i32 s5, 0x7fff
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshr_b32 s4, s6, 16
; GFX9-NEXT:    v_trunc_f16_e32 v1, s4
; GFX9-NEXT:    v_sub_f16_e32 v2, s4, v1
; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v0, vcc
; GFX9-NEXT:    v_mov_b32_e32 v3, s4
; GFX9-NEXT:    v_bfi_b32 v2, s5, v2, v3
; GFX9-NEXT:    v_add_f16_e32 v1, v1, v2
; GFX9-NEXT:    v_trunc_f16_e32 v2, s6
; GFX9-NEXT:    v_sub_f16_e32 v3, s6, v2
; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v3|, 0.5
; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX9-NEXT:    v_mov_b32_e32 v3, s6
; GFX9-NEXT:    v_bfi_b32 v0, s5, v0, v3
; GFX9-NEXT:    v_add_f16_e32 v0, v2, v0
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: round_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_lshr_b32 s3, s2, 16
; GFX11-NEXT:    v_trunc_f16_e32 v1, s2
; GFX11-NEXT:    v_trunc_f16_e32 v0, s3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_sub_f16_e32 v3, s2, v1
; GFX11-NEXT:    v_sub_f16_e32 v2, s3, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cmp_ge_f16_e64 s4, |v3|, 0.5
; GFX11-NEXT:    v_bfi_b32 v2, 0x7fff, v2, s3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 0x3c00, s4
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    v_add_f16_e32 v0, v0, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_bfi_b32 v3, 0x7fff, v3, s2
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    v_add_f16_e32 v1, v1, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_endpgm
;
; R600-LABEL: round_v2f16:
; R600:       ; %bb.0:
; R600-NEXT:    ALU 22, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR * T0.W, KC0[2].Z, literal.x,
; R600-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; R600-NEXT:    FLT16_TO_FLT32 * T0.W, PV.W,
; R600-NEXT:    FLT16_TO_FLT32 T1.W, KC0[2].Z,
; R600-NEXT:    TRUNC * T2.W, PV.W,
; R600-NEXT:    ADD T3.W, T0.W, -PS,
; R600-NEXT:    TRUNC * T4.W, PV.W,
; R600-NEXT:    ADD T5.W, T1.W, -PS,
; R600-NEXT:    SETGE * T3.W, |PV.W|, 0.5,
; R600-NEXT:    BFI_INT T0.W, literal.x, PS, T0.W,
; R600-NEXT:    SETGE * T3.W, |PV.W|, 0.5,
; R600-NEXT:    2147483647(nan), 0(0.000000e+00)
; R600-NEXT:    BFI_INT T1.W, literal.x, PS, T1.W, BS:VEC_021/SCL_122
; R600-NEXT:    ADD * T0.W, T2.W, PV.W,
; R600-NEXT:    2147483647(nan), 0(0.000000e+00)
; R600-NEXT:    FLT32_TO_FLT16 T0.W, PS,
; R600-NEXT:    ADD * T1.W, T4.W, PV.W,
; R600-NEXT:    FLT32_TO_FLT16 T1.W, PS,
; R600-NEXT:    LSHL * T0.W, PV.W, literal.x,
; R600-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; R600-NEXT:    OR_INT T0.X, PV.W, PS,
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %in.vec = bitcast i32 %in.arg to <2 x half>
  %rounded = call <2 x half> @llvm.round.v2f16(<2 x half> %in.vec)
  store <2 x half> %rounded, ptr addrspace(1) %out
  ret void
}

; Declarations of the llvm.round overloads, float variants first, then half.
; NOTE(review): the v4f16 and v8f16 overloads are not called by any function
; visible in this part of the file.
declare float @llvm.round.f32(float) #1
declare <2 x float> @llvm.round.v2f32(<2 x float>) #1
declare <4 x float> @llvm.round.v4f32(<4 x float>) #1
declare <8 x float> @llvm.round.v8f32(<8 x float>) #1

declare half @llvm.round.f16(half) #1
declare <2 x half> @llvm.round.v2f16(<2 x half>) #1
declare <4 x half> @llvm.round.v4f16(<4 x half>) #1
declare <8 x half> @llvm.round.v8f16(<8 x half>) #1

; #0 is attached to the kernel definitions; #1 is attached to the intrinsic
; declarations and to the call sites that name it.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }