; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=CI %s

; Checks the code generated for the llvm.round.f64 intrinsic and its
; <2 x double>, <4 x double> and <8 x double> forms on two subtargets:
; tahiti (check prefix SI) and hawaii (check prefix CI). In the expected
; output below, hawaii truncates with v_trunc_f64 while tahiti builds the
; truncated value with integer bit operations.
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py.

; Scalar round of a double passed directly as a kernel argument.
define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 {
; SI-LABEL: round_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s5, 0xfffff
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s7, s3, 0xb0014
; SI-NEXT:    s_addk_i32 s7, 0xfc01
; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
; SI-NEXT:    s_and_b32 s8, s3, 0x80000000
; SI-NEXT:    s_andn2_b64 s[4:5], s[2:3], s[4:5]
; SI-NEXT:    s_cmp_lt_i32 s7, 0
; SI-NEXT:    s_cselect_b32 s4, 0, s4
; SI-NEXT:    s_cselect_b32 s5, s8, s5
; SI-NEXT:    s_cmp_gt_i32 s7, 51
; SI-NEXT:    s_cselect_b32 s8, s2, s4
; SI-NEXT:    s_cselect_b32 s9, s3, s5
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_add_f64 v[0:1], s[2:3], -v[0:1]
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5
; SI-NEXT:    s_brev_b32 s2, -2
; SI-NEXT:    s_and_b64 s[10:11], s[10:11], exec
; SI-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    v_bfi_b32 v1, s2, v0, v1
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    v_add_f64 v[0:1], s[8:9], v[0:1]
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: round_f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_brev_b32 s5, -2
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[2:3]
; CI-NEXT:    s_mov_b32 s4, s0
; CI-NEXT:    v_add_f64 v[2:3], s[2:3], -v[0:1]
; CI-NEXT:    v_cmp_ge_f64_e64 s[8:9], |v[2:3]|, 0.5
; CI-NEXT:    v_mov_b32_e32 v2, s3
; CI-NEXT:    s_and_b64 s[2:3], s[8:9], exec
; CI-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
; CI-NEXT:    v_mov_b32_e32 v3, s0
; CI-NEXT:    v_bfi_b32 v3, s5, v3, v2
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
; CI-NEXT:    s_mov_b32 s5, s1
; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; CI-NEXT:    s_endpgm
  %result = call double @llvm.round.f64(double %x) #1
  store double %result, ptr addrspace(1) %out
  ret void
}

; Round of a double loaded from %in at an index given by the workitem id,
; with the result stored to the matching element of %out.
define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: v_round_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_movk_i32 s4, 0xfc01
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s3, 0xfffff
; SI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfe_u32 v4, v3, 20, 11
; SI-NEXT:    v_add_i32_e32 v6, vcc, s4, v4
; SI-NEXT:    v_lshr_b64 v[4:5], s[2:3], v6
; SI-NEXT:    v_and_b32_e32 v7, 0x80000000, v3
; SI-NEXT:    v_not_b32_e32 v5, v5
; SI-NEXT:    v_not_b32_e32 v4, v4
; SI-NEXT:    v_and_b32_e32 v5, v3, v5
; SI-NEXT:    v_and_b32_e32 v4, v2, v4
; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v6
; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 51, v6
; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
; SI-NEXT:    v_add_f64 v[6:7], v[2:3], -v[4:5]
; SI-NEXT:    s_brev_b32 s2, -2
; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
; SI-NEXT:    v_bfi_b32 v3, s2, v2, v3
; SI-NEXT:    v_mov_b32_e32 v2, v1
; SI-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; CI-LABEL: v_round_f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
; CI-NEXT:    s_brev_b32 s2, -2
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[2:3]
; CI-NEXT:    v_add_f64 v[6:7], v[2:3], -v[4:5]
; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; CI-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
; CI-NEXT:    v_bfi_b32 v3, s2, v2, v3
; CI-NEXT:    v_mov_b32_e32 v2, v1
; CI-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid
  %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
  %x = load double, ptr addrspace(1) %gep
  %result = call double @llvm.round.f64(double %x) #1
  store double %result, ptr addrspace(1) %out.gep
  ret void
}

; Round of a <2 x double> kernel argument.
define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) #0 {
; SI-LABEL: round_v2f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s7, 0xfffff
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
; SI-NEXT:    s_add_i32 s12, s0, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[0:1], s[6:7], s12
; SI-NEXT:    s_and_b32 s3, s11, 0x80000000
; SI-NEXT:    s_andn2_b64 s[0:1], s[10:11], s[0:1]
; SI-NEXT:    s_cmp_lt_i32 s12, 0
; SI-NEXT:    s_cselect_b32 s0, 0, s0
; SI-NEXT:    s_cselect_b32 s1, s3, s1
; SI-NEXT:    s_cmp_gt_i32 s12, 51
; SI-NEXT:    s_cselect_b32 s12, s10, s0
; SI-NEXT:    s_cselect_b32 s13, s11, s1
; SI-NEXT:    v_mov_b32_e32 v0, s12
; SI-NEXT:    v_mov_b32_e32 v1, s13
; SI-NEXT:    v_add_f64 v[0:1], s[10:11], -v[0:1]
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    v_cmp_ge_f64_e64 s[14:15], |v[0:1]|, 0.5
; SI-NEXT:    s_brev_b32 s10, -2
; SI-NEXT:    s_and_b64 s[4:5], s[14:15], exec
; SI-NEXT:    s_cselect_b32 s3, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    s_bfe_u32 s3, s9, 0xb0014
; SI-NEXT:    s_addk_i32 s3, 0xfc01
; SI-NEXT:    s_lshr_b64 s[4:5], s[6:7], s3
; SI-NEXT:    s_andn2_b64 s[4:5], s[8:9], s[4:5]
; SI-NEXT:    s_and_b32 s6, s9, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s3, 0
; SI-NEXT:    s_cselect_b32 s4, 0, s4
; SI-NEXT:    s_cselect_b32 s5, s6, s5
; SI-NEXT:    s_cmp_gt_i32 s3, 51
; SI-NEXT:    s_cselect_b32 s4, s8, s4
; SI-NEXT:    s_cselect_b32 s5, s9, s5
; SI-NEXT:    v_mov_b32_e32 v2, s4
; SI-NEXT:    v_mov_b32_e32 v3, s5
; SI-NEXT:    v_add_f64 v[2:3], s[8:9], -v[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, s11
; SI-NEXT:    v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5
; SI-NEXT:    v_bfi_b32 v1, s10, v0, v1
; SI-NEXT:    s_and_b64 s[6:7], s[6:7], exec
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_cselect_b32 s3, 0x3ff00000, 0
; SI-NEXT:    v_add_f64 v[2:3], s[12:13], v[0:1]
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    v_mov_b32_e32 v4, s9
; SI-NEXT:    v_bfi_b32 v1, s10, v1, v4
; SI-NEXT:    v_add_f64 v[0:1], s[4:5], v[0:1]
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: round_v2f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_brev_b32 s2, -2
; CI-NEXT:    v_mov_b32_e32 v0, 0
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_trunc_f64_e32 v[2:3], s[10:11]
; CI-NEXT:    v_trunc_f64_e32 v[6:7], s[8:9]
; CI-NEXT:    v_add_f64 v[4:5], s[10:11], -v[2:3]
; CI-NEXT:    v_mov_b32_e32 v1, s11
; CI-NEXT:    v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5
; CI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[6:7]
; CI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
; CI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
; CI-NEXT:    v_mov_b32_e32 v8, s4
; CI-NEXT:    v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5
; CI-NEXT:    v_bfi_b32 v1, s2, v8, v1
; CI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
; CI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
; CI-NEXT:    v_add_f64 v[2:3], v[2:3], v[0:1]
; CI-NEXT:    v_mov_b32_e32 v1, s4
; CI-NEXT:    v_mov_b32_e32 v4, s9
; CI-NEXT:    v_bfi_b32 v1, s2, v1, v4
; CI-NEXT:    v_add_f64 v[0:1], v[6:7], v[0:1]
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    s_endpgm
  %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1
  store <2 x double> %result, ptr addrspace(1) %out
  ret void
}

; Round of a <4 x double> kernel argument.
define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) #0 {
; SI-LABEL: round_v4f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x11
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s7, 0xfffff
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    v_mov_b32_e32 v4, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
; SI-NEXT:    s_add_i32 s16, s0, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[0:1], s[6:7], s16
; SI-NEXT:    s_and_b32 s3, s11, 0x80000000
; SI-NEXT:    s_andn2_b64 s[0:1], s[10:11], s[0:1]
; SI-NEXT:    s_cmp_lt_i32 s16, 0
; SI-NEXT:    s_cselect_b32 s0, 0, s0
; SI-NEXT:    s_cselect_b32 s1, s3, s1
; SI-NEXT:    s_cmp_gt_i32 s16, 51
; SI-NEXT:    s_cselect_b32 s16, s10, s0
; SI-NEXT:    s_cselect_b32 s17, s11, s1
; SI-NEXT:    v_mov_b32_e32 v0, s16
; SI-NEXT:    v_mov_b32_e32 v1, s17
; SI-NEXT:    v_add_f64 v[0:1], s[10:11], -v[0:1]
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    v_cmp_ge_f64_e64 s[18:19], |v[0:1]|, 0.5
; SI-NEXT:    v_mov_b32_e32 v1, s11
; SI-NEXT:    s_and_b64 s[4:5], s[18:19], exec
; SI-NEXT:    s_cselect_b32 s3, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    s_bfe_u32 s3, s9, 0xb0014
; SI-NEXT:    s_addk_i32 s3, 0xfc01
; SI-NEXT:    s_lshr_b64 s[4:5], s[6:7], s3
; SI-NEXT:    s_andn2_b64 s[4:5], s[8:9], s[4:5]
; SI-NEXT:    s_and_b32 s10, s9, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s3, 0
; SI-NEXT:    s_cselect_b32 s4, 0, s4
; SI-NEXT:    s_cselect_b32 s5, s10, s5
; SI-NEXT:    s_cmp_gt_i32 s3, 51
; SI-NEXT:    s_brev_b32 s18, -2
; SI-NEXT:    s_cselect_b32 s4, s8, s4
; SI-NEXT:    v_bfi_b32 v5, s18, v0, v1
; SI-NEXT:    s_cselect_b32 s5, s9, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_add_f64 v[0:1], s[8:9], -v[0:1]
; SI-NEXT:    v_add_f64 v[2:3], s[16:17], v[4:5]
; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5
; SI-NEXT:    v_mov_b32_e32 v6, s9
; SI-NEXT:    s_and_b64 s[10:11], s[10:11], exec
; SI-NEXT:    s_cselect_b32 s3, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v5, s3
; SI-NEXT:    s_bfe_u32 s3, s15, 0xb0014
; SI-NEXT:    s_addk_i32 s3, 0xfc01
; SI-NEXT:    s_lshr_b64 s[8:9], s[6:7], s3
; SI-NEXT:    s_andn2_b64 s[8:9], s[14:15], s[8:9]
; SI-NEXT:    s_and_b32 s10, s15, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s3, 0
; SI-NEXT:    s_cselect_b32 s8, 0, s8
; SI-NEXT:    s_cselect_b32 s9, s10, s9
; SI-NEXT:    s_cmp_gt_i32 s3, 51
; SI-NEXT:    s_cselect_b32 s8, s14, s8
; SI-NEXT:    s_cselect_b32 s9, s15, s9
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_add_f64 v[0:1], s[14:15], -v[0:1]
; SI-NEXT:    v_bfi_b32 v5, s18, v5, v6
; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5
; SI-NEXT:    v_add_f64 v[0:1], s[4:5], v[4:5]
; SI-NEXT:    s_and_b64 s[4:5], s[10:11], exec
; SI-NEXT:    s_cselect_b32 s3, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v8, s3
; SI-NEXT:    s_bfe_u32 s3, s13, 0xb0014
; SI-NEXT:    s_addk_i32 s3, 0xfc01
; SI-NEXT:    s_lshr_b64 s[4:5], s[6:7], s3
; SI-NEXT:    s_andn2_b64 s[4:5], s[12:13], s[4:5]
; SI-NEXT:    s_and_b32 s6, s13, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s3, 0
; SI-NEXT:    s_cselect_b32 s4, 0, s4
; SI-NEXT:    s_cselect_b32 s5, s6, s5
; SI-NEXT:    s_cmp_gt_i32 s3, 51
; SI-NEXT:    s_cselect_b32 s5, s13, s5
; SI-NEXT:    s_cselect_b32 s4, s12, s4
; SI-NEXT:    v_mov_b32_e32 v6, s5
; SI-NEXT:    v_mov_b32_e32 v5, s4
; SI-NEXT:    v_add_f64 v[6:7], s[12:13], -v[5:6]
; SI-NEXT:    v_mov_b32_e32 v9, s15
; SI-NEXT:    v_cmp_ge_f64_e64 s[6:7], |v[6:7]|, 0.5
; SI-NEXT:    v_bfi_b32 v5, s18, v8, v9
; SI-NEXT:    s_and_b64 s[6:7], s[6:7], exec
; SI-NEXT:    s_cselect_b32 s3, 0x3ff00000, 0
; SI-NEXT:    v_add_f64 v[6:7], s[8:9], v[4:5]
; SI-NEXT:    v_mov_b32_e32 v5, s3
; SI-NEXT:    v_mov_b32_e32 v8, s13
; SI-NEXT:    v_bfi_b32 v5, s18, v5, v8
; SI-NEXT:    v_add_f64 v[4:5], s[4:5], v[4:5]
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: round_v4f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x11
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    s_brev_b32 s2, -2
; CI-NEXT:    v_mov_b32_e32 v4, 0
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[10:11]
; CI-NEXT:    v_trunc_f64_e32 v[6:7], s[8:9]
; CI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
; CI-NEXT:    v_mov_b32_e32 v5, s11
; CI-NEXT:    v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5
; CI-NEXT:    v_add_f64 v[2:3], s[8:9], -v[6:7]
; CI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
; CI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
; CI-NEXT:    v_mov_b32_e32 v8, s4
; CI-NEXT:    v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5
; CI-NEXT:    v_bfi_b32 v5, s2, v8, v5
; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[14:15]
; CI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
; CI-NEXT:    v_add_f64 v[2:3], v[0:1], v[4:5]
; CI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
; CI-NEXT:    v_add_f64 v[0:1], s[14:15], -v[8:9]
; CI-NEXT:    v_mov_b32_e32 v5, s4
; CI-NEXT:    v_mov_b32_e32 v10, s9
; CI-NEXT:    v_bfi_b32 v5, s2, v5, v10
; CI-NEXT:    v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5
; CI-NEXT:    v_trunc_f64_e32 v[10:11], s[12:13]
; CI-NEXT:    v_add_f64 v[0:1], v[6:7], v[4:5]
; CI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
; CI-NEXT:    v_add_f64 v[6:7], s[12:13], -v[10:11]
; CI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
; CI-NEXT:    v_mov_b32_e32 v5, s4
; CI-NEXT:    v_cmp_ge_f64_e64 s[4:5], |v[6:7]|, 0.5
; CI-NEXT:    v_mov_b32_e32 v12, s15
; CI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
; CI-NEXT:    v_bfi_b32 v5, s2, v5, v12
; CI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
; CI-NEXT:    v_add_f64 v[6:7], v[8:9], v[4:5]
; CI-NEXT:    v_mov_b32_e32 v5, s4
; CI-NEXT:    v_mov_b32_e32 v8, s13
; CI-NEXT:    v_bfi_b32 v5, s2, v5, v8
; CI-NEXT:    v_add_f64 v[4:5], v[10:11], v[4:5]
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    s_endpgm
  %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
  store <4 x double> %result, ptr addrspace(1) %out
  ret void
}

; Round of an <8 x double> kernel argument.
define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) #0 {
; SI-LABEL: round_v8f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x19
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s7, 0xfffff
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    v_mov_b32_e32 v8, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
; SI-NEXT:    s_add_i32 s24, s0, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[0:1], s[6:7], s24
; SI-NEXT:    s_and_b32 s3, s11, 0x80000000
; SI-NEXT:    s_andn2_b64 s[0:1], s[10:11], s[0:1]
; SI-NEXT:    s_cmp_lt_i32 s24, 0
; SI-NEXT:    s_cselect_b32 s0, 0, s0
; SI-NEXT:    s_cselect_b32 s1, s3, s1
; SI-NEXT:    s_cmp_gt_i32 s24, 51
; SI-NEXT:    s_cselect_b32 s24, s10, s0
; SI-NEXT:    s_cselect_b32 s25, s11, s1
; SI-NEXT:    v_mov_b32_e32 v0, s24
; SI-NEXT:    v_mov_b32_e32 v1, s25
; SI-NEXT:    v_add_f64 v[0:1], s[10:11], -v[0:1]
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    v_cmp_ge_f64_e64 s[26:27], |v[0:1]|, 0.5
; SI-NEXT:    v_mov_b32_e32 v1, s11
; SI-NEXT:    s_and_b64 s[4:5], s[26:27], exec
; SI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    s_bfe_u32 s4, s9, 0xb0014
; SI-NEXT:    s_add_i32 s10, s4, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[4:5], s[6:7], s10
; SI-NEXT:    s_andn2_b64 s[4:5], s[8:9], s[4:5]
; SI-NEXT:    s_and_b32 s11, s9, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s10, 0
; SI-NEXT:    s_cselect_b32 s4, 0, s4
; SI-NEXT:    s_cselect_b32 s5, s11, s5
; SI-NEXT:    s_cmp_gt_i32 s10, 51
; SI-NEXT:    s_brev_b32 s3, -2
; SI-NEXT:    s_cselect_b32 s4, s8, s4
; SI-NEXT:    v_bfi_b32 v9, s3, v0, v1
; SI-NEXT:    s_cselect_b32 s5, s9, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_add_f64 v[0:1], s[8:9], -v[0:1]
; SI-NEXT:    v_mov_b32_e32 v5, s9
; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5
; SI-NEXT:    v_add_f64 v[2:3], s[24:25], v[8:9]
; SI-NEXT:    s_and_b64 s[10:11], s[10:11], exec
; SI-NEXT:    s_cselect_b32 s8, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v4, s8
; SI-NEXT:    s_bfe_u32 s8, s15, 0xb0014
; SI-NEXT:    s_add_i32 s10, s8, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[8:9], s[6:7], s10
; SI-NEXT:    s_andn2_b64 s[8:9], s[14:15], s[8:9]
; SI-NEXT:    s_and_b32 s11, s15, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s10, 0
; SI-NEXT:    s_cselect_b32 s8, 0, s8
; SI-NEXT:    s_cselect_b32 s9, s11, s9
; SI-NEXT:    s_cmp_gt_i32 s10, 51
; SI-NEXT:    s_cselect_b32 s8, s14, s8
; SI-NEXT:    s_cselect_b32 s9, s15, s9
; SI-NEXT:    v_mov_b32_e32 v0, s8
; SI-NEXT:    v_mov_b32_e32 v1, s9
; SI-NEXT:    v_add_f64 v[0:1], s[14:15], -v[0:1]
; SI-NEXT:    v_bfi_b32 v9, s3, v4, v5
; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5
; SI-NEXT:    v_add_f64 v[0:1], s[4:5], v[8:9]
; SI-NEXT:    s_and_b64 s[4:5], s[10:11], exec
; SI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v6, s4
; SI-NEXT:    s_bfe_u32 s4, s13, 0xb0014
; SI-NEXT:    s_add_i32 s10, s4, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[4:5], s[6:7], s10
; SI-NEXT:    s_andn2_b64 s[4:5], s[12:13], s[4:5]
; SI-NEXT:    s_and_b32 s11, s13, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s10, 0
; SI-NEXT:    s_cselect_b32 s4, 0, s4
; SI-NEXT:    s_cselect_b32 s5, s11, s5
; SI-NEXT:    s_cmp_gt_i32 s10, 51
; SI-NEXT:    s_cselect_b32 s4, s12, s4
; SI-NEXT:    s_cselect_b32 s5, s13, s5
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_mov_b32_e32 v5, s5
; SI-NEXT:    v_add_f64 v[4:5], s[12:13], -v[4:5]
; SI-NEXT:    v_mov_b32_e32 v7, s15
; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5
; SI-NEXT:    v_bfi_b32 v9, s3, v6, v7
; SI-NEXT:    v_add_f64 v[6:7], s[8:9], v[8:9]
; SI-NEXT:    s_and_b64 s[8:9], s[10:11], exec
; SI-NEXT:    s_cselect_b32 s8, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v9, s8
; SI-NEXT:    s_bfe_u32 s8, s19, 0xb0014
; SI-NEXT:    s_add_i32 s10, s8, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[8:9], s[6:7], s10
; SI-NEXT:    s_andn2_b64 s[8:9], s[18:19], s[8:9]
; SI-NEXT:    s_and_b32 s11, s19, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s10, 0
; SI-NEXT:    s_cselect_b32 s8, 0, s8
; SI-NEXT:    s_cselect_b32 s9, s11, s9
; SI-NEXT:    s_cmp_gt_i32 s10, 51
; SI-NEXT:    s_cselect_b32 s8, s18, s8
; SI-NEXT:    s_cselect_b32 s9, s19, s9
; SI-NEXT:    v_mov_b32_e32 v4, s8
; SI-NEXT:    v_mov_b32_e32 v5, s9
; SI-NEXT:    v_add_f64 v[4:5], s[18:19], -v[4:5]
; SI-NEXT:    v_mov_b32_e32 v10, s13
; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5
; SI-NEXT:    v_bfi_b32 v9, s3, v9, v10
; SI-NEXT:    v_add_f64 v[4:5], s[4:5], v[8:9]
; SI-NEXT:    s_and_b64 s[4:5], s[10:11], exec
; SI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v12, s4
; SI-NEXT:    s_bfe_u32 s4, s17, 0xb0014
; SI-NEXT:    s_add_i32 s10, s4, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[4:5], s[6:7], s10
; SI-NEXT:    s_andn2_b64 s[4:5], s[16:17], s[4:5]
; SI-NEXT:    s_and_b32 s11, s17, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s10, 0
; SI-NEXT:    s_cselect_b32 s4, 0, s4
; SI-NEXT:    s_cselect_b32 s5, s11, s5
; SI-NEXT:    s_cmp_gt_i32 s10, 51
; SI-NEXT:    s_cselect_b32 s5, s17, s5
; SI-NEXT:    s_cselect_b32 s4, s16, s4
; SI-NEXT:    v_mov_b32_e32 v10, s5
; SI-NEXT:    v_mov_b32_e32 v9, s4
; SI-NEXT:    v_add_f64 v[10:11], s[16:17], -v[9:10]
; SI-NEXT:    v_mov_b32_e32 v13, s19
; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5
; SI-NEXT:    v_bfi_b32 v9, s3, v12, v13
; SI-NEXT:    v_add_f64 v[12:13], s[8:9], v[8:9]
; SI-NEXT:    s_and_b64 s[8:9], s[10:11], exec
; SI-NEXT:    s_cselect_b32 s8, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v14, s8
; SI-NEXT:    s_bfe_u32 s8, s23, 0xb0014
; SI-NEXT:    s_add_i32 s10, s8, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[8:9], s[6:7], s10
; SI-NEXT:    s_andn2_b64 s[8:9], s[22:23], s[8:9]
; SI-NEXT:    s_and_b32 s11, s23, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s10, 0
; SI-NEXT:    s_cselect_b32 s8, 0, s8
; SI-NEXT:    s_cselect_b32 s9, s11, s9
; SI-NEXT:    s_cmp_gt_i32 s10, 51
; SI-NEXT:    s_cselect_b32 s9, s23, s9
; SI-NEXT:    s_cselect_b32 s8, s22, s8
; SI-NEXT:    v_mov_b32_e32 v10, s9
; SI-NEXT:    v_mov_b32_e32 v9, s8
; SI-NEXT:    v_add_f64 v[10:11], s[22:23], -v[9:10]
; SI-NEXT:    v_mov_b32_e32 v15, s17
; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5
; SI-NEXT:    v_bfi_b32 v9, s3, v14, v15
; SI-NEXT:    v_add_f64 v[10:11], s[4:5], v[8:9]
; SI-NEXT:    s_and_b64 s[4:5], s[10:11], exec
; SI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v9, s4
; SI-NEXT:    s_bfe_u32 s4, s21, 0xb0014
; SI-NEXT:    s_add_i32 s10, s4, 0xfffffc01
; SI-NEXT:    s_lshr_b64 s[4:5], s[6:7], s10
; SI-NEXT:    s_andn2_b64 s[4:5], s[20:21], s[4:5]
; SI-NEXT:    s_and_b32 s6, s21, 0x80000000
; SI-NEXT:    s_cmp_lt_i32 s10, 0
; SI-NEXT:    s_cselect_b32 s4, 0, s4
; SI-NEXT:    s_cselect_b32 s5, s6, s5
; SI-NEXT:    s_cmp_gt_i32 s10, 51
; SI-NEXT:    s_cselect_b32 s5, s21, s5
; SI-NEXT:    s_cselect_b32 s4, s20, s4
; SI-NEXT:    v_mov_b32_e32 v15, s5
; SI-NEXT:    v_mov_b32_e32 v14, s4
; SI-NEXT:    v_add_f64 v[14:15], s[20:21], -v[14:15]
; SI-NEXT:    v_mov_b32_e32 v16, s23
; SI-NEXT:    v_cmp_ge_f64_e64 s[6:7], |v[14:15]|, 0.5
; SI-NEXT:    v_bfi_b32 v9, s3, v9, v16
; SI-NEXT:    s_and_b64 s[6:7], s[6:7], exec
; SI-NEXT:    s_cselect_b32 s6, 0x3ff00000, 0
; SI-NEXT:    v_add_f64 v[16:17], s[8:9], v[8:9]
; SI-NEXT:    v_mov_b32_e32 v9, s6
; SI-NEXT:    v_mov_b32_e32 v14, s21
; SI-NEXT:    v_bfi_b32 v9, s3, v9, v14
; SI-NEXT:    v_add_f64 v[14:15], s[4:5], v[8:9]
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48
; SI-NEXT:    buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; CI-LABEL: round_v8f64:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x19
; CI-NEXT:    s_brev_b32 s6, -2
; CI-NEXT:    v_mov_b32_e32 v4, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[10:11]
; CI-NEXT:    v_trunc_f64_e32 v[6:7], s[8:9]
; CI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
; CI-NEXT:    v_add_f64 v[8:9], s[8:9], -v[6:7]
; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5
; CI-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[8:9]|, 0.5
; CI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; CI-NEXT:    s_cselect_b32 s7, 0x3ff00000, 0
; CI-NEXT:    v_mov_b32_e32 v5, s11
; CI-NEXT:    s_and_b64 s[0:1], s[2:3], exec
; CI-NEXT:    v_mov_b32_e32 v2, s7
; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[14:15]
; CI-NEXT:    v_bfi_b32 v5, s6, v2, v5
; CI-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
; CI-NEXT:    v_add_f64 v[2:3], v[0:1], v[4:5]
; CI-NEXT:    v_mov_b32_e32 v5, s0
; CI-NEXT:    v_mov_b32_e32 v10, s9
; CI-NEXT:    v_add_f64 v[0:1], s[14:15], -v[8:9]
; CI-NEXT:    v_bfi_b32 v5, s6, v5, v10
; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5
; CI-NEXT:    v_add_f64 v[0:1], v[6:7], v[4:5]
; CI-NEXT:    v_trunc_f64_e32 v[6:7], s[12:13]
; CI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; CI-NEXT:    v_add_f64 v[10:11], s[12:13], -v[6:7]
; CI-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
; CI-NEXT:    v_mov_b32_e32 v5, s0
; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5
; CI-NEXT:    v_trunc_f64_e32 v[10:11], s[18:19]
; CI-NEXT:    v_mov_b32_e32 v12, s15
; CI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; CI-NEXT:    v_bfi_b32 v5, s6, v5, v12
; CI-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
; CI-NEXT:    v_add_f64 v[12:13], s[18:19], -v[10:11]
; CI-NEXT:    v_add_f64 v[8:9], v[8:9], v[4:5]
; CI-NEXT:    v_mov_b32_e32 v5, s0
; CI-NEXT:    v_mov_b32_e32 v14, s13
; CI-NEXT:    v_bfi_b32 v5, s6, v5, v14
; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5
; CI-NEXT:    v_trunc_f64_e32 v[14:15], s[16:17]
; CI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; CI-NEXT:    v_add_f64 v[12:13], s[16:17], -v[14:15]
; CI-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
; CI-NEXT:    v_add_f64 v[6:7], v[6:7], v[4:5]
; CI-NEXT:    v_mov_b32_e32 v5, s0
; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5
; CI-NEXT:    v_mov_b32_e32 v16, s19
; CI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; CI-NEXT:    v_bfi_b32 v5, s6, v5, v16
; CI-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
; CI-NEXT:    v_trunc_f64_e32 v[16:17], s[22:23]
; CI-NEXT:    v_add_f64 v[12:13], v[10:11], v[4:5]
; CI-NEXT:    v_mov_b32_e32 v5, s0
; CI-NEXT:    v_mov_b32_e32 v10, s17
; CI-NEXT:    v_bfi_b32 v5, s6, v5, v10
; CI-NEXT:    v_add_f64 v[18:19], s[22:23], -v[16:17]
; CI-NEXT:    v_add_f64 v[10:11], v[14:15], v[4:5]
; CI-NEXT:    v_trunc_f64_e32 v[14:15], s[20:21]
; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5
; CI-NEXT:    v_add_f64 v[18:19], s[20:21], -v[14:15]
; CI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5
; CI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
; CI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; CI-NEXT:    v_mov_b32_e32 v5, s2
; CI-NEXT:    v_mov_b32_e32 v18, s23
; CI-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
; CI-NEXT:    v_bfi_b32 v5, s6, v5, v18
; CI-NEXT:    v_mov_b32_e32 v18, s0
; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; CI-NEXT:    v_mov_b32_e32 v19, s21
; CI-NEXT:    v_add_f64 v[16:17], v[16:17], v[4:5]
; CI-NEXT:    v_bfi_b32 v5, s6, v18, v19
; CI-NEXT:    v_add_f64 v[14:15], v[14:15], v[4:5]
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48
; CI-NEXT:    buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32
; CI-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16
; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; CI-NEXT:    s_endpgm
  %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
  store <8 x double> %result, ptr addrspace(1) %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

declare double @llvm.round.f64(double) #1
declare <2 x double> @llvm.round.v2f64(<2 x double>) #1
declare <4 x double> @llvm.round.v4f64(<4 x double>) #1
declare <8 x double> @llvm.round.v8f64(<8 x double>) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }