; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_uint_to_fp_i64_to_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
; SI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
; SI-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_uint_to_fp_i64_to_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
  %val = load i64, ptr addrspace(1) %gep, align 8
  %result = uitofp i64 %val to double
  store double %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %in) {
; SI-LABEL: s_uint_to_fp_i64_to_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
; SI-NEXT:    v_mov_b32_e32 v4, s0
; SI-NEXT:    v_mov_b32_e32 v5, s1
; SI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 32
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
; SI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uint_to_fp_i64_to_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 32
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %cast = uitofp i64 %in to double
  store double %cast, ptr addrspace(1) %out, align 8
  ret void
}
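
; The two kernels above show the i64 -> f64 expansion: there is no 64-bit
; integer convert instruction, so the value is split into 32-bit halves,
; each half is converted exactly with v_cvt_f64_u32, the high half is
; scaled by 2^32 with v_ldexp_f64, and the halves are recombined with a
; single v_add_f64:
;   uitofp(x) = ldexp(uitofp(hi32(x)), 32) + uitofp(lo32(x))
; A rough IR equivalent of the expansion (illustrative sketch only, not
; part of this test; %x stands for the i64 input):
;   %hi     = lshr i64 %x, 32
;   %lo     = and i64 %x, 4294967295
;   %fhi    = uitofp i64 %hi to double
;   %flo    = uitofp i64 %lo to double
;   %scaled = fmul double %fhi, 0x41F0000000000000 ; * 2^32, i.e. ldexp by 32
;   %res    = fadd double %scaled, %flo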

define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 x i64> %in) {
; SI-LABEL: s_uint_to_fp_v2i64_to_v2f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
; SI-NEXT:    v_cvt_f64_u32_e32 v[4:5], s2
; SI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s0
; SI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 32
; SI-NEXT:    v_ldexp_f64 v[8:9], v[2:3], 32
; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[4:5]
; SI-NEXT:    v_add_f64 v[0:1], v[8:9], v[6:7]
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_mov_b32_e32 v5, s5
; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 32
; VI-NEXT:    v_ldexp_f64 v[4:5], v[2:3], 32
; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
; VI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; VI-NEXT:    v_add_f64 v[0:1], v[4:5], v[6:7]
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %cast = uitofp <2 x i64> %in to <2 x double>
  store <2 x double> %cast, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %in) {
; SI-LABEL: s_uint_to_fp_v4i64_to_v4f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x8
; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT:    v_cvt_f64_u32_e32 v[4:5], s1
; SI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
; SI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s0
; SI-NEXT:    v_cvt_f64_u32_e32 v[8:9], s7
; SI-NEXT:    v_cvt_f64_u32_e32 v[10:11], s5
; SI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 32
; SI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 32
; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; SI-NEXT:    v_add_f64 v[0:1], v[4:5], v[6:7]
; SI-NEXT:    v_cvt_f64_u32_e32 v[4:5], s6
; SI-NEXT:    v_cvt_f64_u32_e32 v[12:13], s4
; SI-NEXT:    v_ldexp_f64 v[6:7], v[8:9], 32
; SI-NEXT:    v_ldexp_f64 v[8:9], v[10:11], 32
; SI-NEXT:    s_add_u32 s0, s8, 16
; SI-NEXT:    s_addc_u32 s1, s9, 0
; SI-NEXT:    v_add_f64 v[6:7], v[6:7], v[4:5]
; SI-NEXT:    v_add_f64 v[4:5], v[8:9], v[12:13]
; SI-NEXT:    v_mov_b32_e32 v9, s1
; SI-NEXT:    v_mov_b32_e32 v8, s0
; SI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; SI-NEXT:    s_nop 0
; SI-NEXT:    v_mov_b32_e32 v4, s8
; SI-NEXT:    v_mov_b32_e32 v5, s9
; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uint_to_fp_v4i64_to_v4f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x20
; VI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s7
; VI-NEXT:    v_cvt_f64_u32_e32 v[4:5], s5
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s1
; VI-NEXT:    v_ldexp_f64 v[8:9], v[2:3], 32
; VI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 32
; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 32
; VI-NEXT:    v_ldexp_f64 v[10:11], v[6:7], 32
; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s6
; VI-NEXT:    v_cvt_f64_u32_e32 v[12:13], s4
; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
; VI-NEXT:    v_cvt_f64_u32_e32 v[14:15], s0
; VI-NEXT:    v_add_f64 v[6:7], v[8:9], v[6:7]
; VI-NEXT:    v_add_f64 v[4:5], v[4:5], v[12:13]
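
; The vector cases are the same per-element hi/lo expansion. The kernarg
; offsets differ between the targets only because SI (SMRD) encodes scalar
; load offsets in dwords (0x4, 0x8) while VI (SMEM) encodes them in bytes
; (0x10, 0x20); both address the same argument locations.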
; VI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; VI-NEXT:    v_add_f64 v[0:1], v[10:11], v[14:15]
; VI-NEXT:    s_add_u32 s0, s8, 16
; VI-NEXT:    s_addc_u32 s1, s9, 0
; VI-NEXT:    v_mov_b32_e32 v11, s1
; VI-NEXT:    v_mov_b32_e32 v8, s8
; VI-NEXT:    v_mov_b32_e32 v10, s0
; VI-NEXT:    v_mov_b32_e32 v9, s9
; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT:    s_endpgm
  %cast = uitofp <4 x i64> %in to <4 x double>
  store <4 x double> %cast, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_uint_to_fp_i32_to_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uint_to_fp_i32_to_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %cast = uitofp i32 %in to double
  store double %cast, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 x i32> %in) {
; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_cvt_f64_u32_e32 v[2:3], s3
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
; GCN-NEXT:    v_mov_b32_e32 v5, s1
; GCN-NEXT:    v_mov_b32_e32 v4, s0
; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT:    s_endpgm
  %cast = uitofp <2 x i32> %in to <2 x double>
  store <2 x double> %cast, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 x i32> %in) {
; SI-LABEL: s_uint_to_fp_v4i32_to_v4f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
; SI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s3
; SI-NEXT:    v_cvt_f64_u32_e32 v[4:5], s2
; SI-NEXT:    s_add_u32 s0, s4, 16
; SI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
; SI-NEXT:    s_addc_u32 s1, s5, 0
; SI-NEXT:    v_mov_b32_e32 v9, s1
; SI-NEXT:    v_mov_b32_e32 v8, s0
; SI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; SI-NEXT:    s_nop 0
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_mov_b32_e32 v5, s5
; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uint_to_fp_v4i32_to_v4f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s3
; VI-NEXT:    v_cvt_f64_u32_e32 v[4:5], s2
; VI-NEXT:    s_add_u32 s0, s4, 16
; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
; VI-NEXT:    s_addc_u32 s1, s5, 0
; VI-NEXT:    v_mov_b32_e32 v9, s1
; VI-NEXT:    v_mov_b32_e32 v8, s0
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %cast = uitofp <4 x i32> %in to <4 x double>
  store <4 x double> %cast, ptr addrspace(1) %out, align 16
  ret void
}
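
; Unlike i64, any u32 fits in the 53-bit significand of a double, so the
; i32 (and v2i32/v4i32) cases need only a single v_cvt_f64_u32 per
; element, with no ldexp/add fixup.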

; We can't fold the SGPRs into v_cndmask_b32_e32, because it already
; uses an SGPR (implicit vcc).
define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: uint_to_fp_i1_to_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, 0
; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: uint_to_fp_i1_to_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, 0
; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %in, 0
  %fp = uitofp i1 %cmp to double
  store double %fp, ptr addrspace(1) %out, align 4
  ret void
}
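
; uitofp from i1 is folded to a select between the two possible results.
; 1.0 has the IEEE-754 bit pattern 0x3ff00000_00000000 and the low dword
; is zero either way, so only the high dword needs the s_cselect between
; 0x3ff00000 and 0; the always-zero low half is stored from v0.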

define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %in) {
; SI-LABEL: uint_to_fp_i1_to_f64_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bitcmp1_b32 s2, 0
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: uint_to_fp_i1_to_f64_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bitcmp1_b32 s2, 0
; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %fp = uitofp i1 %in to double
  store double %fp, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) {
; SI-LABEL: s_uint_to_fp_i8_to_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s2, s2, 0xff
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uint_to_fp_i8_to_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s2, s2, 0xff
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %fp = uitofp i8 %in to double
  store double %fp, ptr addrspace(1) %out
  ret void
}

; FIXME: Worse on VI
define double @v_uint_to_fp_i8_to_f64(i8 %in) {
; SI-LABEL: v_uint_to_fp_i8_to_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uint_to_fp_i8_to_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
; VI-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %fp = uitofp i8 %in to double
  ret double %fp
}
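
; On the FIXME above: SI zero-extends the byte with a single v_and_b32_e32
; against the literal 0xff, while VI selects an SDWA and (src1_sel:BYTE_0).
; SDWA encodings cannot take a literal operand, so the mask has to be
; materialized in a VGPR first, costing an extra v_mov.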

define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_select_uint_to_fp_i1_vals_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, 0
; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_select_uint_to_fp_i1_vals_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, 0
; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %in, 0
  %select = select i1 %cmp, double 1.0, double 0.0
  store double %select, ptr addrspace(1) %out, align 8
  ret void
}

define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: v_select_uint_to_fp_i1_vals_f64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT:    v_mov_b32_e32 v3, 0
; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %cmp = icmp eq i32 %in, 0
  %select = select i1 %cmp, double 1.0, double 0.0
  store double %select, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_select_uint_to_fp_i1_vals_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, 0
; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_select_uint_to_fp_i1_vals_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, 0
; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %in, 0
  %select = select i1 %cmp, i64 u0x3ff0000000000000, i64 0
  store i64 %select, ptr addrspace(1) %out, align 8
  ret void
}

define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: v_select_uint_to_fp_i1_vals_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT:    v_mov_b32_e32 v3, 0
; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %cmp = icmp eq i32 %in, 0
  %select = select i1 %cmp, i64 u0x3ff0000000000000, i64 0
  store i64 %select, ptr addrspace(1) %out, align 8
  ret void
}

; TODO: This should swap the select operands / invert the compare and do the fold.
define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_swap_select_uint_to_fp_i1_vals_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, 0
; SI-NEXT:    s_cselect_b32 s2, 0, 0x3ff00000
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_swap_select_uint_to_fp_i1_vals_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, 0
; VI-NEXT:    s_cselect_b32 s2, 0, 0x3ff00000
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %in, 0
  %select = select i1 %cmp, double 0.0, double 1.0
  store double %select, ptr addrspace(1) %out, align 8
  ret void
}

define void @v_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: v_swap_select_uint_to_fp_i1_vals_f64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT:    v_mov_b32_e32 v3, 0
; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %cmp = icmp eq i32 %in, 0
  %select = select i1 %cmp, double 0.0, double 1.0
  store double %select, ptr addrspace(1) %out, align 8
  ret void
}
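
; Note: u0x3ff0000000000000 in the *_i1_vals_i64 tests above is the bit
; pattern of double 1.0, so the i64 selects lower to the same cselect of
; the 0x3ff00000 high dword as the f64 versions.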