; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone

define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_i32_to_f32_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i32_to_f32_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_sitofp_i32_to_f32_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_sitofp_i32_to_f32_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = sitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr7_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_bfe_u32 v0, v0, 7, 8
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.7 = lshr i32 %arg0, 7
  %masked = and i32 %lshr.7, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_lshr8_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_bfe_u32 v0, v0, 8, 8
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_lshr8_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT:    flat_store_dword v[0:1], v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  store i32 %lshr.8, ptr addrspace(1) undef
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f32_lshr16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_bfe_u32 v0, v0, 16, 8
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f32_lshr16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to float
  ret float %cvt
}

define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
; SI-LABEL: v_uitofp_i8_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to float
  ret float %cvt
}

define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
; SI-LABEL: v_uitofp_v2i8_to_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v1
; SI-NEXT:    v_bfe_u32 v0, v0, 8, 8
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
; SI-NEXT:    v_mov_b32_e32 v0, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_v2i8_to_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT:    v_mov_b32_e32 v0, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast i16 %arg0 to <2 x i8>
  %cvt = uitofp <2 x i8> %val to <2 x float>
  ret <2 x float> %cvt
}

define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_v3i8_to_v3f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
; SI-NEXT:    v_bfe_u32 v1, v0, 8, 8
; SI-NEXT:    v_bfe_u32 v0, v0, 16, 8
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
; SI-NEXT:    v_mov_b32_e32 v0, v3
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_v3i8_to_v3f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; VI-NEXT:    v_mov_b32_e32 v0, v3
; VI-NEXT:    s_setpc_b64 s[30:31]
  %trunc = trunc i32 %arg0 to i24
  %val = bitcast i24 %trunc to <3 x i8>
  %cvt = uitofp <3 x i8> %val to <3 x float>
  ret <3 x float> %cvt
}

define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_v4i8_to_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v1
; SI-NEXT:    v_bfe_u32 v1, v0, 8, 8
; SI-NEXT:    v_bfe_u32 v2, v0, 16, 8
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_v4i8_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; VI-NEXT:    v_mov_b32_e32 v0, v4
; VI-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast i32 %arg0 to <4 x i8>
  %cvt = uitofp <4 x i8> %val to <4 x float>
  ret <4 x float> %cvt
}

define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_unpack_i32_to_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v1
; SI-NEXT:    v_bfe_u32 v1, v0, 8, 8
; SI-NEXT:    v_bfe_u32 v2, v0, 16, 8
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_mov_b32_e32 v0, v4
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_unpack_i32_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; VI-NEXT:    v_mov_b32_e32 v0, v4
; VI-NEXT:    s_setpc_b64 s[30:31]
  %mask.arg0 = and i32 %arg0, 255
  %cvt0 = uitofp i32 %mask.arg0 to float

  %lshr.8 = lshr i32 %arg0, 8
  %mask.lshr.8 = and i32 %lshr.8, 255
  %cvt1 = uitofp i32 %mask.lshr.8 to float

  %lshr.16 = lshr i32 %arg0, 16
  %mask.lshr.16 = and i32 %lshr.16, 255
  %cvt2 = uitofp i32 %mask.lshr.16 to float

  %lshr.24 = lshr i32 %arg0, 24
  %mask.lshr.24 = and i32 %lshr.24, 255
  %cvt3 = uitofp i32 %mask.lshr.24 to float

  %ins.0 = insertelement <4 x float> undef, float %cvt0, i32 0
  %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1
  %ins.2 = insertelement <4 x float> %ins.1, float %cvt2, i32 2
  %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3
  ret <4 x float> %ins.3
}

define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_i32_to_f16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i32_to_f16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_sitofp_i32_to_f16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_sitofp_i32_to_f16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = sitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_bfe_u32 v0, v0, 8, 8
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_bfe_u32 v0, v0, 16, 8
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f16_lshr24_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to half
  ret half %cvt
}

define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
; SI-LABEL: v_uitofp_i8_to_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to half
  ret half %cvt
}

define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i32 %arg0, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_bfe_u32 v0, v0, 8, 8
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.8 = lshr i32 %arg0, 8
  %masked = and i32 %lshr.8, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr16_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 8
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 16
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %lshr.16 = lshr i32 %arg0, 24
  %masked = and i32 %lshr.16, 255
  %cvt = uitofp i32 %masked to double
  ret double %cvt
}

define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
; GCN-LABEL: v_uitofp_i8_to_f64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %cvt = uitofp i8 %arg0 to double
  ret double %cvt
}

define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: load_i8_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_i8_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid
  %load = load i8, ptr addrspace(1) %gep, align 1
  %cvt = uitofp i8 %load to float
  store float %cvt, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: load_v2i8_to_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
; SI-NEXT:    v_bfe_u32 v2, v0, 8, 8
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v2i8_to_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ushort v1, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i8>, ptr addrspace(1) %in, i32 %tid
  %load = load <2 x i8>, ptr addrspace(1) %gep, align 2
  %cvt = uitofp <2 x i8> %load to <2 x float>
  store <2 x float> %cvt, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: load_v3i8_to_v3f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
; SI-NEXT:    v_bfe_u32 v2, v0, 8, 8
; SI-NEXT:    v_bfe_u32 v3, v0, 16, 8
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v3
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:8
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v3i8_to_v3f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v2, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v4, s1
; VI-NEXT:    v_mov_b32_e32 v3, s0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid
  %load = load <3 x i8>, ptr addrspace(1) %gep, align 4
  %cvt = uitofp <3 x i8> %load to <3 x float>
  store <3 x float> %cvt, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
; SI-NEXT:    v_bfe_u32 v2, v0, 8, 8
; SI-NEXT:    v_bfe_u32 v4, v0, 16, 8
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v3
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
  %load = load <4 x i8>, ptr addrspace(1) %gep, align 4
  %cvt = uitofp <4 x i8> %load to <4 x float>
  store <4 x float> %cvt, ptr addrspace(1) %out, align 16
  ret void
}

; This should not be adding instructions to shift into the correct
; position in the word for the component.

; FIXME: Packing bytes
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_unaligned:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_or_b32_e32 v1, v2, v3
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
; SI-NEXT:    v_bfe_u32 v2, v0, 8, 8
; SI-NEXT:    v_bfe_u32 v4, v0, 16, 8
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_unaligned:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v6, vcc, 3, v0
; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v2, v[2:3]
; VI-NEXT:    flat_load_ubyte v3, v[6:7]
; VI-NEXT:    flat_load_ubyte v4, v[4:5]
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_or_b32_e32 v1, v2, v3
; VI-NEXT:    v_or_b32_e32 v3, v1, v0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v3
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
  %load = load <4 x i8>, ptr addrspace(1) %gep, align 1
  %cvt = uitofp <4 x i8> %load to <4 x float>
  store <4 x float> %cvt, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_2_uses:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[0:1], s[4:5]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT:    v_and_b32_e32 v5, 0xff, v0
; SI-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_add_i32_e32 v6, vcc, 9, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
; SI-NEXT:    v_and_b32_e32 v5, 0xff, v1
; SI-NEXT:    v_and_b32_e32 v7, 0xff, v2
; SI-NEXT:    v_add_i32_e32 v8, vcc, 9, v1
; SI-NEXT:    v_add_i32_e32 v9, vcc, 9, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v7
; SI-NEXT:    v_and_b32_e32 v5, 0xff, v8
; SI-NEXT:    v_add_i32_e32 v4, vcc, 9, v4
; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
; SI-NEXT:    v_and_b32_e32 v7, 0xff, v9
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v7
; SI-NEXT:    v_or_b32_e32 v0, v6, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_2_uses:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    v_mov_b32_e32 v6, 9
; VI-NEXT:    v_mov_b32_e32 v7, 8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v1, v[0:1]
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0xff
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
; VI-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v1
; VI-NEXT:    v_add_u16_e32 v9, 9, v1
; VI-NEXT:    v_add_u16_sdwa v10, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_add_u16_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT:    v_add_u16_e32 v8, 9, v8
; VI-NEXT:    v_and_b32_e32 v10, 0xff, v10
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    v_and_b32_e32 v6, 0xff, v6
; VI-NEXT:    v_lshlrev_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v10
; VI-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:
v_lshlrev_b32_e32 v2, 24, v6 805; VI-NEXT: v_or_b32_e32 v0, v0, v1 806; VI-NEXT: v_or_b32_e32 v2, v0, v2 807; VI-NEXT: v_mov_b32_e32 v0, s2 808; VI-NEXT: v_mov_b32_e32 v1, s3 809; VI-NEXT: flat_store_dword v[0:1], v2 810; VI-NEXT: s_endpgm 811 %tid.x = call i32 @llvm.amdgcn.workitem.id.x() 812 %in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x 813 %load = load <4 x i8>, ptr addrspace(1) %in.ptr, align 4 814 %cvt = uitofp <4 x i8> %load to <4 x float> 815 store <4 x float> %cvt, ptr addrspace(1) %out, align 16 816 %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load 817 store <4 x i8> %add, ptr addrspace(1) %out2, align 4 818 ret void 819} 820 821define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 822; SI-LABEL: load_v7i8_to_v7f32: 823; SI: ; %bb.0: 824; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 825; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 826; SI-NEXT: v_mov_b32_e32 v1, 0 827; SI-NEXT: s_mov_b32 s6, 0 828; SI-NEXT: s_mov_b32 s7, 0xf000 829; SI-NEXT: s_waitcnt lgkmcnt(0) 830; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 831; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 832; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:1 833; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 834; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:3 835; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:4 836; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:5 837; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:6 838; SI-NEXT: s_mov_b32 s6, -1 839; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 840; SI-NEXT: s_waitcnt vmcnt(6) 841; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 842; SI-NEXT: s_waitcnt vmcnt(5) 843; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3 844; SI-NEXT: s_waitcnt vmcnt(4) 845; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 846; SI-NEXT: s_waitcnt vmcnt(3) 847; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5 848; SI-NEXT: 
s_waitcnt vmcnt(2) 849; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6 850; SI-NEXT: s_waitcnt vmcnt(1) 851; SI-NEXT: v_cvt_f32_ubyte0_e32 v5, v7 852; SI-NEXT: s_waitcnt vmcnt(0) 853; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8 854; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 855; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 856; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:24 857; SI-NEXT: s_endpgm 858; 859; VI-LABEL: load_v7i8_to_v7f32: 860; VI: ; %bb.0: 861; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 862; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 863; VI-NEXT: s_waitcnt lgkmcnt(0) 864; VI-NEXT: v_mov_b32_e32 v0, s2 865; VI-NEXT: v_mov_b32_e32 v1, s3 866; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 867; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 868; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 869; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 870; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 871; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 872; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0 873; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 874; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0 875; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc 876; VI-NEXT: v_add_u32_e32 v10, vcc, 5, v0 877; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc 878; VI-NEXT: v_add_u32_e32 v12, vcc, 6, v0 879; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc 880; VI-NEXT: flat_load_ubyte v0, v[0:1] 881; VI-NEXT: flat_load_ubyte v1, v[2:3] 882; VI-NEXT: flat_load_ubyte v2, v[4:5] 883; VI-NEXT: flat_load_ubyte v3, v[6:7] 884; VI-NEXT: flat_load_ubyte v4, v[8:9] 885; VI-NEXT: flat_load_ubyte v5, v[10:11] 886; VI-NEXT: flat_load_ubyte v6, v[12:13] 887; VI-NEXT: v_mov_b32_e32 v8, s1 888; VI-NEXT: v_mov_b32_e32 v7, s0 889; VI-NEXT: s_add_u32 s0, s0, 16 890; VI-NEXT: s_addc_u32 s1, s1, 0 891; VI-NEXT: v_mov_b32_e32 v10, s1 892; VI-NEXT: v_mov_b32_e32 v9, s0 893; VI-NEXT: s_waitcnt vmcnt(6) 894; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 895; VI-NEXT: s_waitcnt vmcnt(5) 896; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 897; VI-NEXT: 
s_waitcnt vmcnt(4) 898; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 899; VI-NEXT: s_waitcnt vmcnt(3) 900; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 901; VI-NEXT: s_waitcnt vmcnt(2) 902; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 903; VI-NEXT: s_waitcnt vmcnt(1) 904; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v5 905; VI-NEXT: s_waitcnt vmcnt(0) 906; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v6 907; VI-NEXT: flat_store_dwordx4 v[7:8], v[0:3] 908; VI-NEXT: flat_store_dwordx3 v[9:10], v[4:6] 909; VI-NEXT: s_endpgm 910 %tid = call i32 @llvm.amdgcn.workitem.id.x() 911 %gep = getelementptr <7 x i8>, ptr addrspace(1) %in, i32 %tid 912 %load = load <7 x i8>, ptr addrspace(1) %gep, align 1 913 %cvt = uitofp <7 x i8> %load to <7 x float> 914 store <7 x float> %cvt, ptr addrspace(1) %out, align 16 915 ret void 916} 917 918define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 919; SI-LABEL: load_v8i8_to_v8f32: 920; SI: ; %bb.0: 921; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 922; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 923; SI-NEXT: v_mov_b32_e32 v1, 0 924; SI-NEXT: s_mov_b32 s6, 0 925; SI-NEXT: s_mov_b32 s7, 0xf000 926; SI-NEXT: s_waitcnt lgkmcnt(0) 927; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 928; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 929; SI-NEXT: s_mov_b32 s6, -1 930; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 931; SI-NEXT: s_waitcnt vmcnt(0) 932; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 933; SI-NEXT: v_bfe_u32 v4, v0, 8, 8 934; SI-NEXT: v_bfe_u32 v5, v0, 16, 8 935; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 936; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 937; SI-NEXT: v_bfe_u32 v8, v1, 8, 8 938; SI-NEXT: v_bfe_u32 v9, v1, 16, 8 939; SI-NEXT: v_cvt_f32_ubyte3_e32 v7, v1 940; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 941; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 942; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v5 943; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6 944; SI-NEXT: v_cvt_f32_ubyte0_e32 v5, v8 945; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v9 946; SI-NEXT: buffer_store_dwordx4 
v[0:3], off, s[0:3], 0 947; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 948; SI-NEXT: s_endpgm 949; 950; VI-LABEL: load_v8i8_to_v8f32: 951; VI: ; %bb.0: 952; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 953; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 954; VI-NEXT: s_waitcnt lgkmcnt(0) 955; VI-NEXT: v_mov_b32_e32 v0, s2 956; VI-NEXT: v_mov_b32_e32 v1, s3 957; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 958; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 959; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] 960; VI-NEXT: v_mov_b32_e32 v9, s1 961; VI-NEXT: v_mov_b32_e32 v8, s0 962; VI-NEXT: s_add_u32 s0, s0, 16 963; VI-NEXT: s_addc_u32 s1, s1, 0 964; VI-NEXT: v_mov_b32_e32 v11, s1 965; VI-NEXT: v_mov_b32_e32 v10, s0 966; VI-NEXT: s_waitcnt vmcnt(0) 967; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 968; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 969; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 970; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v6 971; VI-NEXT: v_cvt_f32_ubyte0_sdwa v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 972; VI-NEXT: v_cvt_f32_ubyte0_sdwa v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 973; VI-NEXT: v_cvt_f32_ubyte0_sdwa v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 974; VI-NEXT: v_cvt_f32_ubyte3_e32 v7, v7 975; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 976; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 977; VI-NEXT: s_endpgm 978 %tid = call i32 @llvm.amdgcn.workitem.id.x() 979 %gep = getelementptr <8 x i8>, ptr addrspace(1) %in, i32 %tid 980 %load = load <8 x i8>, ptr addrspace(1) %gep, align 8 981 %cvt = uitofp <8 x i8> %load to <8 x float> 982 store <8 x float> %cvt, ptr addrspace(1) %out, align 16 983 ret void 984} 985 986define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 987; SI-LABEL: i8_zext_inreg_i32_to_f32: 
988; SI: ; %bb.0: 989; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 990; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 991; SI-NEXT: v_mov_b32_e32 v1, 0 992; SI-NEXT: s_mov_b32 s6, 0 993; SI-NEXT: s_mov_b32 s7, 0xf000 994; SI-NEXT: s_waitcnt lgkmcnt(0) 995; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 996; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 997; SI-NEXT: s_mov_b32 s6, -1 998; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 999; SI-NEXT: s_waitcnt vmcnt(0) 1000; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 1001; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 1002; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1003; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1004; SI-NEXT: s_endpgm 1005; 1006; VI-LABEL: i8_zext_inreg_i32_to_f32: 1007; VI: ; %bb.0: 1008; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1009; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1010; VI-NEXT: s_waitcnt lgkmcnt(0) 1011; VI-NEXT: v_mov_b32_e32 v0, s2 1012; VI-NEXT: v_mov_b32_e32 v1, s3 1013; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1014; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1015; VI-NEXT: flat_load_dword v0, v[0:1] 1016; VI-NEXT: s_waitcnt vmcnt(0) 1017; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 1018; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 1019; VI-NEXT: v_mov_b32_e32 v0, s0 1020; VI-NEXT: v_mov_b32_e32 v1, s1 1021; VI-NEXT: flat_store_dword v[0:1], v2 1022; VI-NEXT: s_endpgm 1023 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1024 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid 1025 %load = load i32, ptr addrspace(1) %gep, align 4 1026 %add = add i32 %load, 2 1027 %inreg = and i32 %add, 255 1028 %cvt = uitofp i32 %inreg to float 1029 store float %cvt, ptr addrspace(1) %out, align 4 1030 ret void 1031} 1032 1033define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 1034; SI-LABEL: i8_zext_inreg_hi1_to_f32: 1035; SI: ; %bb.0: 1036; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1037; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 
1038; SI-NEXT: v_mov_b32_e32 v1, 0 1039; SI-NEXT: s_mov_b32 s6, 0 1040; SI-NEXT: s_mov_b32 s7, 0xf000 1041; SI-NEXT: s_waitcnt lgkmcnt(0) 1042; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1043; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1044; SI-NEXT: s_mov_b32 s6, -1 1045; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1046; SI-NEXT: s_waitcnt vmcnt(0) 1047; SI-NEXT: v_bfe_u32 v0, v0, 8, 8 1048; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1049; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1050; SI-NEXT: s_endpgm 1051; 1052; VI-LABEL: i8_zext_inreg_hi1_to_f32: 1053; VI: ; %bb.0: 1054; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1055; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1056; VI-NEXT: s_waitcnt lgkmcnt(0) 1057; VI-NEXT: v_mov_b32_e32 v0, s2 1058; VI-NEXT: v_mov_b32_e32 v1, s3 1059; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1060; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1061; VI-NEXT: flat_load_dword v0, v[0:1] 1062; VI-NEXT: s_waitcnt vmcnt(0) 1063; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 1064; VI-NEXT: v_mov_b32_e32 v0, s0 1065; VI-NEXT: v_mov_b32_e32 v1, s1 1066; VI-NEXT: flat_store_dword v[0:1], v2 1067; VI-NEXT: s_endpgm 1068 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1069 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid 1070 %load = load i32, ptr addrspace(1) %gep, align 4 1071 %inreg = and i32 %load, 65280 1072 %shr = lshr i32 %inreg, 8 1073 %cvt = uitofp i32 %shr to float 1074 store float %cvt, ptr addrspace(1) %out, align 4 1075 ret void 1076} 1077 1078; We don't get these ones because of the zext, but instcombine removes 1079; them so it shouldn't really matter. 
1080define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 1081; SI-LABEL: i8_zext_i32_to_f32: 1082; SI: ; %bb.0: 1083; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1084; SI-NEXT: s_mov_b32 s6, 0 1085; SI-NEXT: s_mov_b32 s7, 0xf000 1086; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1087; SI-NEXT: s_waitcnt lgkmcnt(0) 1088; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1089; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 1090; SI-NEXT: s_mov_b32 s6, -1 1091; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1092; SI-NEXT: s_waitcnt vmcnt(0) 1093; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1094; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1095; SI-NEXT: s_endpgm 1096; 1097; VI-LABEL: i8_zext_i32_to_f32: 1098; VI: ; %bb.0: 1099; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1100; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 1101; VI-NEXT: s_waitcnt lgkmcnt(0) 1102; VI-NEXT: v_mov_b32_e32 v1, s2 1103; VI-NEXT: v_mov_b32_e32 v2, s3 1104; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 1105; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc 1106; VI-NEXT: flat_load_ubyte v0, v[0:1] 1107; VI-NEXT: s_waitcnt vmcnt(0) 1108; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 1109; VI-NEXT: v_mov_b32_e32 v0, s0 1110; VI-NEXT: v_mov_b32_e32 v1, s1 1111; VI-NEXT: flat_store_dword v[0:1], v2 1112; VI-NEXT: s_endpgm 1113 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1114 %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid 1115 %load = load i8, ptr addrspace(1) %gep, align 1 1116 %ext = zext i8 %load to i32 1117 %cvt = uitofp i32 %ext to float 1118 store float %cvt, ptr addrspace(1) %out, align 4 1119 ret void 1120} 1121 1122define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 1123; SI-LABEL: v4i8_zext_v4i32_to_v4f32: 1124; SI: ; %bb.0: 1125; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1126; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1127; SI-NEXT: v_mov_b32_e32 v1, 0 1128; SI-NEXT: s_mov_b32 s6, 0 1129; SI-NEXT: 
s_mov_b32 s7, 0xf000 1130; SI-NEXT: s_waitcnt lgkmcnt(0) 1131; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1132; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1 1133; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3 1134; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 1135; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 1136; SI-NEXT: s_mov_b32 s6, -1 1137; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1138; SI-NEXT: s_waitcnt vmcnt(3) 1139; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 1140; SI-NEXT: s_waitcnt vmcnt(2) 1141; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3 1142; SI-NEXT: s_waitcnt vmcnt(1) 1143; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 1144; SI-NEXT: s_waitcnt vmcnt(0) 1145; SI-NEXT: v_or_b32_e32 v0, v1, v0 1146; SI-NEXT: v_or_b32_e32 v1, v2, v3 1147; SI-NEXT: v_or_b32_e32 v0, v1, v0 1148; SI-NEXT: v_and_b32_e32 v1, 0xff, v0 1149; SI-NEXT: v_bfe_u32 v2, v0, 8, 8 1150; SI-NEXT: v_bfe_u32 v4, v0, 16, 8 1151; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 1152; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1 1153; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 1154; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 1155; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1156; SI-NEXT: s_endpgm 1157; 1158; VI-LABEL: v4i8_zext_v4i32_to_v4f32: 1159; VI: ; %bb.0: 1160; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1161; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1162; VI-NEXT: s_waitcnt lgkmcnt(0) 1163; VI-NEXT: v_mov_b32_e32 v0, s2 1164; VI-NEXT: v_mov_b32_e32 v1, s3 1165; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1166; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1167; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 1168; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 1169; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 1170; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 1171; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v0 1172; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 1173; VI-NEXT: flat_load_ubyte v2, v[2:3] 1174; VI-NEXT: flat_load_ubyte v3, v[6:7] 1175; VI-NEXT: flat_load_ubyte v4, v[4:5] 1176; VI-NEXT: 
flat_load_ubyte v0, v[0:1] 1177; VI-NEXT: s_waitcnt vmcnt(3) 1178; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 1179; VI-NEXT: s_waitcnt vmcnt(2) 1180; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v3 1181; VI-NEXT: s_waitcnt vmcnt(1) 1182; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 1183; VI-NEXT: s_waitcnt vmcnt(0) 1184; VI-NEXT: v_or_b32_e32 v0, v1, v0 1185; VI-NEXT: v_or_b32_e32 v1, v2, v3 1186; VI-NEXT: v_or_b32_e32 v3, v1, v0 1187; VI-NEXT: v_mov_b32_e32 v5, s1 1188; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 1189; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 1190; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 1191; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3 1192; VI-NEXT: v_mov_b32_e32 v4, s0 1193; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1194; VI-NEXT: s_endpgm 1195 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1196 %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid 1197 %load = load <4 x i8>, ptr addrspace(1) %gep, align 1 1198 %ext = zext <4 x i8> %load to <4 x i32> 1199 %cvt = uitofp <4 x i32> %ext to <4 x float> 1200 store <4 x float> %cvt, ptr addrspace(1) %out, align 16 1201 ret void 1202} 1203 1204define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 1205; SI-LABEL: extract_byte0_to_f32: 1206; SI: ; %bb.0: 1207; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1208; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1209; SI-NEXT: v_mov_b32_e32 v1, 0 1210; SI-NEXT: s_mov_b32 s6, 0 1211; SI-NEXT: s_mov_b32 s7, 0xf000 1212; SI-NEXT: s_waitcnt lgkmcnt(0) 1213; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1214; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 1215; SI-NEXT: s_mov_b32 s6, -1 1216; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1217; SI-NEXT: s_waitcnt vmcnt(0) 1218; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1219; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1220; SI-NEXT: s_endpgm 1221; 1222; 
VI-LABEL: extract_byte0_to_f32: 1223; VI: ; %bb.0: 1224; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1225; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1226; VI-NEXT: s_waitcnt lgkmcnt(0) 1227; VI-NEXT: v_mov_b32_e32 v0, s2 1228; VI-NEXT: v_mov_b32_e32 v1, s3 1229; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1230; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1231; VI-NEXT: flat_load_ubyte v0, v[0:1] 1232; VI-NEXT: s_waitcnt vmcnt(0) 1233; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 1234; VI-NEXT: v_mov_b32_e32 v0, s0 1235; VI-NEXT: v_mov_b32_e32 v1, s1 1236; VI-NEXT: flat_store_dword v[0:1], v2 1237; VI-NEXT: s_endpgm 1238 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1239 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid 1240 %val = load i32, ptr addrspace(1) %gep 1241 %and = and i32 %val, 255 1242 %cvt = uitofp i32 %and to float 1243 store float %cvt, ptr addrspace(1) %out 1244 ret void 1245} 1246 1247define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 1248; SI-LABEL: extract_byte1_to_f32: 1249; SI: ; %bb.0: 1250; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1251; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1252; SI-NEXT: v_mov_b32_e32 v1, 0 1253; SI-NEXT: s_mov_b32 s6, 0 1254; SI-NEXT: s_mov_b32 s7, 0xf000 1255; SI-NEXT: s_waitcnt lgkmcnt(0) 1256; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1257; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1258; SI-NEXT: s_mov_b32 s6, -1 1259; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1260; SI-NEXT: s_waitcnt vmcnt(0) 1261; SI-NEXT: v_bfe_u32 v0, v0, 8, 8 1262; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1263; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1264; SI-NEXT: s_endpgm 1265; 1266; VI-LABEL: extract_byte1_to_f32: 1267; VI: ; %bb.0: 1268; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1269; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1270; VI-NEXT: s_waitcnt lgkmcnt(0) 1271; VI-NEXT: v_mov_b32_e32 v0, s2 1272; VI-NEXT: v_mov_b32_e32 v1, s3 1273; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1274; 
VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1275; VI-NEXT: flat_load_dword v0, v[0:1] 1276; VI-NEXT: s_waitcnt vmcnt(0) 1277; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 1278; VI-NEXT: v_mov_b32_e32 v0, s0 1279; VI-NEXT: v_mov_b32_e32 v1, s1 1280; VI-NEXT: flat_store_dword v[0:1], v2 1281; VI-NEXT: s_endpgm 1282 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1283 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid 1284 %val = load i32, ptr addrspace(1) %gep 1285 %srl = lshr i32 %val, 8 1286 %and = and i32 %srl, 255 1287 %cvt = uitofp i32 %and to float 1288 store float %cvt, ptr addrspace(1) %out 1289 ret void 1290} 1291 1292define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 1293; SI-LABEL: extract_byte2_to_f32: 1294; SI: ; %bb.0: 1295; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1296; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1297; SI-NEXT: v_mov_b32_e32 v1, 0 1298; SI-NEXT: s_mov_b32 s6, 0 1299; SI-NEXT: s_mov_b32 s7, 0xf000 1300; SI-NEXT: s_waitcnt lgkmcnt(0) 1301; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1302; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1303; SI-NEXT: s_mov_b32 s6, -1 1304; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1305; SI-NEXT: s_waitcnt vmcnt(0) 1306; SI-NEXT: v_bfe_u32 v0, v0, 16, 8 1307; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1308; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1309; SI-NEXT: s_endpgm 1310; 1311; VI-LABEL: extract_byte2_to_f32: 1312; VI: ; %bb.0: 1313; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1314; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1315; VI-NEXT: s_waitcnt lgkmcnt(0) 1316; VI-NEXT: v_mov_b32_e32 v0, s2 1317; VI-NEXT: v_mov_b32_e32 v1, s3 1318; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1319; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1320; VI-NEXT: flat_load_dword v0, v[0:1] 1321; VI-NEXT: s_waitcnt vmcnt(0) 1322; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 1323; 
VI-NEXT: v_mov_b32_e32 v0, s0 1324; VI-NEXT: v_mov_b32_e32 v1, s1 1325; VI-NEXT: flat_store_dword v[0:1], v2 1326; VI-NEXT: s_endpgm 1327 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1328 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid 1329 %val = load i32, ptr addrspace(1) %gep 1330 %srl = lshr i32 %val, 16 1331 %and = and i32 %srl, 255 1332 %cvt = uitofp i32 %and to float 1333 store float %cvt, ptr addrspace(1) %out 1334 ret void 1335} 1336 1337define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 1338; SI-LABEL: extract_byte3_to_f32: 1339; SI: ; %bb.0: 1340; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1341; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1342; SI-NEXT: v_mov_b32_e32 v1, 0 1343; SI-NEXT: s_mov_b32 s6, 0 1344; SI-NEXT: s_mov_b32 s7, 0xf000 1345; SI-NEXT: s_waitcnt lgkmcnt(0) 1346; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1347; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1348; SI-NEXT: s_mov_b32 s6, -1 1349; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1350; SI-NEXT: s_waitcnt vmcnt(0) 1351; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 1352; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1353; SI-NEXT: s_endpgm 1354; 1355; VI-LABEL: extract_byte3_to_f32: 1356; VI: ; %bb.0: 1357; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1358; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1359; VI-NEXT: s_waitcnt lgkmcnt(0) 1360; VI-NEXT: v_mov_b32_e32 v0, s2 1361; VI-NEXT: v_mov_b32_e32 v1, s3 1362; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1363; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1364; VI-NEXT: flat_load_dword v0, v[0:1] 1365; VI-NEXT: s_waitcnt vmcnt(0) 1366; VI-NEXT: v_cvt_f32_ubyte3_e32 v2, v0 1367; VI-NEXT: v_mov_b32_e32 v0, s0 1368; VI-NEXT: v_mov_b32_e32 v1, s1 1369; VI-NEXT: flat_store_dword v[0:1], v2 1370; VI-NEXT: s_endpgm 1371 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1372 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid 1373 %val = load i32, ptr addrspace(1) %gep 1374 %srl = lshr i32 %val, 24 
1375 %and = and i32 %srl, 255 1376 %cvt = uitofp i32 %and to float 1377 store float %cvt, ptr addrspace(1) %out 1378 ret void 1379} 1380 1381define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) { 1382; SI-LABEL: cvt_ubyte0_or_multiuse: 1383; SI: ; %bb.0: ; %bb 1384; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1385; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1386; SI-NEXT: v_mov_b32_e32 v1, 0 1387; SI-NEXT: s_mov_b32 s6, 0 1388; SI-NEXT: s_mov_b32 s7, 0xf000 1389; SI-NEXT: s_waitcnt lgkmcnt(0) 1390; SI-NEXT: s_mov_b64 s[4:5], s[0:1] 1391; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1392; SI-NEXT: s_mov_b32 s6, -1 1393; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1394; SI-NEXT: s_waitcnt vmcnt(0) 1395; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 1396; SI-NEXT: v_and_b32_e32 v1, 0xff, v0 1397; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 1398; SI-NEXT: v_add_f32_e32 v0, v0, v1 1399; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1400; SI-NEXT: s_endpgm 1401; 1402; VI-LABEL: cvt_ubyte0_or_multiuse: 1403; VI: ; %bb.0: ; %bb 1404; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1405; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1406; VI-NEXT: s_waitcnt lgkmcnt(0) 1407; VI-NEXT: v_mov_b32_e32 v0, s0 1408; VI-NEXT: v_mov_b32_e32 v1, s1 1409; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1410; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1411; VI-NEXT: flat_load_dword v0, v[0:1] 1412; VI-NEXT: s_waitcnt vmcnt(0) 1413; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 1414; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 1415; VI-NEXT: v_add_f32_e32 v2, v0, v1 1416; VI-NEXT: v_mov_b32_e32 v0, s2 1417; VI-NEXT: v_mov_b32_e32 v1, s3 1418; VI-NEXT: flat_store_dword v[0:1], v2 1419; VI-NEXT: s_endpgm 1420bb: 1421 %lid = tail call i32 @llvm.amdgcn.workitem.id.x() 1422 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %lid 1423 %load = load i32, ptr addrspace(1) %gep 1424 %or = or i32 %load, -2147483647 1425 %and = and i32 %or, 
255 1426 %uitofp = uitofp i32 %and to float 1427 %cast = bitcast i32 %or to float 1428 %add = fadd float %cast, %uitofp 1429 store float %add, ptr addrspace(1) %out 1430 ret void 1431} 1432 1433define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) { 1434; SI-LABEL: v_test_sitofp_i64_byte_to_f32: 1435; SI: ; %bb.0: 1436; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1437; SI-NEXT: v_ffbh_i32_e32 v2, 0 1438; SI-NEXT: v_add_i32_e32 v2, vcc, -1, v2 1439; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 1440; SI-NEXT: v_mov_b32_e32 v1, 0 1441; SI-NEXT: v_min_u32_e32 v2, 32, v2 1442; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2 1443; SI-NEXT: v_min_u32_e32 v0, 1, v0 1444; SI-NEXT: v_or_b32_e32 v0, v1, v0 1445; SI-NEXT: v_cvt_f32_i32_e32 v0, v0 1446; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v2 1447; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1 1448; SI-NEXT: s_setpc_b64 s[30:31] 1449; 1450; VI-LABEL: v_test_sitofp_i64_byte_to_f32: 1451; VI: ; %bb.0: 1452; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1453; VI-NEXT: v_ffbh_i32_e32 v2, 0 1454; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v2 1455; VI-NEXT: v_and_b32_e32 v0, 0xff, v0 1456; VI-NEXT: v_mov_b32_e32 v1, 0 1457; VI-NEXT: v_min_u32_e32 v2, 32, v2 1458; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] 1459; VI-NEXT: v_min_u32_e32 v0, 1, v0 1460; VI-NEXT: v_or_b32_e32 v0, v1, v0 1461; VI-NEXT: v_cvt_f32_i32_e32 v0, v0 1462; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 1463; VI-NEXT: v_ldexp_f32 v0, v0, v1 1464; VI-NEXT: s_setpc_b64 s[30:31] 1465 %masked = and i64 %arg0, 255 1466 %itofp = sitofp i64 %masked to float 1467 ret float %itofp 1468} 1469 1470define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) { 1471; SI-LABEL: v_test_uitofp_i64_byte_to_f32: 1472; SI: ; %bb.0: 1473; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1474; SI-NEXT: v_ffbh_u32_e32 v2, 0 1475; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 1476; SI-NEXT: v_mov_b32_e32 v1, 0 1477; SI-NEXT: v_min_u32_e32 v2, 32, v2 1478; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2 1479; SI-NEXT: v_min_u32_e32 v0, 1, v0 1480; 
SI-NEXT: v_or_b32_e32 v0, v1, v0 1481; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 1482; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v2 1483; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1 1484; SI-NEXT: s_setpc_b64 s[30:31] 1485; 1486; VI-LABEL: v_test_uitofp_i64_byte_to_f32: 1487; VI: ; %bb.0: 1488; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1489; VI-NEXT: v_ffbh_u32_e32 v2, 0 1490; VI-NEXT: v_and_b32_e32 v0, 0xff, v0 1491; VI-NEXT: v_mov_b32_e32 v1, 0 1492; VI-NEXT: v_min_u32_e32 v2, 32, v2 1493; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] 1494; VI-NEXT: v_min_u32_e32 v0, 1, v0 1495; VI-NEXT: v_or_b32_e32 v0, v1, v0 1496; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 1497; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 1498; VI-NEXT: v_ldexp_f32 v0, v0, v1 1499; VI-NEXT: s_setpc_b64 s[30:31] 1500 %masked = and i64 %arg0, 255 1501 %itofp = uitofp i64 %masked to float 1502 ret float %itofp 1503} 1504 1505define float @v_test_sitofp_i16_byte_to_f32(i16 %arg0) { 1506; SI-LABEL: v_test_sitofp_i16_byte_to_f32: 1507; SI: ; %bb.0: 1508; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1509; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 1510; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 1511; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1512; SI-NEXT: s_setpc_b64 s[30:31] 1513; 1514; VI-LABEL: v_test_sitofp_i16_byte_to_f32: 1515; VI: ; %bb.0: 1516; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1517; VI-NEXT: v_and_b32_e32 v0, 0xff, v0 1518; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 1519; VI-NEXT: s_setpc_b64 s[30:31] 1520 %masked = and i16 %arg0, 255 1521 %itofp = sitofp i16 %masked to float 1522 ret float %itofp 1523} 1524 1525define float @v_test_uitofp_i16_byte_to_f32(i16 %arg0) { 1526; SI-LABEL: v_test_uitofp_i16_byte_to_f32: 1527; SI: ; %bb.0: 1528; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1529; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 1530; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 1531; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1532; SI-NEXT: s_setpc_b64 s[30:31] 1533; 1534; VI-LABEL: 
v_test_uitofp_i16_byte_to_f32: 1535; VI: ; %bb.0: 1536; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1537; VI-NEXT: v_mov_b32_e32 v1, 0xffff 1538; VI-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 1539; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1540; VI-NEXT: s_setpc_b64 s[30:31] 1541 %masked = and i16 %arg0, 255 1542 %itofp = uitofp i16 %masked to float 1543 ret float %itofp 1544} 1545