1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,SI 3; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,VI 4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx908 -start-before=amdgpu-isel -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9 6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11 7 8declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 9declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone 10 11define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind { 12; GCN-LABEL: v_uitofp_i32_to_f32_mask255: 13; GCN: ; %bb.0: 14; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 15; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 16; GCN-NEXT: s_setpc_b64 s[30:31] 17; 18; GFX10-LABEL: v_uitofp_i32_to_f32_mask255: 19; GFX10: ; %bb.0: 20; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 21; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 22; GFX10-NEXT: s_setpc_b64 s[30:31] 23; 24; GFX9-LABEL: v_uitofp_i32_to_f32_mask255: 25; GFX9: ; %bb.0: 26; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 28; GFX9-NEXT: s_setpc_b64 s[30:31] 29; 30; GFX11-LABEL: v_uitofp_i32_to_f32_mask255: 31; GFX11: ; %bb.0: 32; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 33; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 34; GFX11-NEXT: s_setpc_b64 s[30:31] 35 %masked = and i32 %arg0, 255 36 %cvt = uitofp i32 %masked to float 37 ret float %cvt 38} 39 40define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind { 41; GCN-LABEL: v_sitofp_i32_to_f32_mask255: 42; GCN: ; %bb.0: 43; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 44; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 45; GCN-NEXT: 
s_setpc_b64 s[30:31] 46; 47; GFX10-LABEL: v_sitofp_i32_to_f32_mask255: 48; GFX10: ; %bb.0: 49; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 50; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 51; GFX10-NEXT: s_setpc_b64 s[30:31] 52; 53; GFX9-LABEL: v_sitofp_i32_to_f32_mask255: 54; GFX9: ; %bb.0: 55; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 56; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 57; GFX9-NEXT: s_setpc_b64 s[30:31] 58; 59; GFX11-LABEL: v_sitofp_i32_to_f32_mask255: 60; GFX11: ; %bb.0: 61; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 62; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 63; GFX11-NEXT: s_setpc_b64 s[30:31] 64 %masked = and i32 %arg0, 255 65 %cvt = sitofp i32 %masked to float 66 ret float %cvt 67} 68 69define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind { 70; GCN-LABEL: v_uitofp_to_f32_lshr7_mask255: 71; GCN: ; %bb.0: 72; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 73; GCN-NEXT: v_lshrrev_b32_e32 v0, 7, v0 74; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 75; GCN-NEXT: s_setpc_b64 s[30:31] 76; 77; GFX10-LABEL: v_uitofp_to_f32_lshr7_mask255: 78; GFX10: ; %bb.0: 79; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 80; GFX10-NEXT: v_lshrrev_b32_e32 v0, 7, v0 81; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 82; GFX10-NEXT: s_setpc_b64 s[30:31] 83; 84; GFX9-LABEL: v_uitofp_to_f32_lshr7_mask255: 85; GFX9: ; %bb.0: 86; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 87; GFX9-NEXT: v_lshrrev_b32_e32 v0, 7, v0 88; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 89; GFX9-NEXT: s_setpc_b64 s[30:31] 90; 91; GFX11-LABEL: v_uitofp_to_f32_lshr7_mask255: 92; GFX11: ; %bb.0: 93; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 94; GFX11-NEXT: v_lshrrev_b32_e32 v0, 7, v0 95; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 96; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 97; GFX11-NEXT: s_setpc_b64 s[30:31] 98 %lshr.7 = lshr i32 %arg0, 7 99 %masked = and i32 %lshr.7, 255 100 %cvt = uitofp i32 %masked to float 101 ret float %cvt 102} 103 104define float 
@v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind { 105; GCN-LABEL: v_uitofp_to_f32_lshr8_mask255: 106; GCN: ; %bb.0: 107; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 108; GCN-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 109; GCN-NEXT: s_setpc_b64 s[30:31] 110; 111; GFX10-LABEL: v_uitofp_to_f32_lshr8_mask255: 112; GFX10: ; %bb.0: 113; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 114; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 115; GFX10-NEXT: s_setpc_b64 s[30:31] 116; 117; GFX9-LABEL: v_uitofp_to_f32_lshr8_mask255: 118; GFX9: ; %bb.0: 119; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 120; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 121; GFX9-NEXT: s_setpc_b64 s[30:31] 122; 123; GFX11-LABEL: v_uitofp_to_f32_lshr8_mask255: 124; GFX11: ; %bb.0: 125; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 126; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 127; GFX11-NEXT: s_setpc_b64 s[30:31] 128 %lshr.8 = lshr i32 %arg0, 8 129 %masked = and i32 %lshr.8, 255 130 %cvt = uitofp i32 %masked to float 131 ret float %cvt 132} 133 134define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind { 135; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255: 136; SI: ; %bb.0: 137; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 138; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 139; SI-NEXT: s_mov_b32 s7, 0xf000 140; SI-NEXT: s_mov_b32 s6, -1 141; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 142; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 143; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 144; SI-NEXT: s_setpc_b64 s[30:31] 145; 146; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255: 147; VI: ; %bb.0: 148; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 149; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 150; VI-NEXT: s_mov_b32 s7, 0xf000 151; VI-NEXT: s_mov_b32 s6, -1 152; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 153; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 154; VI-NEXT: s_waitcnt vmcnt(0) 155; VI-NEXT: s_setpc_b64 s[30:31] 156; 157; GFX10-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255: 158; GFX10: ; 
%bb.0: 159; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 160; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 161; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 162; GFX10-NEXT: global_store_dword v[0:1], v1, off 163; GFX10-NEXT: s_setpc_b64 s[30:31] 164; 165; GFX9-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255: 166; GFX9: ; %bb.0: 167; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 168; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 169; GFX9-NEXT: global_store_dword v[0:1], v1, off 170; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 171; GFX9-NEXT: s_waitcnt vmcnt(0) 172; GFX9-NEXT: s_setpc_b64 s[30:31] 173; 174; GFX11-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255: 175; GFX11: ; %bb.0: 176; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 177; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 178; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 179; GFX11-NEXT: global_store_b32 v[0:1], v1, off 180; GFX11-NEXT: s_setpc_b64 s[30:31] 181 %lshr.8 = lshr i32 %arg0, 8 182 store i32 %lshr.8, ptr addrspace(1) undef 183 %masked = and i32 %lshr.8, 255 184 %cvt = uitofp i32 %masked to float 185 ret float %cvt 186} 187 188define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind { 189; GCN-LABEL: v_uitofp_to_f32_lshr16_mask255: 190; GCN: ; %bb.0: 191; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 192; GCN-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 193; GCN-NEXT: s_setpc_b64 s[30:31] 194; 195; GFX10-LABEL: v_uitofp_to_f32_lshr16_mask255: 196; GFX10: ; %bb.0: 197; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 198; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 199; GFX10-NEXT: s_setpc_b64 s[30:31] 200; 201; GFX9-LABEL: v_uitofp_to_f32_lshr16_mask255: 202; GFX9: ; %bb.0: 203; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 204; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 205; GFX9-NEXT: s_setpc_b64 s[30:31] 206; 207; GFX11-LABEL: v_uitofp_to_f32_lshr16_mask255: 208; GFX11: ; %bb.0: 209; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 210; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 211; GFX11-NEXT: s_setpc_b64 
s[30:31] 212 %lshr.16 = lshr i32 %arg0, 16 213 %masked = and i32 %lshr.16, 255 214 %cvt = uitofp i32 %masked to float 215 ret float %cvt 216} 217 218define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind { 219; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255: 220; GCN: ; %bb.0: 221; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 222; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 223; GCN-NEXT: s_setpc_b64 s[30:31] 224; 225; GFX10-LABEL: v_uitofp_to_f32_lshr24_mask255: 226; GFX10: ; %bb.0: 227; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 228; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 229; GFX10-NEXT: s_setpc_b64 s[30:31] 230; 231; GFX9-LABEL: v_uitofp_to_f32_lshr24_mask255: 232; GFX9: ; %bb.0: 233; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 234; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 235; GFX9-NEXT: s_setpc_b64 s[30:31] 236; 237; GFX11-LABEL: v_uitofp_to_f32_lshr24_mask255: 238; GFX11: ; %bb.0: 239; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 240; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 241; GFX11-NEXT: s_setpc_b64 s[30:31] 242 %lshr.24 = lshr i32 %arg0, 24 243 %masked = and i32 %lshr.24, 255 244 %cvt = uitofp i32 %masked to float 245 ret float %cvt 246} 247 248define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind { 249; GCN-LABEL: v_uitofp_i8_to_f32: 250; GCN: ; %bb.0: 251; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 252; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 253; GCN-NEXT: s_setpc_b64 s[30:31] 254; 255; GFX10-LABEL: v_uitofp_i8_to_f32: 256; GFX10: ; %bb.0: 257; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 258; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 259; GFX10-NEXT: s_setpc_b64 s[30:31] 260; 261; GFX9-LABEL: v_uitofp_i8_to_f32: 262; GFX9: ; %bb.0: 263; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 264; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 265; GFX9-NEXT: s_setpc_b64 s[30:31] 266; 267; GFX11-LABEL: v_uitofp_i8_to_f32: 268; GFX11: ; %bb.0: 269; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 270; GFX11-NEXT: 
v_cvt_f32_ubyte0_e32 v0, v0 271; GFX11-NEXT: s_setpc_b64 s[30:31] 272 %cvt = uitofp i8 %arg0 to float 273 ret float %cvt 274} 275 276define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind { 277; GCN-LABEL: v_uitofp_v2i8_to_v2f32: 278; GCN: ; %bb.0: 279; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 280; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 281; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 282; GCN-NEXT: v_mov_b32_e32 v0, v2 283; GCN-NEXT: s_setpc_b64 s[30:31] 284; 285; GFX10-LABEL: v_uitofp_v2i8_to_v2f32: 286; GFX10: ; %bb.0: 287; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 288; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 289; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 290; GFX10-NEXT: v_mov_b32_e32 v0, v2 291; GFX10-NEXT: s_setpc_b64 s[30:31] 292; 293; GFX9-LABEL: v_uitofp_v2i8_to_v2f32: 294; GFX9: ; %bb.0: 295; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 296; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 297; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 298; GFX9-NEXT: v_mov_b32_e32 v0, v2 299; GFX9-NEXT: s_setpc_b64 s[30:31] 300; 301; GFX11-LABEL: v_uitofp_v2i8_to_v2f32: 302; GFX11: ; %bb.0: 303; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 304; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 305; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 306; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 307; GFX11-NEXT: v_mov_b32_e32 v0, v2 308; GFX11-NEXT: s_setpc_b64 s[30:31] 309 %val = bitcast i16 %arg0 to <2 x i8> 310 %cvt = uitofp <2 x i8> %val to <2 x float> 311 ret <2 x float> %cvt 312} 313 314define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind { 315; GCN-LABEL: v_uitofp_v3i8_to_v3f32: 316; GCN: ; %bb.0: 317; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 318; GCN-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 319; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 320; GCN-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 321; GCN-NEXT: v_mov_b32_e32 v0, v3 322; GCN-NEXT: s_setpc_b64 s[30:31] 323; 324; GFX10-LABEL: v_uitofp_v3i8_to_v3f32: 325; GFX10: ; %bb.0: 326; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 327; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 328; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 329; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 330; GFX10-NEXT: v_mov_b32_e32 v0, v3 331; GFX10-NEXT: s_setpc_b64 s[30:31] 332; 333; GFX9-LABEL: v_uitofp_v3i8_to_v3f32: 334; GFX9: ; %bb.0: 335; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 336; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 337; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 338; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 339; GFX9-NEXT: v_mov_b32_e32 v0, v3 340; GFX9-NEXT: s_setpc_b64 s[30:31] 341; 342; GFX11-LABEL: v_uitofp_v3i8_to_v3f32: 343; GFX11: ; %bb.0: 344; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 345; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 346; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 347; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 348; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 349; GFX11-NEXT: v_mov_b32_e32 v0, v3 350; GFX11-NEXT: s_setpc_b64 s[30:31] 351 %trunc = trunc i32 %arg0 to i24 352 %val = bitcast i24 %trunc to <3 x i8> 353 %cvt = uitofp <3 x i8> %val to <3 x float> 354 ret <3 x float> %cvt 355} 356 357define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind { 358; GCN-LABEL: v_uitofp_v4i8_to_v4f32: 359; GCN: ; %bb.0: 360; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 361; GCN-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 362; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 363; GCN-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 364; GCN-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 365; GCN-NEXT: v_mov_b32_e32 v0, v4 366; GCN-NEXT: s_setpc_b64 s[30:31] 367; 368; GFX10-LABEL: v_uitofp_v4i8_to_v4f32: 369; GFX10: ; %bb.0: 370; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 371; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 372; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 373; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 374; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 375; GFX10-NEXT: v_mov_b32_e32 v0, v4 376; GFX10-NEXT: s_setpc_b64 s[30:31] 377; 378; GFX9-LABEL: v_uitofp_v4i8_to_v4f32: 379; GFX9: ; %bb.0: 380; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 381; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 382; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 383; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 384; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 385; GFX9-NEXT: v_mov_b32_e32 v0, v4 386; GFX9-NEXT: s_setpc_b64 s[30:31] 387; 388; GFX11-LABEL: v_uitofp_v4i8_to_v4f32: 389; GFX11: ; %bb.0: 390; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 391; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 392; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 393; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 394; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 395; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 396; GFX11-NEXT: v_mov_b32_e32 v0, v4 397; GFX11-NEXT: s_setpc_b64 s[30:31] 398 %val = bitcast i32 %arg0 to <4 x i8> 399 %cvt = uitofp <4 x i8> %val to <4 x float> 400 ret <4 x float> %cvt 401} 402 403define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind { 404; GCN-LABEL: v_uitofp_unpack_i32_to_v4f32: 405; GCN: ; %bb.0: 406; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 407; GCN-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 408; GCN-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 409; GCN-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 410; GCN-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 411; GCN-NEXT: v_mov_b32_e32 v0, v4 412; GCN-NEXT: s_setpc_b64 s[30:31] 413; 414; GFX10-LABEL: v_uitofp_unpack_i32_to_v4f32: 415; GFX10: ; %bb.0: 416; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 417; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 418; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 419; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 420; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 421; GFX10-NEXT: v_mov_b32_e32 v0, v4 422; GFX10-NEXT: s_setpc_b64 s[30:31] 423; 424; GFX9-LABEL: v_uitofp_unpack_i32_to_v4f32: 425; GFX9: ; %bb.0: 426; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 427; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 428; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 429; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 430; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 431; GFX9-NEXT: 
v_mov_b32_e32 v0, v4 432; GFX9-NEXT: s_setpc_b64 s[30:31] 433; 434; GFX11-LABEL: v_uitofp_unpack_i32_to_v4f32: 435; GFX11: ; %bb.0: 436; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 437; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 438; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 439; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 440; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 441; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 442; GFX11-NEXT: v_mov_b32_e32 v0, v4 443; GFX11-NEXT: s_setpc_b64 s[30:31] 444 %mask.arg0 = and i32 %arg0, 255 445 %cvt0 = uitofp i32 %mask.arg0 to float 446 447 %lshr.8 = lshr i32 %arg0, 8 448 %mask.lshr.8 = and i32 %lshr.8, 255 449 %cvt1 = uitofp i32 %mask.lshr.8 to float 450 451 %lshr.16 = lshr i32 %arg0, 16 452 %mask.lshr.16 = and i32 %lshr.16, 255 453 %cvt2 = uitofp i32 %mask.lshr.16 to float 454 455 %lshr.24 = lshr i32 %arg0, 24 456 %mask.lshr.24 = and i32 %lshr.24, 255 457 %cvt3 = uitofp i32 %mask.lshr.24 to float 458 459 %ins.0 = insertelement <4 x float> undef, float %cvt0, i32 0 460 %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1 461 %ins.2 = insertelement <4 x float> %ins.1, float %cvt2, i32 2 462 %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3 463 ret <4 x float> %ins.3 464} 465 466define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind { 467; SI-LABEL: v_uitofp_i32_to_f16_mask255: 468; SI: ; %bb.0: 469; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 470; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 471; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 472; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 473; SI-NEXT: s_setpc_b64 s[30:31] 474; 475; VI-LABEL: v_uitofp_i32_to_f16_mask255: 476; VI: ; %bb.0: 477; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 478; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 479; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 480; VI-NEXT: s_setpc_b64 s[30:31] 481; 482; GFX10-LABEL: v_uitofp_i32_to_f16_mask255: 483; GFX10: ; %bb.0: 484; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 485; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 
486; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 487; GFX10-NEXT: s_setpc_b64 s[30:31] 488; 489; GFX9-LABEL: v_uitofp_i32_to_f16_mask255: 490; GFX9: ; %bb.0: 491; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 492; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 493; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 494; GFX9-NEXT: s_setpc_b64 s[30:31] 495; 496; GFX11-LABEL: v_uitofp_i32_to_f16_mask255: 497; GFX11: ; %bb.0: 498; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 499; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 500; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 501; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 502; GFX11-NEXT: s_setpc_b64 s[30:31] 503 %masked = and i32 %arg0, 255 504 %cvt = uitofp i32 %masked to half 505 ret half %cvt 506} 507 508define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind { 509; SI-LABEL: v_sitofp_i32_to_f16_mask255: 510; SI: ; %bb.0: 511; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 512; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 513; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 514; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 515; SI-NEXT: s_setpc_b64 s[30:31] 516; 517; VI-LABEL: v_sitofp_i32_to_f16_mask255: 518; VI: ; %bb.0: 519; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 520; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 521; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 522; VI-NEXT: s_setpc_b64 s[30:31] 523; 524; GFX10-LABEL: v_sitofp_i32_to_f16_mask255: 525; GFX10: ; %bb.0: 526; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 527; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 528; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 529; GFX10-NEXT: s_setpc_b64 s[30:31] 530; 531; GFX9-LABEL: v_sitofp_i32_to_f16_mask255: 532; GFX9: ; %bb.0: 533; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 534; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 535; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 536; GFX9-NEXT: s_setpc_b64 s[30:31] 537; 538; GFX11-LABEL: v_sitofp_i32_to_f16_mask255: 539; GFX11: ; %bb.0: 540; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 541; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 542; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 543; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 544; GFX11-NEXT: s_setpc_b64 s[30:31] 545 %masked = and i32 %arg0, 255 546 %cvt = sitofp i32 %masked to half 547 ret half %cvt 548} 549 550define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind { 551; SI-LABEL: v_uitofp_to_f16_lshr8_mask255: 552; SI: ; %bb.0: 553; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 554; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 555; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 556; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 557; SI-NEXT: s_setpc_b64 s[30:31] 558; 559; VI-LABEL: v_uitofp_to_f16_lshr8_mask255: 560; VI: ; %bb.0: 561; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 562; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 563; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 564; VI-NEXT: s_setpc_b64 s[30:31] 565; 566; GFX10-LABEL: v_uitofp_to_f16_lshr8_mask255: 567; GFX10: ; %bb.0: 568; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 569; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 570; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 571; GFX10-NEXT: s_setpc_b64 s[30:31] 572; 573; GFX9-LABEL: v_uitofp_to_f16_lshr8_mask255: 574; GFX9: ; %bb.0: 575; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 576; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 577; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 578; GFX9-NEXT: s_setpc_b64 s[30:31] 579; 580; GFX11-LABEL: v_uitofp_to_f16_lshr8_mask255: 581; GFX11: ; %bb.0: 582; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 583; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 584; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 585; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 586; GFX11-NEXT: s_setpc_b64 s[30:31] 587 %lshr.8 = lshr i32 %arg0, 8 588 %masked = and i32 %lshr.8, 255 589 %cvt = uitofp i32 %masked to half 590 ret half %cvt 591} 592 593define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind { 594; SI-LABEL: v_uitofp_to_f16_lshr16_mask255: 595; SI: ; %bb.0: 596; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 597; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 598; SI-NEXT: 
v_cvt_f16_f32_e32 v0, v0 599; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 600; SI-NEXT: s_setpc_b64 s[30:31] 601; 602; VI-LABEL: v_uitofp_to_f16_lshr16_mask255: 603; VI: ; %bb.0: 604; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 605; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 606; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 607; VI-NEXT: s_setpc_b64 s[30:31] 608; 609; GFX10-LABEL: v_uitofp_to_f16_lshr16_mask255: 610; GFX10: ; %bb.0: 611; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 612; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 613; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 614; GFX10-NEXT: s_setpc_b64 s[30:31] 615; 616; GFX9-LABEL: v_uitofp_to_f16_lshr16_mask255: 617; GFX9: ; %bb.0: 618; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 619; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 620; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 621; GFX9-NEXT: s_setpc_b64 s[30:31] 622; 623; GFX11-LABEL: v_uitofp_to_f16_lshr16_mask255: 624; GFX11: ; %bb.0: 625; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 626; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 627; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 628; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 629; GFX11-NEXT: s_setpc_b64 s[30:31] 630 %lshr.16 = lshr i32 %arg0, 16 631 %masked = and i32 %lshr.16, 255 632 %cvt = uitofp i32 %masked to half 633 ret half %cvt 634} 635 636define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind { 637; SI-LABEL: v_uitofp_to_f16_lshr24_mask255: 638; SI: ; %bb.0: 639; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 640; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 641; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 642; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 643; SI-NEXT: s_setpc_b64 s[30:31] 644; 645; VI-LABEL: v_uitofp_to_f16_lshr24_mask255: 646; VI: ; %bb.0: 647; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 648; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 649; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 650; VI-NEXT: s_setpc_b64 s[30:31] 651; 652; GFX10-LABEL: v_uitofp_to_f16_lshr24_mask255: 653; GFX10: ; %bb.0: 654; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 655; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 656; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 657; GFX10-NEXT: s_setpc_b64 s[30:31] 658; 659; GFX9-LABEL: v_uitofp_to_f16_lshr24_mask255: 660; GFX9: ; %bb.0: 661; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 662; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 663; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 664; GFX9-NEXT: s_setpc_b64 s[30:31] 665; 666; GFX11-LABEL: v_uitofp_to_f16_lshr24_mask255: 667; GFX11: ; %bb.0: 668; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 669; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 670; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 671; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 672; GFX11-NEXT: s_setpc_b64 s[30:31] 673 %lshr.24 = lshr i32 %arg0, 24 674 %masked = and i32 %lshr.24, 255 675 %cvt = uitofp i32 %masked to half 676 ret half %cvt 677} 678 679define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind { 680; SI-LABEL: v_uitofp_i8_to_f16: 681; SI: ; %bb.0: 682; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 683; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 684; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 685; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 686; SI-NEXT: s_setpc_b64 s[30:31] 687; 688; VI-LABEL: v_uitofp_i8_to_f16: 689; VI: ; %bb.0: 690; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 691; VI-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 692; VI-NEXT: s_setpc_b64 s[30:31] 693; 694; GFX10-LABEL: v_uitofp_i8_to_f16: 695; GFX10: ; %bb.0: 696; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 697; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 698; GFX10-NEXT: s_setpc_b64 s[30:31] 699; 700; GFX9-LABEL: v_uitofp_i8_to_f16: 701; GFX9: ; %bb.0: 702; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 703; GFX9-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 704; GFX9-NEXT: s_setpc_b64 s[30:31] 705; 706; GFX11-LABEL: v_uitofp_i8_to_f16: 707; GFX11: ; %bb.0: 708; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 709; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 710; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 711; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 712; GFX11-NEXT: s_setpc_b64 s[30:31] 713 %cvt = uitofp i8 %arg0 to half 714 ret half %cvt 715} 716 717define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind { 718; GCN-LABEL: v_uitofp_i32_to_f64_mask255: 719; GCN: ; %bb.0: 720; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 721; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 722; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 723; GCN-NEXT: s_setpc_b64 s[30:31] 724; 725; GFX10-LABEL: v_uitofp_i32_to_f64_mask255: 726; GFX10: ; %bb.0: 727; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 728; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 729; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 730; GFX10-NEXT: s_setpc_b64 s[30:31] 731; 732; GFX9-LABEL: v_uitofp_i32_to_f64_mask255: 733; GFX9: ; %bb.0: 734; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 735; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 736; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 737; GFX9-NEXT: s_setpc_b64 s[30:31] 738; 739; GFX11-LABEL: v_uitofp_i32_to_f64_mask255: 740; GFX11: ; %bb.0: 741; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 742; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 743; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 744; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 745; GFX11-NEXT: s_setpc_b64 s[30:31] 746 %masked = and i32 %arg0, 255 747 %cvt = uitofp i32 %masked to double 748 ret double %cvt 749} 750 751define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind { 752; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255: 753; GCN: ; %bb.0: 754; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 755; GCN-NEXT: v_bfe_u32 v0, v0, 8, 8 756; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 757; GCN-NEXT: s_setpc_b64 s[30:31] 758; 759; GFX10-LABEL: v_uitofp_to_f64_lshr8_mask255: 760; GFX10: ; %bb.0: 761; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 762; GFX10-NEXT: v_bfe_u32 v0, v0, 8, 8 763; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], 
v0 764; GFX10-NEXT: s_setpc_b64 s[30:31] 765; 766; GFX9-LABEL: v_uitofp_to_f64_lshr8_mask255: 767; GFX9: ; %bb.0: 768; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 769; GFX9-NEXT: v_bfe_u32 v0, v0, 8, 8 770; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 771; GFX9-NEXT: s_setpc_b64 s[30:31] 772; 773; GFX11-LABEL: v_uitofp_to_f64_lshr8_mask255: 774; GFX11: ; %bb.0: 775; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 776; GFX11-NEXT: v_bfe_u32 v0, v0, 8, 8 777; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 778; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 779; GFX11-NEXT: s_setpc_b64 s[30:31] 780 %lshr.8 = lshr i32 %arg0, 8 781 %masked = and i32 %lshr.8, 255 782 %cvt = uitofp i32 %masked to double 783 ret double %cvt 784} 785 786define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind { 787; GCN-LABEL: v_uitofp_to_f64_lshr16_mask255: 788; GCN: ; %bb.0: 789; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 790; GCN-NEXT: v_bfe_u32 v0, v0, 16, 8 791; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 792; GCN-NEXT: s_setpc_b64 s[30:31] 793; 794; GFX10-LABEL: v_uitofp_to_f64_lshr16_mask255: 795; GFX10: ; %bb.0: 796; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 797; GFX10-NEXT: v_bfe_u32 v0, v0, 16, 8 798; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 799; GFX10-NEXT: s_setpc_b64 s[30:31] 800; 801; GFX9-LABEL: v_uitofp_to_f64_lshr16_mask255: 802; GFX9: ; %bb.0: 803; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 804; GFX9-NEXT: v_bfe_u32 v0, v0, 16, 8 805; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 806; GFX9-NEXT: s_setpc_b64 s[30:31] 807; 808; GFX11-LABEL: v_uitofp_to_f64_lshr16_mask255: 809; GFX11: ; %bb.0: 810; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 811; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8 812; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 813; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 814; GFX11-NEXT: s_setpc_b64 s[30:31] 815 %lshr.16 = lshr i32 %arg0, 16 816 %masked = and i32 %lshr.16, 255 817 %cvt = uitofp i32 %masked to double 818 ret double %cvt 819} 820 
821define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind { 822; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255: 823; GCN: ; %bb.0: 824; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 825; GCN-NEXT: v_lshrrev_b32_e32 v0, 24, v0 826; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 827; GCN-NEXT: s_setpc_b64 s[30:31] 828; 829; GFX10-LABEL: v_uitofp_to_f64_lshr24_mask255: 830; GFX10: ; %bb.0: 831; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 832; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 833; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 834; GFX10-NEXT: s_setpc_b64 s[30:31] 835; 836; GFX9-LABEL: v_uitofp_to_f64_lshr24_mask255: 837; GFX9: ; %bb.0: 838; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 839; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 840; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 841; GFX9-NEXT: s_setpc_b64 s[30:31] 842; 843; GFX11-LABEL: v_uitofp_to_f64_lshr24_mask255: 844; GFX11: ; %bb.0: 845; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 846; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0 847; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 848; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 849; GFX11-NEXT: s_setpc_b64 s[30:31] 850 %lshr.24 = lshr i32 %arg0, 24 851 %masked = and i32 %lshr.24, 255 852 %cvt = uitofp i32 %masked to double 853 ret double %cvt 854} 855 856define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind { 857; SI-LABEL: v_uitofp_i8_to_f64: 858; SI: ; %bb.0: 859; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 860; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 861; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 862; SI-NEXT: s_setpc_b64 s[30:31] 863; 864; VI-LABEL: v_uitofp_i8_to_f64: 865; VI: ; %bb.0: 866; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 867; VI-NEXT: v_mov_b32_e32 v1, 0xffff 868; VI-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 869; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 870; VI-NEXT: s_setpc_b64 s[30:31] 871; 872; GFX10-LABEL: v_uitofp_i8_to_f64: 873; GFX10: ; %bb.0: 874; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 875; GFX10-NEXT: v_mov_b32_e32 v1, 0xffff 876; GFX10-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 877; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 878; GFX10-NEXT: s_setpc_b64 s[30:31] 879; 880; GFX9-LABEL: v_uitofp_i8_to_f64: 881; GFX9: ; %bb.0: 882; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 883; GFX9-NEXT: s_mov_b32 s4, 0xffff 884; GFX9-NEXT: v_and_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 885; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 886; GFX9-NEXT: s_setpc_b64 s[30:31] 887; 888; GFX11-LABEL: v_uitofp_i8_to_f64: 889; GFX11: ; %bb.0: 890; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 891; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 892; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 893; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 894; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 895; GFX11-NEXT: s_setpc_b64 s[30:31] 896 %cvt = uitofp i8 %arg0 to double 897 ret double %cvt 898} 899 900define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 901; SI-LABEL: load_i8_to_f32: 902; SI: ; %bb.0: 903; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 904; SI-NEXT: s_mov_b32 s7, 0xf000 905; SI-NEXT: v_mov_b32_e32 v1, 0 906; SI-NEXT: s_mov_b32 s10, 0 907; SI-NEXT: s_mov_b32 s11, s7 908; SI-NEXT: s_waitcnt lgkmcnt(0) 909; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 910; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 911; SI-NEXT: s_mov_b32 s6, -1 912; SI-NEXT: s_mov_b32 s4, s0 913; SI-NEXT: s_mov_b32 s5, s1 914; SI-NEXT: s_waitcnt vmcnt(0) 915; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 916; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 917; SI-NEXT: s_endpgm 918; 919; VI-LABEL: load_i8_to_f32: 920; VI: ; %bb.0: 921; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 922; VI-NEXT: s_waitcnt lgkmcnt(0) 923; VI-NEXT: v_mov_b32_e32 v1, s3 924; VI-NEXT: v_add_u32_e32 v0, vcc, 
s2, v0 925; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 926; VI-NEXT: flat_load_ubyte v0, v[0:1] 927; VI-NEXT: s_mov_b32 s3, 0xf000 928; VI-NEXT: s_mov_b32 s2, -1 929; VI-NEXT: s_waitcnt vmcnt(0) 930; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 931; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 932; VI-NEXT: s_endpgm 933; 934; GFX10-LABEL: load_i8_to_f32: 935; GFX10: ; %bb.0: 936; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 937; GFX10-NEXT: v_mov_b32_e32 v1, 0 938; GFX10-NEXT: s_waitcnt lgkmcnt(0) 939; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] 940; GFX10-NEXT: s_waitcnt vmcnt(0) 941; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 942; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 943; GFX10-NEXT: s_endpgm 944; 945; GFX9-LABEL: load_i8_to_f32: 946; GFX9: ; %bb.0: 947; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 948; GFX9-NEXT: v_mov_b32_e32 v1, 0 949; GFX9-NEXT: s_waitcnt lgkmcnt(0) 950; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] 951; GFX9-NEXT: s_waitcnt vmcnt(0) 952; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 953; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 954; GFX9-NEXT: s_endpgm 955; 956; GFX11-LABEL: load_i8_to_f32: 957; GFX11: ; %bb.0: 958; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 959; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 960; GFX11-NEXT: s_waitcnt lgkmcnt(0) 961; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] 962; GFX11-NEXT: s_waitcnt vmcnt(0) 963; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 964; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 965; GFX11-NEXT: s_endpgm 966 %tid = call i32 @llvm.amdgcn.workitem.id.x() 967 %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid 968 %load = load i8, ptr addrspace(1) %gep, align 1 969 %cvt = uitofp i8 %load to float 970 store float %cvt, ptr addrspace(1) %out, align 4 971 ret void 972} 973 974define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 975; SI-LABEL: load_v2i8_to_v2f32: 976; SI: ; %bb.0: 977; SI-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x9 978; SI-NEXT: s_mov_b32 s7, 0xf000 979; SI-NEXT: s_mov_b32 s10, 0 980; SI-NEXT: s_mov_b32 s11, s7 981; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 982; SI-NEXT: s_waitcnt lgkmcnt(0) 983; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 984; SI-NEXT: v_mov_b32_e32 v1, 0 985; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 986; SI-NEXT: s_mov_b32 s6, -1 987; SI-NEXT: s_mov_b32 s4, s0 988; SI-NEXT: s_mov_b32 s5, s1 989; SI-NEXT: s_waitcnt vmcnt(0) 990; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 991; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 992; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 993; SI-NEXT: s_endpgm 994; 995; VI-LABEL: load_v2i8_to_v2f32: 996; VI: ; %bb.0: 997; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 998; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 999; VI-NEXT: s_waitcnt lgkmcnt(0) 1000; VI-NEXT: v_mov_b32_e32 v1, s3 1001; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1002; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1003; VI-NEXT: flat_load_ushort v0, v[0:1] 1004; VI-NEXT: s_mov_b32 s3, 0xf000 1005; VI-NEXT: s_mov_b32 s2, -1 1006; VI-NEXT: s_waitcnt vmcnt(0) 1007; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 1008; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1009; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1010; VI-NEXT: s_endpgm 1011; 1012; GFX10-LABEL: load_v2i8_to_v2f32: 1013; GFX10: ; %bb.0: 1014; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1015; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1016; GFX10-NEXT: v_mov_b32_e32 v2, 0 1017; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1018; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] 1019; GFX10-NEXT: s_waitcnt vmcnt(0) 1020; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 1021; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1022; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1023; GFX10-NEXT: s_endpgm 1024; 1025; GFX9-LABEL: load_v2i8_to_v2f32: 1026; GFX9: ; %bb.0: 1027; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1028; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1029; GFX9-NEXT: v_mov_b32_e32 v2, 0 1030; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1031; 
GFX9-NEXT: global_load_ushort v0, v0, s[2:3] 1032; GFX9-NEXT: s_waitcnt vmcnt(0) 1033; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 1034; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1035; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1036; GFX9-NEXT: s_endpgm 1037; 1038; GFX11-LABEL: load_v2i8_to_v2f32: 1039; GFX11: ; %bb.0: 1040; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1041; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1042; GFX11-NEXT: v_mov_b32_e32 v2, 0 1043; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1044; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1045; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1046; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] 1047; GFX11-NEXT: s_waitcnt vmcnt(0) 1048; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 1049; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1050; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1051; GFX11-NEXT: s_endpgm 1052 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1053 %gep = getelementptr <2 x i8>, ptr addrspace(1) %in, i32 %tid 1054 %load = load <2 x i8>, ptr addrspace(1) %gep, align 2 1055 %cvt = uitofp <2 x i8> %load to <2 x float> 1056 store <2 x float> %cvt, ptr addrspace(1) %out, align 16 1057 ret void 1058} 1059 1060define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 1061; SI-LABEL: load_v3i8_to_v3f32: 1062; SI: ; %bb.0: 1063; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1064; SI-NEXT: s_mov_b32 s7, 0xf000 1065; SI-NEXT: s_mov_b32 s10, 0 1066; SI-NEXT: s_mov_b32 s11, s7 1067; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1068; SI-NEXT: s_waitcnt lgkmcnt(0) 1069; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1070; SI-NEXT: v_mov_b32_e32 v1, 0 1071; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 1072; SI-NEXT: s_mov_b32 s6, -1 1073; SI-NEXT: s_mov_b32 s4, s0 1074; SI-NEXT: s_mov_b32 s5, s1 1075; SI-NEXT: s_waitcnt vmcnt(0) 1076; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v2 1077; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 1078; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 1079; SI-NEXT: buffer_store_dword 
v2, off, s[4:7], 0 offset:8 1080; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1081; SI-NEXT: s_endpgm 1082; 1083; VI-LABEL: load_v3i8_to_v3f32: 1084; VI: ; %bb.0: 1085; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1086; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1087; VI-NEXT: s_waitcnt lgkmcnt(0) 1088; VI-NEXT: v_mov_b32_e32 v1, s3 1089; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1090; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1091; VI-NEXT: flat_load_dword v0, v[0:1] 1092; VI-NEXT: s_mov_b32 s3, 0xf000 1093; VI-NEXT: s_mov_b32 s2, -1 1094; VI-NEXT: s_waitcnt vmcnt(0) 1095; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 1096; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 1097; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1098; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 1099; VI-NEXT: s_endpgm 1100; 1101; GFX10-LABEL: load_v3i8_to_v3f32: 1102; GFX10: ; %bb.0: 1103; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1104; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1105; GFX10-NEXT: v_mov_b32_e32 v3, 0 1106; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1107; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1108; GFX10-NEXT: s_waitcnt vmcnt(0) 1109; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 1110; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 1111; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1112; GFX10-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] 1113; GFX10-NEXT: s_endpgm 1114; 1115; GFX9-LABEL: load_v3i8_to_v3f32: 1116; GFX9: ; %bb.0: 1117; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1118; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1119; GFX9-NEXT: v_mov_b32_e32 v3, 0 1120; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1121; GFX9-NEXT: global_load_dword v0, v0, s[2:3] 1122; GFX9-NEXT: s_waitcnt vmcnt(0) 1123; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 1124; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 1125; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1126; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] 1127; GFX9-NEXT: s_endpgm 1128; 1129; GFX11-LABEL: load_v3i8_to_v3f32: 1130; GFX11: ; %bb.0: 1131; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 
0x24 1132; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1133; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1134; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1135; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1136; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 1137; GFX11-NEXT: s_waitcnt vmcnt(0) 1138; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 1139; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 1140; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1141; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] 1142; GFX11-NEXT: s_endpgm 1143 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1144 %gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid 1145 %load = load <3 x i8>, ptr addrspace(1) %gep, align 4 1146 %cvt = uitofp <3 x i8> %load to <3 x float> 1147 store <3 x float> %cvt, ptr addrspace(1) %out, align 16 1148 ret void 1149} 1150 1151define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 1152; SI-LABEL: load_v4i8_to_v4f32: 1153; SI: ; %bb.0: 1154; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1155; SI-NEXT: s_mov_b32 s7, 0xf000 1156; SI-NEXT: s_mov_b32 s10, 0 1157; SI-NEXT: s_mov_b32 s11, s7 1158; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1159; SI-NEXT: s_waitcnt lgkmcnt(0) 1160; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1161; SI-NEXT: v_mov_b32_e32 v1, 0 1162; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1163; SI-NEXT: s_mov_b32 s6, -1 1164; SI-NEXT: s_mov_b32 s4, s0 1165; SI-NEXT: s_mov_b32 s5, s1 1166; SI-NEXT: s_waitcnt vmcnt(0) 1167; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 1168; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 1169; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 1170; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1171; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1172; SI-NEXT: s_endpgm 1173; 1174; VI-LABEL: load_v4i8_to_v4f32: 1175; VI: ; %bb.0: 1176; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1177; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1178; VI-NEXT: s_waitcnt lgkmcnt(0) 1179; VI-NEXT: v_mov_b32_e32 v1, s3 1180; VI-NEXT: 
v_add_u32_e32 v0, vcc, s2, v0 1181; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1182; VI-NEXT: flat_load_dword v0, v[0:1] 1183; VI-NEXT: s_mov_b32 s3, 0xf000 1184; VI-NEXT: s_mov_b32 s2, -1 1185; VI-NEXT: s_waitcnt vmcnt(0) 1186; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 1187; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 1188; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 1189; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1190; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1191; VI-NEXT: s_endpgm 1192; 1193; GFX10-LABEL: load_v4i8_to_v4f32: 1194; GFX10: ; %bb.0: 1195; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1196; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1197; GFX10-NEXT: v_mov_b32_e32 v4, 0 1198; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1199; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1200; GFX10-NEXT: s_waitcnt vmcnt(0) 1201; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 1202; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 1203; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 1204; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1205; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1206; GFX10-NEXT: s_endpgm 1207; 1208; GFX9-LABEL: load_v4i8_to_v4f32: 1209; GFX9: ; %bb.0: 1210; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1211; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1212; GFX9-NEXT: v_mov_b32_e32 v4, 0 1213; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1214; GFX9-NEXT: global_load_dword v0, v0, s[2:3] 1215; GFX9-NEXT: s_waitcnt vmcnt(0) 1216; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 1217; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 1218; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 1219; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1220; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1221; GFX9-NEXT: s_endpgm 1222; 1223; GFX11-LABEL: load_v4i8_to_v4f32: 1224; GFX11: ; %bb.0: 1225; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1226; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1227; GFX11-NEXT: v_mov_b32_e32 v4, 0 1228; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1229; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1230; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) 1231; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 1232; GFX11-NEXT: s_waitcnt vmcnt(0) 1233; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 1234; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 1235; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 1236; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1237; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 1238; GFX11-NEXT: s_endpgm 1239 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1240 %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid 1241 %load = load <4 x i8>, ptr addrspace(1) %gep, align 4 1242 %cvt = uitofp <4 x i8> %load to <4 x float> 1243 store <4 x float> %cvt, ptr addrspace(1) %out, align 16 1244 ret void 1245} 1246 1247; This should not be adding instructions to shift into the correct 1248; position in the word for the component. 1249 1250; FIXME: Packing bytes 1251define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 1252; SI-LABEL: load_v4i8_to_v4f32_unaligned: 1253; SI: ; %bb.0: 1254; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1255; SI-NEXT: s_mov_b32 s7, 0xf000 1256; SI-NEXT: s_mov_b32 s10, 0 1257; SI-NEXT: s_mov_b32 s11, s7 1258; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1259; SI-NEXT: s_waitcnt lgkmcnt(0) 1260; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1261; SI-NEXT: v_mov_b32_e32 v1, 0 1262; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[8:11], 0 addr64 offset:3 1263; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[8:11], 0 addr64 offset:2 1264; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[8:11], 0 addr64 offset:1 1265; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 1266; SI-NEXT: s_mov_b32 s6, -1 1267; SI-NEXT: s_mov_b32 s4, s0 1268; SI-NEXT: s_mov_b32 s5, s1 1269; SI-NEXT: s_waitcnt vmcnt(3) 1270; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v2 1271; SI-NEXT: s_waitcnt vmcnt(2) 1272; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 1273; SI-NEXT: s_waitcnt vmcnt(1) 1274; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 1275; SI-NEXT: s_waitcnt vmcnt(0) 1276; SI-NEXT: 
v_cvt_f32_ubyte0_e32 v0, v0 1277; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1278; SI-NEXT: s_endpgm 1279; 1280; VI-LABEL: load_v4i8_to_v4f32_unaligned: 1281; VI: ; %bb.0: 1282; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1283; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1284; VI-NEXT: s_waitcnt lgkmcnt(0) 1285; VI-NEXT: v_mov_b32_e32 v1, s3 1286; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1287; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1288; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 1289; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 1290; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0 1291; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 1292; VI-NEXT: flat_load_ubyte v2, v[2:3] 1293; VI-NEXT: flat_load_ubyte v3, v[4:5] 1294; VI-NEXT: flat_load_ubyte v4, v[0:1] 1295; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 1296; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1297; VI-NEXT: flat_load_ubyte v1, v[0:1] 1298; VI-NEXT: s_mov_b32 s3, 0xf000 1299; VI-NEXT: s_mov_b32 s2, -1 1300; VI-NEXT: s_waitcnt vmcnt(3) 1301; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 1302; VI-NEXT: s_waitcnt vmcnt(2) 1303; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 1304; VI-NEXT: s_waitcnt vmcnt(1) 1305; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 1306; VI-NEXT: s_waitcnt vmcnt(0) 1307; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 1308; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1309; VI-NEXT: s_endpgm 1310; 1311; GFX10-LABEL: load_v4i8_to_v4f32_unaligned: 1312; GFX10: ; %bb.0: 1313; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1314; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1315; GFX10-NEXT: v_mov_b32_e32 v6, 0 1316; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1317; GFX10-NEXT: s_clause 0x3 1318; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 1319; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 1320; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 1321; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] 1322; GFX10-NEXT: s_waitcnt vmcnt(3) 1323; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 1324; GFX10-NEXT: s_waitcnt vmcnt(2) 1325; 
GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 1326; GFX10-NEXT: s_waitcnt vmcnt(1) 1327; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 1328; GFX10-NEXT: s_waitcnt vmcnt(0) 1329; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 1330; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] 1331; GFX10-NEXT: s_endpgm 1332; 1333; GFX9-LABEL: load_v4i8_to_v4f32_unaligned: 1334; GFX9: ; %bb.0: 1335; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1336; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1337; GFX9-NEXT: v_mov_b32_e32 v6, 0 1338; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1339; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 1340; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 1341; GFX9-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 1342; GFX9-NEXT: global_load_ubyte v5, v0, s[2:3] 1343; GFX9-NEXT: s_waitcnt vmcnt(3) 1344; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 1345; GFX9-NEXT: s_waitcnt vmcnt(2) 1346; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 1347; GFX9-NEXT: s_waitcnt vmcnt(1) 1348; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 1349; GFX9-NEXT: s_waitcnt vmcnt(0) 1350; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 1351; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] 1352; GFX9-NEXT: s_endpgm 1353; 1354; GFX11-LABEL: load_v4i8_to_v4f32_unaligned: 1355; GFX11: ; %bb.0: 1356; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1357; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1358; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1359; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1360; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1361; GFX11-NEXT: s_clause 0x3 1362; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 1363; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2 1364; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:1 1365; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] 1366; GFX11-NEXT: s_waitcnt vmcnt(3) 1367; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 1368; GFX11-NEXT: s_waitcnt vmcnt(2) 1369; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 1370; GFX11-NEXT: s_waitcnt vmcnt(1) 1371; GFX11-NEXT: v_cvt_f32_ubyte0_e32 
v1, v4 1372; GFX11-NEXT: s_waitcnt vmcnt(0) 1373; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1374; GFX11-NEXT: global_store_b128 v5, v[0:3], s[0:1] 1375; GFX11-NEXT: s_endpgm 1376 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1377 %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid 1378 %load = load <4 x i8>, ptr addrspace(1) %gep, align 1 1379 %cvt = uitofp <4 x i8> %load to <4 x float> 1380 store <4 x float> %cvt, ptr addrspace(1) %out, align 16 1381 ret void 1382} 1383 1384; The other use of shuffle0_0 make it profitable to lower into v_perm 1385 1386define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind { 1387; SI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: 1388; SI: ; %bb.0: 1389; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 1390; SI-NEXT: s_mov_b32 s11, 0xf000 1391; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1392; SI-NEXT: v_mov_b32_e32 v1, 0 1393; SI-NEXT: s_mov_b32 s14, 0 1394; SI-NEXT: s_mov_b32 s15, s11 1395; SI-NEXT: s_waitcnt lgkmcnt(0) 1396; SI-NEXT: s_mov_b64 s[12:13], s[4:5] 1397; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[12:15], 0 addr64 offset:3 1398; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[12:15], 0 addr64 offset:2 1399; SI-NEXT: s_mov_b64 s[12:13], s[6:7] 1400; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2 1401; SI-NEXT: s_mov_b32 s10, -1 1402; SI-NEXT: s_mov_b32 s8, s2 1403; SI-NEXT: s_mov_b32 s9, s3 1404; SI-NEXT: s_mov_b32 s2, s10 1405; SI-NEXT: s_mov_b32 s3, s11 1406; SI-NEXT: s_waitcnt vmcnt(2) 1407; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2 1408; SI-NEXT: s_waitcnt vmcnt(1) 1409; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v4 1410; SI-NEXT: v_or_b32_e32 v5, v5, v4 1411; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 1412; SI-NEXT: s_waitcnt vmcnt(0) 1413; SI-NEXT: v_or_b32_e32 v6, v3, v6 1414; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 1415; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 1416; SI-NEXT: 
v_cvt_f32_ubyte0_e32 v2, v3 1417; SI-NEXT: v_mov_b32_e32 v3, v1 1418; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 1419; SI-NEXT: v_alignbit_b32 v4, v4, v5, 24 1420; SI-NEXT: v_or_b32_e32 v4, v4, v6 1421; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1422; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 1423; SI-NEXT: s_endpgm 1424; 1425; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: 1426; VI: ; %bb.0: 1427; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 1428; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1429; VI-NEXT: s_mov_b32 s8, 0x4000405 1430; VI-NEXT: s_waitcnt lgkmcnt(0) 1431; VI-NEXT: v_mov_b32_e32 v1, s5 1432; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v0 1433; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 1434; VI-NEXT: v_mov_b32_e32 v1, s7 1435; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v0 1436; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 1437; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v2 1438; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 1439; VI-NEXT: flat_load_ubyte v6, v[0:1] 1440; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v2 1441; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 1442; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v4 1443; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 1444; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v4 1445; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 1446; VI-NEXT: flat_load_ubyte v2, v[2:3] 1447; VI-NEXT: flat_load_ubyte v3, v[4:5] 1448; VI-NEXT: flat_load_ubyte v4, v[0:1] 1449; VI-NEXT: s_mov_b32 s7, 0xf000 1450; VI-NEXT: s_mov_b32 s6, -1 1451; VI-NEXT: s_mov_b32 s4, s2 1452; VI-NEXT: s_mov_b32 s5, s3 1453; VI-NEXT: s_mov_b32 s2, s6 1454; VI-NEXT: s_mov_b32 s3, s7 1455; VI-NEXT: s_waitcnt vmcnt(3) 1456; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v6 1457; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6 1458; VI-NEXT: s_waitcnt vmcnt(2) 1459; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v2 1460; VI-NEXT: s_waitcnt vmcnt(1) 1461; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3 1462; VI-NEXT: s_waitcnt vmcnt(0) 1463; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 1464; VI-NEXT: v_or_b32_e32 v4, v5, v4 1465; VI-NEXT: v_or_b32_e32 
v5, v7, v3 1466; VI-NEXT: v_mov_b32_e32 v3, v1 1467; VI-NEXT: v_perm_b32 v4, v4, v5, s8 1468; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1469; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 1470; VI-NEXT: s_endpgm 1471; 1472; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: 1473; GFX10: ; %bb.0: 1474; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 1475; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1476; GFX10-NEXT: v_mov_b32_e32 v7, 0 1477; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1478; GFX10-NEXT: s_clause 0x3 1479; GFX10-NEXT: global_load_ubyte v1, v0, s[12:13] offset:2 1480; GFX10-NEXT: global_load_ubyte v3, v0, s[12:13] offset:3 1481; GFX10-NEXT: global_load_ubyte v2, v0, s[14:15] offset:3 1482; GFX10-NEXT: global_load_ubyte v4, v0, s[14:15] offset:2 1483; GFX10-NEXT: s_waitcnt vmcnt(2) 1484; GFX10-NEXT: v_lshl_or_b32 v5, v3, 8, v1 1485; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 1486; GFX10-NEXT: s_waitcnt vmcnt(0) 1487; GFX10-NEXT: v_lshl_or_b32 v6, v2, 8, v4 1488; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 1489; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 1490; GFX10-NEXT: v_mov_b32_e32 v3, v1 1491; GFX10-NEXT: v_perm_b32 v4, v5, v6, 0x4000405 1492; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[8:9] 1493; GFX10-NEXT: global_store_dword v7, v4, s[10:11] 1494; GFX10-NEXT: s_endpgm 1495; 1496; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: 1497; GFX9: ; %bb.0: 1498; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 1499; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1500; GFX9-NEXT: v_mov_b32_e32 v5, 0 1501; GFX9-NEXT: s_mov_b32 s0, 0x4000405 1502; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1503; GFX9-NEXT: global_load_ubyte v1, v0, s[12:13] offset:2 1504; GFX9-NEXT: global_load_ubyte v2, v0, s[14:15] offset:3 1505; GFX9-NEXT: global_load_ubyte v3, v0, s[12:13] offset:3 1506; GFX9-NEXT: global_load_ubyte v4, v0, s[14:15] offset:2 1507; GFX9-NEXT: s_waitcnt vmcnt(1) 1508; GFX9-NEXT: v_lshl_or_b32 v6, v3, 8, v1 1509; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 1510; GFX9-NEXT: s_waitcnt 
vmcnt(0) 1511; GFX9-NEXT: v_lshl_or_b32 v7, v2, 8, v4 1512; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 1513; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 1514; GFX9-NEXT: v_mov_b32_e32 v3, v1 1515; GFX9-NEXT: v_perm_b32 v4, v6, v7, s0 1516; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[8:9] 1517; GFX9-NEXT: global_store_dword v5, v4, s[10:11] 1518; GFX9-NEXT: s_endpgm 1519; 1520; GFX11-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: 1521; GFX11: ; %bb.0: 1522; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 1523; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1524; GFX11-NEXT: v_mov_b32_e32 v6, 0 1525; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1526; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1527; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1528; GFX11-NEXT: s_clause 0x3 1529; GFX11-NEXT: global_load_u8 v1, v0, s[4:5] offset:2 1530; GFX11-NEXT: global_load_u8 v3, v0, s[4:5] offset:3 1531; GFX11-NEXT: global_load_u8 v2, v0, s[6:7] offset:3 1532; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] offset:2 1533; GFX11-NEXT: s_waitcnt vmcnt(2) 1534; GFX11-NEXT: v_lshl_or_b32 v4, v3, 8, v1 1535; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 1536; GFX11-NEXT: s_waitcnt vmcnt(0) 1537; GFX11-NEXT: v_lshl_or_b32 v5, v2, 8, v0 1538; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 1539; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 1540; GFX11-NEXT: v_mov_b32_e32 v3, v1 1541; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 1542; GFX11-NEXT: v_perm_b32 v4, v4, v5, 0x4000405 1543; GFX11-NEXT: s_clause 0x1 1544; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] 1545; GFX11-NEXT: global_store_b32 v6, v4, s[2:3] 1546; GFX11-NEXT: s_endpgm 1547 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1548 %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid 1549 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 1550 %load = load <4 x i8>, ptr addrspace(1) %gep, align 1 1551 %load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1 1552 %shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32> <i32 3, i32 2, i32 6, i32 2> 1553 
%cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float> 1554 store <4 x float> %cvt, ptr addrspace(1) %out, align 16 1555 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4 1556 ret void 1557} 1558 1559; FIXME: Need to handle non-uniform case for function below (load without gep). 1560; Instructions still emitted to repack bytes for add use. 1561define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind { 1562; SI-LABEL: load_v4i8_to_v4f32_2_uses: 1563; SI: ; %bb.0: 1564; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1565; SI-NEXT: s_mov_b32 s3, 0xf000 1566; SI-NEXT: s_mov_b32 s10, 0 1567; SI-NEXT: s_mov_b32 s11, s3 1568; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1569; SI-NEXT: v_mov_b32_e32 v1, 0 1570; SI-NEXT: s_waitcnt lgkmcnt(0) 1571; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 1572; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 1573; SI-NEXT: s_mov_b32 s2, -1 1574; SI-NEXT: s_waitcnt lgkmcnt(0) 1575; SI-NEXT: s_mov_b32 s0, s6 1576; SI-NEXT: s_mov_b32 s1, s7 1577; SI-NEXT: s_mov_b32 s6, s2 1578; SI-NEXT: s_mov_b32 s7, s3 1579; SI-NEXT: s_waitcnt vmcnt(0) 1580; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 1581; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4 1582; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 1583; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 1584; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 1585; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 1586; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 1587; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1588; SI-NEXT: s_waitcnt expcnt(0) 1589; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 1590; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 1591; SI-NEXT: v_and_b32_e32 v1, 0xff00, v5 1592; SI-NEXT: v_or_b32_e32 v0, v6, v0 1593; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 1594; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0 1595; SI-NEXT: v_or_b32_e32 v1, v1, v2 1596; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 1597; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1598; SI-NEXT: v_or_b32_e32 v0, v1, 
v0 1599; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 1600; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1601; SI-NEXT: s_endpgm 1602; 1603; VI-LABEL: load_v4i8_to_v4f32_2_uses: 1604; VI: ; %bb.0: 1605; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1606; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1607; VI-NEXT: s_mov_b32 s7, 0xf000 1608; VI-NEXT: s_mov_b32 s6, -1 1609; VI-NEXT: v_mov_b32_e32 v5, 0xffffff00 1610; VI-NEXT: s_waitcnt lgkmcnt(0) 1611; VI-NEXT: v_mov_b32_e32 v1, s1 1612; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1613; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1614; VI-NEXT: flat_load_dword v4, v[0:1] 1615; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1616; VI-NEXT: v_mov_b32_e32 v6, 9 1617; VI-NEXT: v_mov_b32_e32 v7, 0x900 1618; VI-NEXT: s_waitcnt lgkmcnt(0) 1619; VI-NEXT: s_mov_b32 s4, s2 1620; VI-NEXT: s_mov_b32 s5, s3 1621; VI-NEXT: s_mov_b32 s2, s6 1622; VI-NEXT: s_mov_b32 s3, s7 1623; VI-NEXT: s_waitcnt vmcnt(0) 1624; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 1625; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 1626; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 1627; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 1628; VI-NEXT: v_and_b32_e32 v8, 0xffffff00, v4 1629; VI-NEXT: v_add_u16_e32 v9, 9, v4 1630; VI-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1631; VI-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1632; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1633; VI-NEXT: s_nop 0 1634; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 1635; VI-NEXT: v_or_b32_sdwa v1, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 1636; VI-NEXT: v_add_u16_e32 v0, 0x900, v0 1637; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1638; VI-NEXT: v_or_b32_e32 v0, v0, v1 1639; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1640; VI-NEXT: s_endpgm 1641; 1642; 
GFX10-LABEL: load_v4i8_to_v4f32_2_uses: 1643; GFX10: ; %bb.0: 1644; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1645; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1646; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1647; GFX10-NEXT: global_load_dword v0, v0, s[0:1] 1648; GFX10-NEXT: s_waitcnt_depctr 0xffe3 1649; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1650; GFX10-NEXT: s_waitcnt vmcnt(0) 1651; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1652; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0 1653; GFX10-NEXT: v_add_nc_u16 v4, v0, 9 1654; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff00, v1 1655; GFX10-NEXT: v_add_nc_u16 v1, v1, 9 1656; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 1657; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 1658; GFX10-NEXT: v_mov_b32_e32 v4, 0 1659; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 1660; GFX10-NEXT: v_add_nc_u16 v1, 0x900, v1 1661; GFX10-NEXT: v_add_nc_u16 v5, 0x900, v2 1662; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 1663; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 1664; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 1665; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1666; GFX10-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1667; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1668; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1669; GFX10-NEXT: global_store_dword v4, v5, s[2:3] 1670; GFX10-NEXT: s_endpgm 1671; 1672; GFX9-LABEL: load_v4i8_to_v4f32_2_uses: 1673; GFX9: ; %bb.0: 1674; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1675; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1676; GFX9-NEXT: v_mov_b32_e32 v6, 9 1677; GFX9-NEXT: v_mov_b32_e32 v5, 0 1678; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1679; GFX9-NEXT: global_load_dword v4, v0, s[0:1] 1680; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1681; GFX9-NEXT: s_movk_i32 s4, 0xff00 1682; GFX9-NEXT: s_movk_i32 s5, 0x900 1683; GFX9-NEXT: s_waitcnt vmcnt(0) 1684; GFX9-NEXT: 
v_cvt_f32_ubyte3_e32 v3, v4 1685; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 1686; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 1687; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 1688; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 1689; GFX9-NEXT: v_add_u16_e32 v8, 9, v4 1690; GFX9-NEXT: v_and_b32_sdwa v9, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1691; GFX9-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1692; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1693; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] 1694; GFX9-NEXT: s_nop 0 1695; GFX9-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 1696; GFX9-NEXT: v_or_b32_sdwa v1, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 1697; GFX9-NEXT: v_add_u16_e32 v0, 0x900, v0 1698; GFX9-NEXT: v_add_u16_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1699; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 1700; GFX9-NEXT: global_store_dword v5, v0, s[2:3] 1701; GFX9-NEXT: s_endpgm 1702; 1703; GFX11-LABEL: load_v4i8_to_v4f32_2_uses: 1704; GFX11: ; %bb.0: 1705; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 1706; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1707; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1708; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1709; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1710; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] 1711; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1712; GFX11-NEXT: s_waitcnt vmcnt(0) 1713; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1714; GFX11-NEXT: v_add_nc_u16 v2, v0, 9 1715; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff00, v0 1716; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 1717; GFX11-NEXT: v_add_nc_u16 v3, v1, 9 1718; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 1719; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff00, v1 1720; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 1721; GFX11-NEXT: 
v_and_b32_e32 v3, 0xff, v3 1722; GFX11-NEXT: v_or_b32_e32 v2, v4, v2 1723; GFX11-NEXT: v_mov_b32_e32 v4, 0 1724; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 1725; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 1726; GFX11-NEXT: v_add_nc_u16 v2, 0x900, v2 1727; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 1728; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 1729; GFX11-NEXT: v_add_nc_u16 v1, 0x900, v1 1730; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v2 1731; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 1732; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) 1733; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1 1734; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 1735; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1736; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 1737; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1738; GFX11-NEXT: s_clause 0x1 1739; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 1740; GFX11-NEXT: global_store_b32 v4, v5, s[2:3] 1741; GFX11-NEXT: s_endpgm 1742 %tid.x = call i32 @llvm.amdgcn.workitem.id.x() 1743 %in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x 1744 %load = load <4 x i8>, ptr addrspace(1) %in.ptr, align 4 1745 %cvt = uitofp <4 x i8> %load to <4 x float> 1746 store <4 x float> %cvt, ptr addrspace(1) %out, align 16 1747 %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load 1748 store <4 x i8> %add, ptr addrspace(1) %out2, align 4 1749 ret void 1750} 1751 1752; Make sure this doesn't crash. 
; Overview of the next four kernels. Each one loads from %in at the
; workitem-id index, converts the value with uitofp and stores it to %out:
;   load_v7i8_to_v7f32       - <7 x i8> load with align 1.
;   load_v8i8_to_v8f32       - <8 x i8> load with align 8.
;   i8_zext_inreg_i32_to_f32 - i32 load, add 2, then mask with 255.
;   i8_zext_inreg_hi1_to_f32 - i32 load, mask with 65280, then shift right by 8.
; The check lines for every target contain v_cvt_f32_ubyteN conversions.
; They are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1753define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 1754; SI-LABEL: load_v7i8_to_v7f32: 1755; SI: ; %bb.0: 1756; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1757; SI-NEXT: s_mov_b32 s7, 0xf000 1758; SI-NEXT: s_mov_b32 s10, 0 1759; SI-NEXT: s_mov_b32 s11, s7 1760; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1761; SI-NEXT: s_waitcnt lgkmcnt(0) 1762; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1763; SI-NEXT: v_mov_b32_e32 v1, 0 1764; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[8:11], 0 addr64 offset:3 1765; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[8:11], 0 addr64 offset:2 1766; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[8:11], 0 addr64 offset:1 1767; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[8:11], 0 addr64 1768; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[8:11], 0 addr64 offset:5 1769; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[8:11], 0 addr64 offset:4 1770; SI-NEXT: buffer_load_ubyte v9, v[0:1], s[8:11], 0 addr64 offset:6 1771; SI-NEXT: s_mov_b32 s6, -1 1772; SI-NEXT: s_mov_b32 s4, s0 1773; SI-NEXT: s_mov_b32 s5, s1 1774; SI-NEXT: s_waitcnt vmcnt(6) 1775; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v2 1776; SI-NEXT: s_waitcnt vmcnt(5) 1777; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 1778; SI-NEXT: s_waitcnt vmcnt(4) 1779; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 1780; SI-NEXT: s_waitcnt vmcnt(3) 1781; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6 1782; SI-NEXT: s_waitcnt vmcnt(2) 1783; SI-NEXT: v_cvt_f32_ubyte0_e32 v5, v7 1784; SI-NEXT: s_waitcnt vmcnt(1) 1785; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 1786; SI-NEXT: s_waitcnt vmcnt(0) 1787; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v9 1788; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:24 1789; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 1790; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1791; SI-NEXT: s_endpgm 1792; 1793; VI-LABEL: load_v7i8_to_v7f32: 1794; VI: ; %bb.0: 1795; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1796; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1797; VI-NEXT: 
s_waitcnt lgkmcnt(0) 1798; VI-NEXT: v_mov_b32_e32 v1, s3 1799; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1800; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1801; VI-NEXT: v_add_u32_e32 v2, vcc, 5, v0 1802; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 1803; VI-NEXT: flat_load_ubyte v10, v[2:3] 1804; VI-NEXT: v_add_u32_e32 v2, vcc, 6, v0 1805; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 1806; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v0 1807; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 1808; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0 1809; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 1810; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v0 1811; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc 1812; VI-NEXT: flat_load_ubyte v6, v[6:7] 1813; VI-NEXT: flat_load_ubyte v7, v[8:9] 1814; VI-NEXT: flat_load_ubyte v8, v[2:3] 1815; VI-NEXT: flat_load_ubyte v2, v[0:1] 1816; VI-NEXT: flat_load_ubyte v4, v[4:5] 1817; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0 1818; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1819; VI-NEXT: flat_load_ubyte v9, v[0:1] 1820; VI-NEXT: s_mov_b32 s3, 0xf000 1821; VI-NEXT: s_mov_b32 s2, -1 1822; VI-NEXT: s_waitcnt vmcnt(6) 1823; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v10 1824; VI-NEXT: s_waitcnt vmcnt(4) 1825; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v7 1826; VI-NEXT: s_waitcnt vmcnt(2) 1827; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 1828; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v6 1829; VI-NEXT: s_waitcnt vmcnt(1) 1830; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 1831; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8 1832; VI-NEXT: s_waitcnt vmcnt(0) 1833; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v9 1834; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16 1835; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1836; VI-NEXT: s_endpgm 1837; 1838; GFX10-LABEL: load_v7i8_to_v7f32: 1839; GFX10: ; %bb.0: 1840; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1841; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1842; GFX10-NEXT: v_mov_b32_e32 v8, 0 1843; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1844; GFX10-NEXT: s_clause 0x5 1845; GFX10-NEXT: 
global_load_ubyte v4, v0, s[2:3] offset:6 1846; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 1847; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 1848; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1 1849; GFX10-NEXT: global_load_short_d16 v7, v0, s[2:3] offset:4 1850; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] 1851; GFX10-NEXT: s_waitcnt vmcnt(5) 1852; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 1853; GFX10-NEXT: s_waitcnt vmcnt(4) 1854; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 1855; GFX10-NEXT: s_waitcnt vmcnt(3) 1856; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 1857; GFX10-NEXT: s_waitcnt vmcnt(2) 1858; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 1859; GFX10-NEXT: s_waitcnt vmcnt(1) 1860; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v7 1861; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 1862; GFX10-NEXT: s_waitcnt vmcnt(0) 1863; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1864; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16 1865; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] 1866; GFX10-NEXT: s_endpgm 1867; 1868; GFX9-LABEL: load_v7i8_to_v7f32: 1869; GFX9: ; %bb.0: 1870; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1871; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1872; GFX9-NEXT: v_mov_b32_e32 v10, 0 1873; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1874; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:6 1875; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:4 1876; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 1877; GFX9-NEXT: global_load_ubyte v7, v0, s[2:3] offset:2 1878; GFX9-NEXT: global_load_ubyte v8, v0, s[2:3] offset:1 1879; GFX9-NEXT: global_load_ubyte v9, v0, s[2:3] 1880; GFX9-NEXT: s_waitcnt vmcnt(5) 1881; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v1 1882; GFX9-NEXT: s_waitcnt vmcnt(4) 1883; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v2 1884; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v2 1885; GFX9-NEXT: s_waitcnt vmcnt(3) 1886; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 1887; GFX9-NEXT: s_waitcnt vmcnt(2) 1888; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v7 1889; 
GFX9-NEXT: s_waitcnt vmcnt(1) 1890; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v8 1891; GFX9-NEXT: s_waitcnt vmcnt(0) 1892; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9 1893; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] 1894; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[0:1] offset:16 1895; GFX9-NEXT: s_endpgm 1896; 1897; GFX11-LABEL: load_v7i8_to_v7f32: 1898; GFX11: ; %bb.0: 1899; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1900; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1901; GFX11-NEXT: v_mov_b32_e32 v8, 0 1902; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1903; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1904; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1905; GFX11-NEXT: s_clause 0x5 1906; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6 1907; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 1908; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2 1909; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:1 1910; GFX11-NEXT: global_load_d16_b16 v7, v0, s[2:3] offset:4 1911; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] 1912; GFX11-NEXT: s_waitcnt vmcnt(5) 1913; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 1914; GFX11-NEXT: s_waitcnt vmcnt(4) 1915; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 1916; GFX11-NEXT: s_waitcnt vmcnt(3) 1917; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 1918; GFX11-NEXT: s_waitcnt vmcnt(2) 1919; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 1920; GFX11-NEXT: s_waitcnt vmcnt(1) 1921; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v7 1922; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 1923; GFX11-NEXT: s_waitcnt vmcnt(0) 1924; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 1925; GFX11-NEXT: s_clause 0x1 1926; GFX11-NEXT: global_store_b96 v8, v[4:6], s[0:1] offset:16 1927; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] 1928; GFX11-NEXT: s_endpgm 1929 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1930 %gep = getelementptr <7 x i8>, ptr addrspace(1) %in, i32 %tid 1931 %load = load <7 x i8>, ptr addrspace(1) %gep, align 1 1932 %cvt = uitofp <7 x i8> %load to <7 x float> 1933 store <7 x float> %cvt, ptr 
addrspace(1) %out, align 16 1934 ret void 1935} 1936 1937define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 1938; SI-LABEL: load_v8i8_to_v8f32: 1939; SI: ; %bb.0: 1940; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1941; SI-NEXT: s_mov_b32 s7, 0xf000 1942; SI-NEXT: s_mov_b32 s10, 0 1943; SI-NEXT: s_mov_b32 s11, s7 1944; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1945; SI-NEXT: s_waitcnt lgkmcnt(0) 1946; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1947; SI-NEXT: v_mov_b32_e32 v1, 0 1948; SI-NEXT: buffer_load_dwordx2 v[7:8], v[0:1], s[8:11], 0 addr64 1949; SI-NEXT: s_mov_b32 s6, -1 1950; SI-NEXT: s_mov_b32 s4, s0 1951; SI-NEXT: s_mov_b32 s5, s1 1952; SI-NEXT: s_waitcnt vmcnt(0) 1953; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 1954; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 1955; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7 1956; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 1957; SI-NEXT: v_cvt_f32_ubyte3_e32 v7, v8 1958; SI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 1959; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 1960; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 1961; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 1962; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1963; SI-NEXT: s_endpgm 1964; 1965; VI-LABEL: load_v8i8_to_v8f32: 1966; VI: ; %bb.0: 1967; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1968; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1969; VI-NEXT: s_waitcnt lgkmcnt(0) 1970; VI-NEXT: v_mov_b32_e32 v1, s3 1971; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1972; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1973; VI-NEXT: flat_load_dwordx2 v[7:8], v[0:1] 1974; VI-NEXT: s_mov_b32 s3, 0xf000 1975; VI-NEXT: s_mov_b32 s2, -1 1976; VI-NEXT: s_waitcnt vmcnt(0) 1977; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 1978; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 1979; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7 1980; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 1981; VI-NEXT: v_cvt_f32_ubyte3_e32 v7, v8 1982; VI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 1983; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, 
v8 1984; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 1985; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 1986; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1987; VI-NEXT: s_endpgm 1988; 1989; GFX10-LABEL: load_v8i8_to_v8f32: 1990; GFX10: ; %bb.0: 1991; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1992; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1993; GFX10-NEXT: v_mov_b32_e32 v10, 0 1994; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1995; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] 1996; GFX10-NEXT: s_waitcnt vmcnt(0) 1997; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v7, v9 1998; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v6, v9 1999; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v5, v9 2000; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v9 2001; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v8 2002; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v8 2003; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v8 2004; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v8 2005; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 2006; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] 2007; GFX10-NEXT: s_endpgm 2008; 2009; GFX9-LABEL: load_v8i8_to_v8f32: 2010; GFX9: ; %bb.0: 2011; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2012; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 2013; GFX9-NEXT: v_mov_b32_e32 v9, 0 2014; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2015; GFX9-NEXT: global_load_dwordx2 v[7:8], v0, s[2:3] 2016; GFX9-NEXT: s_waitcnt vmcnt(0) 2017; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 2018; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 2019; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v7 2020; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v7 2021; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v7, v8 2022; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 2023; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 2024; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 2025; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[0:1] offset:16 2026; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] 2027; GFX9-NEXT: s_endpgm 2028; 2029; GFX11-LABEL: load_v8i8_to_v8f32: 2030; GFX11: ; %bb.0: 2031; GFX11-NEXT: s_load_b128 
s[0:3], s[4:5], 0x24 2032; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2033; GFX11-NEXT: v_mov_b32_e32 v10, 0 2034; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 2035; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 2036; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2037; GFX11-NEXT: global_load_b64 v[8:9], v0, s[2:3] 2038; GFX11-NEXT: s_waitcnt vmcnt(0) 2039; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v7, v9 2040; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v6, v9 2041; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v9 2042; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v9 2043; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v8 2044; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v8 2045; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v8 2046; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v8 2047; GFX11-NEXT: s_clause 0x1 2048; GFX11-NEXT: global_store_b128 v10, v[4:7], s[0:1] offset:16 2049; GFX11-NEXT: global_store_b128 v10, v[0:3], s[0:1] 2050; GFX11-NEXT: s_endpgm 2051 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2052 %gep = getelementptr <8 x i8>, ptr addrspace(1) %in, i32 %tid 2053 %load = load <8 x i8>, ptr addrspace(1) %gep, align 8 2054 %cvt = uitofp <8 x i8> %load to <8 x float> 2055 store <8 x float> %cvt, ptr addrspace(1) %out, align 16 2056 ret void 2057} 2058 2059define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 2060; SI-LABEL: i8_zext_inreg_i32_to_f32: 2061; SI: ; %bb.0: 2062; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2063; SI-NEXT: s_mov_b32 s7, 0xf000 2064; SI-NEXT: s_mov_b32 s10, 0 2065; SI-NEXT: s_mov_b32 s11, s7 2066; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2067; SI-NEXT: s_waitcnt lgkmcnt(0) 2068; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 2069; SI-NEXT: v_mov_b32_e32 v1, 0 2070; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 2071; SI-NEXT: s_mov_b32 s6, -1 2072; SI-NEXT: s_mov_b32 s4, s0 2073; SI-NEXT: s_mov_b32 s5, s1 2074; SI-NEXT: s_waitcnt vmcnt(0) 2075; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 2076; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2077; SI-NEXT: buffer_store_dword 
v0, off, s[4:7], 0 2078; SI-NEXT: s_endpgm 2079; 2080; VI-LABEL: i8_zext_inreg_i32_to_f32: 2081; VI: ; %bb.0: 2082; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2083; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2084; VI-NEXT: s_waitcnt lgkmcnt(0) 2085; VI-NEXT: v_mov_b32_e32 v1, s3 2086; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 2087; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2088; VI-NEXT: flat_load_dword v0, v[0:1] 2089; VI-NEXT: s_mov_b32 s3, 0xf000 2090; VI-NEXT: s_mov_b32 s2, -1 2091; VI-NEXT: s_waitcnt vmcnt(0) 2092; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 2093; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2094; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2095; VI-NEXT: s_endpgm 2096; 2097; GFX10-LABEL: i8_zext_inreg_i32_to_f32: 2098; GFX10: ; %bb.0: 2099; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2100; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2101; GFX10-NEXT: v_mov_b32_e32 v1, 0 2102; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2103; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 2104; GFX10-NEXT: s_waitcnt vmcnt(0) 2105; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 2106; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2107; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 2108; GFX10-NEXT: s_endpgm 2109; 2110; GFX9-LABEL: i8_zext_inreg_i32_to_f32: 2111; GFX9: ; %bb.0: 2112; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2113; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2114; GFX9-NEXT: v_mov_b32_e32 v1, 0 2115; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2116; GFX9-NEXT: global_load_dword v0, v0, s[2:3] 2117; GFX9-NEXT: s_waitcnt vmcnt(0) 2118; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 2119; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2120; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 2121; GFX9-NEXT: s_endpgm 2122; 2123; GFX11-LABEL: i8_zext_inreg_i32_to_f32: 2124; GFX11: ; %bb.0: 2125; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2126; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 2127; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 2128; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 2129; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2130; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 2131; GFX11-NEXT: s_waitcnt vmcnt(0) 2132; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 2133; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2134; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 2135; GFX11-NEXT: s_endpgm 2136 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2137 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid 2138 %load = load i32, ptr addrspace(1) %gep, align 4 2139 %add = add i32 %load, 2 2140 %inreg = and i32 %add, 255 2141 %cvt = uitofp i32 %inreg to float 2142 store float %cvt, ptr addrspace(1) %out, align 4 2143 ret void 2144} 2145 2146define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 2147; SI-LABEL: i8_zext_inreg_hi1_to_f32: 2148; SI: ; %bb.0: 2149; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2150; SI-NEXT: s_mov_b32 s7, 0xf000 2151; SI-NEXT: s_mov_b32 s10, 0 2152; SI-NEXT: s_mov_b32 s11, s7 2153; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2154; SI-NEXT: s_waitcnt lgkmcnt(0) 2155; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 2156; SI-NEXT: v_mov_b32_e32 v1, 0 2157; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 2158; SI-NEXT: s_mov_b32 s6, -1 2159; SI-NEXT: s_mov_b32 s4, s0 2160; SI-NEXT: s_mov_b32 s5, s1 2161; SI-NEXT: s_waitcnt vmcnt(0) 2162; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 2163; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 2164; SI-NEXT: s_endpgm 2165; 2166; VI-LABEL: i8_zext_inreg_hi1_to_f32: 2167; VI: ; %bb.0: 2168; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2169; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2170; VI-NEXT: s_waitcnt lgkmcnt(0) 2171; VI-NEXT: v_mov_b32_e32 v1, s3 2172; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 2173; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2174; VI-NEXT: flat_load_dword v0, v[0:1] 2175; VI-NEXT: s_mov_b32 s3, 0xf000 2176; VI-NEXT: s_mov_b32 s2, -1 2177; VI-NEXT: s_waitcnt vmcnt(0) 2178; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 2179; 
VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2180; VI-NEXT: s_endpgm 2181; 2182; GFX10-LABEL: i8_zext_inreg_hi1_to_f32: 2183; GFX10: ; %bb.0: 2184; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2185; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2186; GFX10-NEXT: v_mov_b32_e32 v1, 0 2187; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2188; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 2189; GFX10-NEXT: s_waitcnt vmcnt(0) 2190; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 2191; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 2192; GFX10-NEXT: s_endpgm 2193; 2194; GFX9-LABEL: i8_zext_inreg_hi1_to_f32: 2195; GFX9: ; %bb.0: 2196; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2197; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2198; GFX9-NEXT: v_mov_b32_e32 v1, 0 2199; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2200; GFX9-NEXT: global_load_dword v0, v0, s[2:3] 2201; GFX9-NEXT: s_waitcnt vmcnt(0) 2202; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 2203; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 2204; GFX9-NEXT: s_endpgm 2205; 2206; GFX11-LABEL: i8_zext_inreg_hi1_to_f32: 2207; GFX11: ; %bb.0: 2208; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2209; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 2210; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2211; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2212; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2213; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 2214; GFX11-NEXT: s_waitcnt vmcnt(0) 2215; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 2216; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 2217; GFX11-NEXT: s_endpgm 2218 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2219 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid 2220 %load = load i32, ptr addrspace(1) %gep, align 4 2221 %inreg = and i32 %load, 65280 2222 %shr = lshr i32 %inreg, 8 2223 %cvt = uitofp i32 %shr to float 2224 store float %cvt, ptr addrspace(1) %out, align 4 2225 ret void 2226} 2227 2228; We don't get these ones because of the zext, but instcombine removes 2229; them so it shouldn't really matter. 
; Overview of the kernels that follow. Each one converts with uitofp and
; stores the result to %out:
;   i8_zext_i32_to_f32       - i8 load, zext to i32.
;   v4i8_zext_v4i32_to_v4f32 - <4 x i8> load with align 1, zext to <4 x i32>.
;   extract_byte0_to_f32     - i32 load, mask with 255.
;   extract_byte1_to_f32     - i32 load, shift right by 8, then mask with 255.
; NOTE(review): the check lines for both zext kernels do contain
; v_cvt_f32_ubyte0, so the remark above about not getting these may be
; outdated - confirm before relying on it.
; The check lines are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2230define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 2231; SI-LABEL: i8_zext_i32_to_f32: 2232; SI: ; %bb.0: 2233; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2234; SI-NEXT: s_mov_b32 s7, 0xf000 2235; SI-NEXT: v_mov_b32_e32 v1, 0 2236; SI-NEXT: s_mov_b32 s10, 0 2237; SI-NEXT: s_mov_b32 s11, s7 2238; SI-NEXT: s_waitcnt lgkmcnt(0) 2239; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 2240; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 2241; SI-NEXT: s_mov_b32 s6, -1 2242; SI-NEXT: s_mov_b32 s4, s0 2243; SI-NEXT: s_mov_b32 s5, s1 2244; SI-NEXT: s_waitcnt vmcnt(0) 2245; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2246; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 2247; SI-NEXT: s_endpgm 2248; 2249; VI-LABEL: i8_zext_i32_to_f32: 2250; VI: ; %bb.0: 2251; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2252; VI-NEXT: s_waitcnt lgkmcnt(0) 2253; VI-NEXT: v_mov_b32_e32 v1, s3 2254; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 2255; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2256; VI-NEXT: flat_load_ubyte v0, v[0:1] 2257; VI-NEXT: s_mov_b32 s3, 0xf000 2258; VI-NEXT: s_mov_b32 s2, -1 2259; VI-NEXT: s_waitcnt vmcnt(0) 2260; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2261; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2262; VI-NEXT: s_endpgm 2263; 2264; GFX10-LABEL: i8_zext_i32_to_f32: 2265; GFX10: ; %bb.0: 2266; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2267; GFX10-NEXT: v_mov_b32_e32 v1, 0 2268; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2269; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] 2270; GFX10-NEXT: s_waitcnt vmcnt(0) 2271; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2272; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 2273; GFX10-NEXT: s_endpgm 2274; 2275; GFX9-LABEL: i8_zext_i32_to_f32: 2276; GFX9: ; %bb.0: 2277; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2278; GFX9-NEXT: v_mov_b32_e32 v1, 0 2279; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2280; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] 2281; GFX9-NEXT: s_waitcnt vmcnt(0) 2282; 
GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2283; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 2284; GFX9-NEXT: s_endpgm 2285; 2286; GFX11-LABEL: i8_zext_i32_to_f32: 2287; GFX11: ; %bb.0: 2288; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2289; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 2290; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2291; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] 2292; GFX11-NEXT: s_waitcnt vmcnt(0) 2293; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2294; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 2295; GFX11-NEXT: s_endpgm 2296 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2297 %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid 2298 %load = load i8, ptr addrspace(1) %gep, align 1 2299 %ext = zext i8 %load to i32 2300 %cvt = uitofp i32 %ext to float 2301 store float %cvt, ptr addrspace(1) %out, align 4 2302 ret void 2303} 2304 2305define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 2306; SI-LABEL: v4i8_zext_v4i32_to_v4f32: 2307; SI: ; %bb.0: 2308; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2309; SI-NEXT: s_mov_b32 s7, 0xf000 2310; SI-NEXT: s_mov_b32 s10, 0 2311; SI-NEXT: s_mov_b32 s11, s7 2312; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2313; SI-NEXT: s_waitcnt lgkmcnt(0) 2314; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 2315; SI-NEXT: v_mov_b32_e32 v1, 0 2316; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[8:11], 0 addr64 offset:3 2317; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[8:11], 0 addr64 offset:2 2318; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[8:11], 0 addr64 offset:1 2319; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 2320; SI-NEXT: s_mov_b32 s6, -1 2321; SI-NEXT: s_mov_b32 s4, s0 2322; SI-NEXT: s_mov_b32 s5, s1 2323; SI-NEXT: s_waitcnt vmcnt(3) 2324; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v2 2325; SI-NEXT: s_waitcnt vmcnt(2) 2326; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 2327; SI-NEXT: s_waitcnt vmcnt(1) 2328; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 2329; SI-NEXT: s_waitcnt vmcnt(0) 
2330; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2331; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2332; SI-NEXT: s_endpgm 2333; 2334; VI-LABEL: v4i8_zext_v4i32_to_v4f32: 2335; VI: ; %bb.0: 2336; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2337; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2338; VI-NEXT: s_waitcnt lgkmcnt(0) 2339; VI-NEXT: v_mov_b32_e32 v1, s3 2340; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 2341; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2342; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 2343; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 2344; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0 2345; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 2346; VI-NEXT: flat_load_ubyte v2, v[2:3] 2347; VI-NEXT: flat_load_ubyte v3, v[4:5] 2348; VI-NEXT: flat_load_ubyte v4, v[0:1] 2349; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 2350; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2351; VI-NEXT: flat_load_ubyte v1, v[0:1] 2352; VI-NEXT: s_mov_b32 s3, 0xf000 2353; VI-NEXT: s_mov_b32 s2, -1 2354; VI-NEXT: s_waitcnt vmcnt(3) 2355; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 2356; VI-NEXT: s_waitcnt vmcnt(2) 2357; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 2358; VI-NEXT: s_waitcnt vmcnt(1) 2359; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 2360; VI-NEXT: s_waitcnt vmcnt(0) 2361; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 2362; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2363; VI-NEXT: s_endpgm 2364; 2365; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32: 2366; GFX10: ; %bb.0: 2367; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2368; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2369; GFX10-NEXT: v_mov_b32_e32 v6, 0 2370; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2371; GFX10-NEXT: s_clause 0x3 2372; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 2373; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 2374; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 2375; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] 2376; GFX10-NEXT: s_waitcnt vmcnt(3) 2377; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 2378; GFX10-NEXT: s_waitcnt vmcnt(2) 
2379; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 2380; GFX10-NEXT: s_waitcnt vmcnt(1) 2381; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 2382; GFX10-NEXT: s_waitcnt vmcnt(0) 2383; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 2384; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] 2385; GFX10-NEXT: s_endpgm 2386; 2387; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32: 2388; GFX9: ; %bb.0: 2389; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2390; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2391; GFX9-NEXT: v_mov_b32_e32 v6, 0 2392; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2393; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 2394; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 2395; GFX9-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 2396; GFX9-NEXT: global_load_ubyte v5, v0, s[2:3] 2397; GFX9-NEXT: s_waitcnt vmcnt(3) 2398; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 2399; GFX9-NEXT: s_waitcnt vmcnt(2) 2400; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 2401; GFX9-NEXT: s_waitcnt vmcnt(1) 2402; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 2403; GFX9-NEXT: s_waitcnt vmcnt(0) 2404; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 2405; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] 2406; GFX9-NEXT: s_endpgm 2407; 2408; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32: 2409; GFX11: ; %bb.0: 2410; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2411; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 2412; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2413; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2414; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2415; GFX11-NEXT: s_clause 0x3 2416; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 2417; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2 2418; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:1 2419; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] 2420; GFX11-NEXT: s_waitcnt vmcnt(3) 2421; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 2422; GFX11-NEXT: s_waitcnt vmcnt(2) 2423; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 2424; GFX11-NEXT: s_waitcnt vmcnt(1) 2425; GFX11-NEXT: v_cvt_f32_ubyte0_e32 
v1, v4 2426; GFX11-NEXT: s_waitcnt vmcnt(0) 2427; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2428; GFX11-NEXT: global_store_b128 v5, v[0:3], s[0:1] 2429; GFX11-NEXT: s_endpgm 2430 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2431 %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid 2432 %load = load <4 x i8>, ptr addrspace(1) %gep, align 1 2433 %ext = zext <4 x i8> %load to <4 x i32> 2434 %cvt = uitofp <4 x i32> %ext to <4 x float> 2435 store <4 x float> %cvt, ptr addrspace(1) %out, align 16 2436 ret void 2437} 2438 2439define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 2440; SI-LABEL: extract_byte0_to_f32: 2441; SI: ; %bb.0: 2442; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2443; SI-NEXT: s_mov_b32 s7, 0xf000 2444; SI-NEXT: s_mov_b32 s10, 0 2445; SI-NEXT: s_mov_b32 s11, s7 2446; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2447; SI-NEXT: s_waitcnt lgkmcnt(0) 2448; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 2449; SI-NEXT: v_mov_b32_e32 v1, 0 2450; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 2451; SI-NEXT: s_mov_b32 s6, -1 2452; SI-NEXT: s_mov_b32 s4, s0 2453; SI-NEXT: s_mov_b32 s5, s1 2454; SI-NEXT: s_waitcnt vmcnt(0) 2455; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2456; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 2457; SI-NEXT: s_endpgm 2458; 2459; VI-LABEL: extract_byte0_to_f32: 2460; VI: ; %bb.0: 2461; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2462; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2463; VI-NEXT: s_waitcnt lgkmcnt(0) 2464; VI-NEXT: v_mov_b32_e32 v1, s3 2465; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 2466; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2467; VI-NEXT: flat_load_dword v0, v[0:1] 2468; VI-NEXT: s_mov_b32 s3, 0xf000 2469; VI-NEXT: s_mov_b32 s2, -1 2470; VI-NEXT: s_waitcnt vmcnt(0) 2471; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2472; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2473; VI-NEXT: s_endpgm 2474; 2475; GFX10-LABEL: extract_byte0_to_f32: 2476; GFX10: ; %bb.0: 2477; 
GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2478; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2479; GFX10-NEXT: v_mov_b32_e32 v1, 0 2480; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2481; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 2482; GFX10-NEXT: s_waitcnt vmcnt(0) 2483; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2484; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 2485; GFX10-NEXT: s_endpgm 2486; 2487; GFX9-LABEL: extract_byte0_to_f32: 2488; GFX9: ; %bb.0: 2489; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2490; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2491; GFX9-NEXT: v_mov_b32_e32 v1, 0 2492; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2493; GFX9-NEXT: global_load_dword v0, v0, s[2:3] 2494; GFX9-NEXT: s_waitcnt vmcnt(0) 2495; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2496; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 2497; GFX9-NEXT: s_endpgm 2498; 2499; GFX11-LABEL: extract_byte0_to_f32: 2500; GFX11: ; %bb.0: 2501; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2502; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 2503; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2504; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2505; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2506; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 2507; GFX11-NEXT: s_waitcnt vmcnt(0) 2508; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2509; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 2510; GFX11-NEXT: s_endpgm 2511 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2512 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid 2513 %val = load i32, ptr addrspace(1) %gep 2514 %and = and i32 %val, 255 2515 %cvt = uitofp i32 %and to float 2516 store float %cvt, ptr addrspace(1) %out 2517 ret void 2518} 2519 2520define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 2521; SI-LABEL: extract_byte1_to_f32: 2522; SI: ; %bb.0: 2523; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2524; SI-NEXT: s_mov_b32 s7, 0xf000 2525; SI-NEXT: s_mov_b32 s10, 0 2526; SI-NEXT: s_mov_b32 s11, s7 2527; 
SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2528; SI-NEXT: s_waitcnt lgkmcnt(0) 2529; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 2530; SI-NEXT: v_mov_b32_e32 v1, 0 2531; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 2532; SI-NEXT: s_mov_b32 s6, -1 2533; SI-NEXT: s_mov_b32 s4, s0 2534; SI-NEXT: s_mov_b32 s5, s1 2535; SI-NEXT: s_waitcnt vmcnt(0) 2536; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 2537; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 2538; SI-NEXT: s_endpgm 2539; 2540; VI-LABEL: extract_byte1_to_f32: 2541; VI: ; %bb.0: 2542; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2543; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2544; VI-NEXT: s_waitcnt lgkmcnt(0) 2545; VI-NEXT: v_mov_b32_e32 v1, s3 2546; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 2547; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2548; VI-NEXT: flat_load_dword v0, v[0:1] 2549; VI-NEXT: s_mov_b32 s3, 0xf000 2550; VI-NEXT: s_mov_b32 s2, -1 2551; VI-NEXT: s_waitcnt vmcnt(0) 2552; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 2553; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2554; VI-NEXT: s_endpgm 2555; 2556; GFX10-LABEL: extract_byte1_to_f32: 2557; GFX10: ; %bb.0: 2558; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2559; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2560; GFX10-NEXT: v_mov_b32_e32 v1, 0 2561; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2562; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 2563; GFX10-NEXT: s_waitcnt vmcnt(0) 2564; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 2565; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 2566; GFX10-NEXT: s_endpgm 2567; 2568; GFX9-LABEL: extract_byte1_to_f32: 2569; GFX9: ; %bb.0: 2570; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2571; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2572; GFX9-NEXT: v_mov_b32_e32 v1, 0 2573; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2574; GFX9-NEXT: global_load_dword v0, v0, s[2:3] 2575; GFX9-NEXT: s_waitcnt vmcnt(0) 2576; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 2577; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 2578; GFX9-NEXT: s_endpgm 2579; 2580; GFX11-LABEL: 
extract_byte1_to_f32: 2581; GFX11: ; %bb.0: 2582; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2583; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 2584; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2585; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2586; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2587; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 2588; GFX11-NEXT: s_waitcnt vmcnt(0) 2589; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 2590; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 2591; GFX11-NEXT: s_endpgm 2592 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2593 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid 2594 %val = load i32, ptr addrspace(1) %gep 2595 %srl = lshr i32 %val, 8 2596 %and = and i32 %srl, 255 2597 %cvt = uitofp i32 %and to float 2598 store float %cvt, ptr addrspace(1) %out 2599 ret void 2600} 2601 2602define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 2603; SI-LABEL: extract_byte2_to_f32: 2604; SI: ; %bb.0: 2605; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2606; SI-NEXT: s_mov_b32 s7, 0xf000 2607; SI-NEXT: s_mov_b32 s10, 0 2608; SI-NEXT: s_mov_b32 s11, s7 2609; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2610; SI-NEXT: s_waitcnt lgkmcnt(0) 2611; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 2612; SI-NEXT: v_mov_b32_e32 v1, 0 2613; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 2614; SI-NEXT: s_mov_b32 s6, -1 2615; SI-NEXT: s_mov_b32 s4, s0 2616; SI-NEXT: s_mov_b32 s5, s1 2617; SI-NEXT: s_waitcnt vmcnt(0) 2618; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 2619; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 2620; SI-NEXT: s_endpgm 2621; 2622; VI-LABEL: extract_byte2_to_f32: 2623; VI: ; %bb.0: 2624; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2625; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2626; VI-NEXT: s_waitcnt lgkmcnt(0) 2627; VI-NEXT: v_mov_b32_e32 v1, s3 2628; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 2629; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2630; VI-NEXT: flat_load_dword v0, v[0:1] 
2631; VI-NEXT: s_mov_b32 s3, 0xf000 2632; VI-NEXT: s_mov_b32 s2, -1 2633; VI-NEXT: s_waitcnt vmcnt(0) 2634; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 2635; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2636; VI-NEXT: s_endpgm 2637; 2638; GFX10-LABEL: extract_byte2_to_f32: 2639; GFX10: ; %bb.0: 2640; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2641; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2642; GFX10-NEXT: v_mov_b32_e32 v1, 0 2643; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2644; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 2645; GFX10-NEXT: s_waitcnt vmcnt(0) 2646; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 2647; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 2648; GFX10-NEXT: s_endpgm 2649; 2650; GFX9-LABEL: extract_byte2_to_f32: 2651; GFX9: ; %bb.0: 2652; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2653; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2654; GFX9-NEXT: v_mov_b32_e32 v1, 0 2655; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2656; GFX9-NEXT: global_load_dword v0, v0, s[2:3] 2657; GFX9-NEXT: s_waitcnt vmcnt(0) 2658; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 2659; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 2660; GFX9-NEXT: s_endpgm 2661; 2662; GFX11-LABEL: extract_byte2_to_f32: 2663; GFX11: ; %bb.0: 2664; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2665; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 2666; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2667; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2668; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2669; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 2670; GFX11-NEXT: s_waitcnt vmcnt(0) 2671; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 2672; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 2673; GFX11-NEXT: s_endpgm 2674 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2675 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid 2676 %val = load i32, ptr addrspace(1) %gep 2677 %srl = lshr i32 %val, 16 2678 %and = and i32 %srl, 255 2679 %cvt = uitofp i32 %and to float 2680 store float %cvt, ptr addrspace(1) %out 2681 ret void 2682} 2683 2684define 
amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 2685; SI-LABEL: extract_byte3_to_f32: 2686; SI: ; %bb.0: 2687; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2688; SI-NEXT: s_mov_b32 s7, 0xf000 2689; SI-NEXT: s_mov_b32 s10, 0 2690; SI-NEXT: s_mov_b32 s11, s7 2691; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2692; SI-NEXT: s_waitcnt lgkmcnt(0) 2693; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 2694; SI-NEXT: v_mov_b32_e32 v1, 0 2695; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 2696; SI-NEXT: s_mov_b32 s6, -1 2697; SI-NEXT: s_mov_b32 s4, s0 2698; SI-NEXT: s_mov_b32 s5, s1 2699; SI-NEXT: s_waitcnt vmcnt(0) 2700; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 2701; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 2702; SI-NEXT: s_endpgm 2703; 2704; VI-LABEL: extract_byte3_to_f32: 2705; VI: ; %bb.0: 2706; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2707; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2708; VI-NEXT: s_waitcnt lgkmcnt(0) 2709; VI-NEXT: v_mov_b32_e32 v1, s3 2710; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 2711; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2712; VI-NEXT: flat_load_dword v0, v[0:1] 2713; VI-NEXT: s_mov_b32 s3, 0xf000 2714; VI-NEXT: s_mov_b32 s2, -1 2715; VI-NEXT: s_waitcnt vmcnt(0) 2716; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 2717; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2718; VI-NEXT: s_endpgm 2719; 2720; GFX10-LABEL: extract_byte3_to_f32: 2721; GFX10: ; %bb.0: 2722; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2723; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2724; GFX10-NEXT: v_mov_b32_e32 v1, 0 2725; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2726; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 2727; GFX10-NEXT: s_waitcnt vmcnt(0) 2728; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 2729; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 2730; GFX10-NEXT: s_endpgm 2731; 2732; GFX9-LABEL: extract_byte3_to_f32: 2733; GFX9: ; %bb.0: 2734; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2735; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 
2736; GFX9-NEXT: v_mov_b32_e32 v1, 0 2737; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2738; GFX9-NEXT: global_load_dword v0, v0, s[2:3] 2739; GFX9-NEXT: s_waitcnt vmcnt(0) 2740; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 2741; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 2742; GFX9-NEXT: s_endpgm 2743; 2744; GFX11-LABEL: extract_byte3_to_f32: 2745; GFX11: ; %bb.0: 2746; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2747; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 2748; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2749; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2750; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2751; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 2752; GFX11-NEXT: s_waitcnt vmcnt(0) 2753; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 2754; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 2755; GFX11-NEXT: s_endpgm 2756 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2757 %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid 2758 %val = load i32, ptr addrspace(1) %gep 2759 %srl = lshr i32 %val, 24 2760 %and = and i32 %srl, 255 2761 %cvt = uitofp i32 %and to float 2762 store float %cvt, ptr addrspace(1) %out 2763 ret void 2764} 2765 2766define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) { 2767; SI-LABEL: cvt_ubyte0_or_multiuse: 2768; SI: ; %bb.0: ; %bb 2769; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2770; SI-NEXT: s_mov_b32 s7, 0xf000 2771; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2772; SI-NEXT: v_mov_b32_e32 v1, 0 2773; SI-NEXT: s_mov_b32 s6, -1 2774; SI-NEXT: s_waitcnt lgkmcnt(0) 2775; SI-NEXT: s_mov_b32 s4, s2 2776; SI-NEXT: s_mov_b32 s5, s3 2777; SI-NEXT: s_mov_b32 s2, 0 2778; SI-NEXT: s_mov_b32 s3, s7 2779; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 2780; SI-NEXT: s_waitcnt vmcnt(0) 2781; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 2782; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 2783; SI-NEXT: v_add_f32_e32 v0, v0, v1 2784; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 2785; SI-NEXT: s_endpgm 2786; 2787; VI-LABEL: 
cvt_ubyte0_or_multiuse: 2788; VI: ; %bb.0: ; %bb 2789; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2790; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2791; VI-NEXT: s_mov_b32 s7, 0xf000 2792; VI-NEXT: s_mov_b32 s6, -1 2793; VI-NEXT: s_waitcnt lgkmcnt(0) 2794; VI-NEXT: v_mov_b32_e32 v1, s1 2795; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 2796; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2797; VI-NEXT: flat_load_dword v0, v[0:1] 2798; VI-NEXT: s_mov_b32 s4, s2 2799; VI-NEXT: s_mov_b32 s5, s3 2800; VI-NEXT: s_waitcnt vmcnt(0) 2801; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 2802; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 2803; VI-NEXT: v_add_f32_e32 v0, v0, v1 2804; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 2805; VI-NEXT: s_endpgm 2806; 2807; GFX10-LABEL: cvt_ubyte0_or_multiuse: 2808; GFX10: ; %bb.0: ; %bb 2809; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2810; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2811; GFX10-NEXT: v_mov_b32_e32 v2, 0 2812; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2813; GFX10-NEXT: global_load_dword v0, v0, s[0:1] 2814; GFX10-NEXT: s_waitcnt vmcnt(0) 2815; GFX10-NEXT: v_or_b32_e32 v0, 0x80000001, v0 2816; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 2817; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 2818; GFX10-NEXT: global_store_dword v2, v0, s[2:3] 2819; GFX10-NEXT: s_endpgm 2820; 2821; GFX9-LABEL: cvt_ubyte0_or_multiuse: 2822; GFX9: ; %bb.0: ; %bb 2823; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2824; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2825; GFX9-NEXT: v_mov_b32_e32 v1, 0 2826; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2827; GFX9-NEXT: global_load_dword v0, v0, s[0:1] 2828; GFX9-NEXT: s_waitcnt vmcnt(0) 2829; GFX9-NEXT: v_or_b32_e32 v0, 0x80000001, v0 2830; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 2831; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 2832; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 2833; GFX9-NEXT: s_endpgm 2834; 2835; GFX11-LABEL: cvt_ubyte0_or_multiuse: 2836; GFX11: ; %bb.0: ; %bb 2837; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2838; GFX11-NEXT: v_and_b32_e32 v0, 
0x3ff, v0 2839; GFX11-NEXT: v_mov_b32_e32 v2, 0 2840; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) 2841; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2842; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2843; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] 2844; GFX11-NEXT: s_waitcnt vmcnt(0) 2845; GFX11-NEXT: v_or_b32_e32 v0, 0x80000001, v0 2846; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 2847; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2848; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 2849; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] 2850; GFX11-NEXT: s_endpgm 2851bb: 2852 %lid = tail call i32 @llvm.amdgcn.workitem.id.x() 2853 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %lid 2854 %load = load i32, ptr addrspace(1) %gep 2855 %or = or i32 %load, -2147483647 2856 %and = and i32 %or, 255 2857 %uitofp = uitofp i32 %and to float 2858 %cast = bitcast i32 %or to float 2859 %add = fadd float %cast, %uitofp 2860 store float %add, ptr addrspace(1) %out 2861 ret void 2862} 2863 2864%Vec = type { [4 x i8] } 2865 2866define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr { 2867; SI-LABEL: cvt_f32_ubyte0_vector: 2868; SI: ; %bb.0: ; %entry 2869; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 2870; SI-NEXT: s_mov_b32 s3, 0xf000 2871; SI-NEXT: s_mov_b32 s2, -1 2872; SI-NEXT: s_waitcnt lgkmcnt(0) 2873; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3 2874; SI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:2 2875; SI-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:1 2876; SI-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 2877; SI-NEXT: s_load_dword s0, s[0:1], 0x0 2878; SI-NEXT: s_waitcnt vmcnt(3) 2879; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2880; SI-NEXT: s_waitcnt lgkmcnt(0) 2881; SI-NEXT: v_fma_f32 v0, s0, v0, 0.5 2882; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 2883; SI-NEXT: s_waitcnt vmcnt(2) 2884; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 2885; SI-NEXT: s_waitcnt vmcnt(2) 2886; SI-NEXT: buffer_store_byte v2, off, s[0:3], 0 2887; SI-NEXT: 
s_waitcnt vmcnt(2) 2888; SI-NEXT: buffer_store_byte v3, off, s[0:3], 0 2889; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 2890; SI-NEXT: s_endpgm 2891; 2892; VI-LABEL: cvt_f32_ubyte0_vector: 2893; VI: ; %bb.0: ; %entry 2894; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 2895; VI-NEXT: s_mov_b32 s3, 0xf000 2896; VI-NEXT: s_mov_b32 s2, -1 2897; VI-NEXT: s_waitcnt lgkmcnt(0) 2898; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3 2899; VI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:2 2900; VI-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:1 2901; VI-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 2902; VI-NEXT: s_load_dword s0, s[0:1], 0x0 2903; VI-NEXT: s_waitcnt vmcnt(3) 2904; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 2905; VI-NEXT: s_waitcnt lgkmcnt(0) 2906; VI-NEXT: v_mul_f32_e32 v0, s0, v0 2907; VI-NEXT: v_add_f32_e32 v0, 0.5, v0 2908; VI-NEXT: v_cvt_i32_f32_e32 v0, v0 2909; VI-NEXT: s_waitcnt vmcnt(2) 2910; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 2911; VI-NEXT: s_waitcnt vmcnt(2) 2912; VI-NEXT: buffer_store_byte v2, off, s[0:3], 0 2913; VI-NEXT: s_waitcnt vmcnt(2) 2914; VI-NEXT: buffer_store_byte v3, off, s[0:3], 0 2915; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 2916; VI-NEXT: s_endpgm 2917; 2918; GFX10-LABEL: cvt_f32_ubyte0_vector: 2919; GFX10: ; %bb.0: ; %entry 2920; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 2921; GFX10-NEXT: v_mov_b32_e32 v0, 0 2922; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2923; GFX10-NEXT: s_clause 0x3 2924; GFX10-NEXT: global_load_ubyte v1, v0, s[0:1] offset:3 2925; GFX10-NEXT: global_load_ubyte v2, v0, s[0:1] offset:2 2926; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1 2927; GFX10-NEXT: global_load_ubyte v4, v0, s[0:1] 2928; GFX10-NEXT: s_waitcnt_depctr 0xffe3 2929; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0 2930; GFX10-NEXT: s_waitcnt vmcnt(3) 2931; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v1 2932; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2933; GFX10-NEXT: v_fma_f32 v0, s0, v0, 0.5 2934; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 2935; 
GFX10-NEXT: s_waitcnt vmcnt(2) 2936; GFX10-NEXT: global_store_byte v[0:1], v2, off 2937; GFX10-NEXT: s_waitcnt vmcnt(1) 2938; GFX10-NEXT: global_store_byte v[0:1], v3, off 2939; GFX10-NEXT: s_waitcnt vmcnt(0) 2940; GFX10-NEXT: global_store_byte v[0:1], v4, off 2941; GFX10-NEXT: global_store_byte v[0:1], v0, off 2942; GFX10-NEXT: s_endpgm 2943; 2944; GFX9-LABEL: cvt_f32_ubyte0_vector: 2945; GFX9: ; %bb.0: ; %entry 2946; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off 2947; GFX9-NEXT: s_waitcnt vmcnt(0) 2948; GFX9-NEXT: global_load_ubyte v2, v[0:1], off offset:3 2949; GFX9-NEXT: global_load_dword v3, v[0:1], off 2950; GFX9-NEXT: global_load_ubyte v4, v[0:1], off offset:2 2951; GFX9-NEXT: global_load_ubyte v5, v[0:1], off offset:1 2952; GFX9-NEXT: global_load_ubyte v6, v[0:1], off 2953; GFX9-NEXT: s_waitcnt vmcnt(4) 2954; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 2955; GFX9-NEXT: s_waitcnt vmcnt(3) 2956; GFX9-NEXT: v_fma_f32 v0, v3, v0, 0.5 2957; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 2958; GFX9-NEXT: s_waitcnt vmcnt(2) 2959; GFX9-NEXT: global_store_byte v[0:1], v4, off 2960; GFX9-NEXT: s_waitcnt vmcnt(2) 2961; GFX9-NEXT: global_store_byte v[0:1], v5, off 2962; GFX9-NEXT: s_waitcnt vmcnt(2) 2963; GFX9-NEXT: global_store_byte v[0:1], v6, off 2964; GFX9-NEXT: global_store_byte v[0:1], v0, off 2965; GFX9-NEXT: s_endpgm 2966; 2967; GFX11-LABEL: cvt_f32_ubyte0_vector: 2968; GFX11: ; %bb.0: ; %entry 2969; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 2970; GFX11-NEXT: v_mov_b32_e32 v0, 0 2971; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2972; GFX11-NEXT: s_clause 0x3 2973; GFX11-NEXT: global_load_u8 v1, v0, s[0:1] offset:3 2974; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] offset:2 2975; GFX11-NEXT: global_load_u8 v3, v0, s[0:1] offset:1 2976; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] 2977; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 2978; GFX11-NEXT: s_waitcnt vmcnt(3) 2979; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 2980; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2981; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2982; GFX11-NEXT: v_fma_f32 v1, s0, v1, 0.5 2983; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1 2984; GFX11-NEXT: s_waitcnt vmcnt(0) 2985; GFX11-NEXT: s_clause 0x3 2986; GFX11-NEXT: global_store_b8 v[0:1], v2, off 2987; GFX11-NEXT: global_store_b8 v[0:1], v3, off 2988; GFX11-NEXT: global_store_b8 v[0:1], v0, off 2989; GFX11-NEXT: global_store_b8 v[0:1], v1, off 2990; GFX11-NEXT: s_endpgm 2991entry: 2992 br label %for.body.i 2993 2994for.body.i: ; preds = %for.body.i, %entry 2995 %retval.sroa.0.0.copyload = load ptr, ptr addrspace(1) undef, align 8 2996 %add.ptr = getelementptr inbounds %Vec, ptr %retval.sroa.0.0.copyload, i64 undef 2997 %retval.sroa.0.0..sroa_cast_adr = addrspacecast ptr %add.ptr to ptr addrspace(1) 2998 %retval.sroa.0.0.copyload.i = load i32, ptr addrspace(1) %retval.sroa.0.0..sroa_cast_adr, align 1 2999 %p1.sroa.6.0.extract.shift = lshr i32 %retval.sroa.0.0.copyload.i, 24 3000 %p1.sroa.6.0.extract.trunc = trunc i32 %p1.sroa.6.0.extract.shift to i8 3001 %conv12 = uitofp i8 %p1.sroa.6.0.extract.trunc to float 3002 %0 = load float, ptr addrspace(1) undef, align 8 3003 %mul = fmul contract float %0, %conv12 3004 %add = fadd contract float %mul, 5.000000e-01 3005 %conv13 = fptoui float %add to i8 3006 %retval.sroa.4.0.insert.ext = zext i8 %conv13 to i32 3007 %retval.sroa.4.0.insert.shift = shl nuw i32 %retval.sroa.4.0.insert.ext, 24 3008 %retval.sroa.3.0.insert.ext = and i32 %retval.sroa.0.0.copyload.i, 16711680 3009 %retval.sroa.3.0.insert.insert = or i32 %retval.sroa.4.0.insert.shift, %retval.sroa.3.0.insert.ext 3010 %retval.sroa.2.0.insert.ext = and i32 %retval.sroa.0.0.copyload.i, 65280 3011 %retval.sroa.2.0.insert.insert = or i32 %retval.sroa.3.0.insert.insert, %retval.sroa.2.0.insert.ext 3012 %retval.sroa.0.0.insert.ext = and i32 %retval.sroa.0.0.copyload.i, 255 3013 %retval.sroa.0.0.insert.insert = or i32 %retval.sroa.2.0.insert.insert, %retval.sroa.0.0.insert.ext 3014 store i32 
%retval.sroa.0.0.insert.insert, ptr addrspace(1) undef, align 1 3015 ret void 3016} 3017 3018!llvm.module.flags = !{!0} 3019!0 = !{i32 1, !"amdhsa_code_object_version", i32 500} 3020