1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=CIVI,CI %s 3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=CIVI,VI %s 4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11 %s 5 6; half args should be promoted to float for CI and lower. 7 8define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 { 9; CI-LABEL: load_f16_arg: 10; CI: ; %bb.0: 11; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 12; CI-NEXT: s_load_dword s2, s[8:9], 0x2 13; CI-NEXT: s_waitcnt lgkmcnt(0) 14; CI-NEXT: v_mov_b32_e32 v0, s0 15; CI-NEXT: v_mov_b32_e32 v1, s1 16; CI-NEXT: v_mov_b32_e32 v2, s2 17; CI-NEXT: flat_store_short v[0:1], v2 18; CI-NEXT: s_endpgm 19; 20; VI-LABEL: load_f16_arg: 21; VI: ; %bb.0: 22; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 23; VI-NEXT: s_load_dword s2, s[8:9], 0x8 24; VI-NEXT: s_waitcnt lgkmcnt(0) 25; VI-NEXT: v_mov_b32_e32 v0, s0 26; VI-NEXT: v_mov_b32_e32 v1, s1 27; VI-NEXT: v_mov_b32_e32 v2, s2 28; VI-NEXT: flat_store_short v[0:1], v2 29; VI-NEXT: s_endpgm 30; 31; GFX11-LABEL: load_f16_arg: 32; GFX11: ; %bb.0: 33; GFX11-NEXT: s_clause 0x1 34; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 35; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 36; GFX11-NEXT: s_waitcnt lgkmcnt(0) 37; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 38; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 39; GFX11-NEXT: s_endpgm 40 store half %arg, ptr addrspace(1) %out 41 ret void 42} 43 44define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { 45; 
CI-LABEL: load_v2f16_arg: 46; CI: ; %bb.0: 47; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 48; CI-NEXT: s_load_dword s2, s[8:9], 0x2 49; CI-NEXT: s_waitcnt lgkmcnt(0) 50; CI-NEXT: v_mov_b32_e32 v0, s0 51; CI-NEXT: v_mov_b32_e32 v1, s1 52; CI-NEXT: v_mov_b32_e32 v2, s2 53; CI-NEXT: flat_store_dword v[0:1], v2 54; CI-NEXT: s_endpgm 55; 56; VI-LABEL: load_v2f16_arg: 57; VI: ; %bb.0: 58; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 59; VI-NEXT: s_load_dword s2, s[8:9], 0x8 60; VI-NEXT: s_waitcnt lgkmcnt(0) 61; VI-NEXT: v_mov_b32_e32 v0, s0 62; VI-NEXT: v_mov_b32_e32 v1, s1 63; VI-NEXT: v_mov_b32_e32 v2, s2 64; VI-NEXT: flat_store_dword v[0:1], v2 65; VI-NEXT: s_endpgm 66; 67; GFX11-LABEL: load_v2f16_arg: 68; GFX11: ; %bb.0: 69; GFX11-NEXT: s_clause 0x1 70; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 71; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 72; GFX11-NEXT: s_waitcnt lgkmcnt(0) 73; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 74; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 75; GFX11-NEXT: s_endpgm 76 store <2 x half> %arg, ptr addrspace(1) %out 77 ret void 78} 79 80define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { 81; CIVI-LABEL: load_v3f16_arg: 82; CIVI: ; %bb.0: 83; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 84; CIVI-NEXT: s_waitcnt lgkmcnt(0) 85; CIVI-NEXT: s_add_u32 s4, s0, 4 86; CIVI-NEXT: s_addc_u32 s5, s1, 0 87; CIVI-NEXT: v_mov_b32_e32 v2, s4 88; CIVI-NEXT: v_mov_b32_e32 v4, s3 89; CIVI-NEXT: v_mov_b32_e32 v0, s0 90; CIVI-NEXT: v_mov_b32_e32 v3, s5 91; CIVI-NEXT: v_mov_b32_e32 v1, s1 92; CIVI-NEXT: v_mov_b32_e32 v5, s2 93; CIVI-NEXT: flat_store_short v[2:3], v4 94; CIVI-NEXT: flat_store_dword v[0:1], v5 95; CIVI-NEXT: s_endpgm 96; 97; GFX11-LABEL: load_v3f16_arg: 98; GFX11: ; %bb.0: 99; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 100; GFX11-NEXT: s_waitcnt lgkmcnt(0) 101; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3 102; GFX11-NEXT: v_mov_b32_e32 v2, s2 103; GFX11-NEXT: s_clause 0x1 104; 
GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:4 105; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] 106; GFX11-NEXT: s_endpgm 107 store <3 x half> %arg, ptr addrspace(1) %out 108 ret void 109} 110 111 112; FIXME: Why not one load? NOTE(review): the checks below already show a single s_load_dwordx4 (CIVI) / s_load_b128 (GFX11) fetching both %out and %arg, so this FIXME looks stale - confirm and drop. 113define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { 114; CIVI-LABEL: load_v4f16_arg: 115; CIVI: ; %bb.0: 116; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 117; CIVI-NEXT: s_waitcnt lgkmcnt(0) 118; CIVI-NEXT: v_mov_b32_e32 v0, s0 119; CIVI-NEXT: v_mov_b32_e32 v2, s2 120; CIVI-NEXT: v_mov_b32_e32 v1, s1 121; CIVI-NEXT: v_mov_b32_e32 v3, s3 122; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 123; CIVI-NEXT: s_endpgm 124; 125; GFX11-LABEL: load_v4f16_arg: 126; GFX11: ; %bb.0: 127; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 128; GFX11-NEXT: v_mov_b32_e32 v2, 0 129; GFX11-NEXT: s_waitcnt lgkmcnt(0) 130; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 131; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 132; GFX11-NEXT: s_endpgm 133 store <4 x half> %arg, ptr addrspace(1) %out 134 ret void 135} 136 137define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { 138; CI-LABEL: load_v8f16_arg: 139; CI: ; %bb.0: 140; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 141; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 142; CI-NEXT: s_waitcnt lgkmcnt(0) 143; CI-NEXT: v_mov_b32_e32 v4, s4 144; CI-NEXT: v_mov_b32_e32 v0, s0 145; CI-NEXT: v_mov_b32_e32 v5, s5 146; CI-NEXT: v_mov_b32_e32 v1, s1 147; CI-NEXT: v_mov_b32_e32 v2, s2 148; CI-NEXT: v_mov_b32_e32 v3, s3 149; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 150; CI-NEXT: s_endpgm 151; 152; VI-LABEL: load_v8f16_arg: 153; VI: ; %bb.0: 154; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 155; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 156; VI-NEXT: s_waitcnt lgkmcnt(0) 157; VI-NEXT: v_mov_b32_e32 v4, s4 158; VI-NEXT: v_mov_b32_e32 v0, s0 159; VI-NEXT: v_mov_b32_e32 v5, s5 160; VI-NEXT: v_mov_b32_e32 v1, s1 161; VI-NEXT: v_mov_b32_e32 v2, 
s2 162; VI-NEXT: v_mov_b32_e32 v3, s3 163; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 164; VI-NEXT: s_endpgm 165; 166; GFX11-LABEL: load_v8f16_arg: 167; GFX11: ; %bb.0: 168; GFX11-NEXT: s_clause 0x1 169; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 170; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 171; GFX11-NEXT: v_mov_b32_e32 v4, 0 172; GFX11-NEXT: s_waitcnt lgkmcnt(0) 173; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 174; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 175; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] 176; GFX11-NEXT: s_endpgm 177 store <8 x half> %arg, ptr addrspace(1) %out 178 ret void 179} 180 181define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %in) #0 { 182; CI-LABEL: extload_v2f16_arg: 183; CI: ; %bb.0: 184; CI-NEXT: s_load_dword s2, s[8:9], 0x2 185; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 186; CI-NEXT: s_waitcnt lgkmcnt(0) 187; CI-NEXT: s_lshr_b32 s3, s2, 16 188; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 189; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 190; CI-NEXT: v_mov_b32_e32 v3, s1 191; CI-NEXT: v_mov_b32_e32 v2, s0 192; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 193; CI-NEXT: s_endpgm 194; 195; VI-LABEL: extload_v2f16_arg: 196; VI: ; %bb.0: 197; VI-NEXT: s_load_dword s2, s[8:9], 0x8 198; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 199; VI-NEXT: s_waitcnt lgkmcnt(0) 200; VI-NEXT: s_lshr_b32 s3, s2, 16 201; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 202; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 203; VI-NEXT: v_mov_b32_e32 v3, s1 204; VI-NEXT: v_mov_b32_e32 v2, s0 205; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 206; VI-NEXT: s_endpgm 207; 208; GFX11-LABEL: extload_v2f16_arg: 209; GFX11: ; %bb.0: 210; GFX11-NEXT: s_clause 0x1 211; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 212; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 213; GFX11-NEXT: v_mov_b32_e32 v2, 0 214; GFX11-NEXT: s_waitcnt lgkmcnt(0) 215; GFX11-NEXT: s_lshr_b32 s3, s2, 16 216; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 217; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 
218; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 219; GFX11-NEXT: s_endpgm 220 %fpext = fpext <2 x half> %in to <2 x float> 221 store <2 x float> %fpext, ptr addrspace(1) %out 222 ret void 223} 224 225define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %arg) #0 { 226; CI-LABEL: extload_f16_to_f32_arg: 227; CI: ; %bb.0: 228; CI-NEXT: s_load_dword s2, s[8:9], 0x2 229; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 230; CI-NEXT: s_waitcnt lgkmcnt(0) 231; CI-NEXT: v_cvt_f32_f16_e32 v2, s2 232; CI-NEXT: v_mov_b32_e32 v0, s0 233; CI-NEXT: v_mov_b32_e32 v1, s1 234; CI-NEXT: flat_store_dword v[0:1], v2 235; CI-NEXT: s_endpgm 236; 237; VI-LABEL: extload_f16_to_f32_arg: 238; VI: ; %bb.0: 239; VI-NEXT: s_load_dword s2, s[8:9], 0x8 240; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 241; VI-NEXT: s_waitcnt lgkmcnt(0) 242; VI-NEXT: v_cvt_f32_f16_e32 v2, s2 243; VI-NEXT: v_mov_b32_e32 v0, s0 244; VI-NEXT: v_mov_b32_e32 v1, s1 245; VI-NEXT: flat_store_dword v[0:1], v2 246; VI-NEXT: s_endpgm 247; 248; GFX11-LABEL: extload_f16_to_f32_arg: 249; GFX11: ; %bb.0: 250; GFX11-NEXT: s_clause 0x1 251; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 252; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 253; GFX11-NEXT: v_mov_b32_e32 v0, 0 254; GFX11-NEXT: s_waitcnt lgkmcnt(0) 255; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 256; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 257; GFX11-NEXT: s_endpgm 258 %ext = fpext half %arg to float 259 store float %ext, ptr addrspace(1) %out 260 ret void 261} 262 263define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { 264; CI-LABEL: extload_v2f16_to_v2f32_arg: 265; CI: ; %bb.0: 266; CI-NEXT: s_load_dword s2, s[8:9], 0x2 267; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 268; CI-NEXT: s_waitcnt lgkmcnt(0) 269; CI-NEXT: s_lshr_b32 s3, s2, 16 270; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 271; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 272; CI-NEXT: v_mov_b32_e32 v3, s1 273; CI-NEXT: v_mov_b32_e32 v2, s0 274; CI-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] 275; CI-NEXT: s_endpgm 276; 277; VI-LABEL: extload_v2f16_to_v2f32_arg: 278; VI: ; %bb.0: 279; VI-NEXT: s_load_dword s2, s[8:9], 0x8 280; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 281; VI-NEXT: s_waitcnt lgkmcnt(0) 282; VI-NEXT: s_lshr_b32 s3, s2, 16 283; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 284; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 285; VI-NEXT: v_mov_b32_e32 v3, s1 286; VI-NEXT: v_mov_b32_e32 v2, s0 287; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 288; VI-NEXT: s_endpgm 289; 290; GFX11-LABEL: extload_v2f16_to_v2f32_arg: 291; GFX11: ; %bb.0: 292; GFX11-NEXT: s_clause 0x1 293; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 294; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 295; GFX11-NEXT: v_mov_b32_e32 v2, 0 296; GFX11-NEXT: s_waitcnt lgkmcnt(0) 297; GFX11-NEXT: s_lshr_b32 s3, s2, 16 298; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 299; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 300; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 301; GFX11-NEXT: s_endpgm 302 %ext = fpext <2 x half> %arg to <2 x float> 303 store <2 x float> %ext, ptr addrspace(1) %out 304 ret void 305} 306 307define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { 308; CI-LABEL: extload_v3f16_to_v3f32_arg: 309; CI: ; %bb.0: 310; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 311; CI-NEXT: s_waitcnt lgkmcnt(0) 312; CI-NEXT: s_lshr_b32 s4, s2, 16 313; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 314; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 315; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 316; CI-NEXT: v_mov_b32_e32 v4, s1 317; CI-NEXT: v_mov_b32_e32 v3, s0 318; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 319; CI-NEXT: s_endpgm 320; 321; VI-LABEL: extload_v3f16_to_v3f32_arg: 322; VI: ; %bb.0: 323; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 324; VI-NEXT: s_waitcnt lgkmcnt(0) 325; VI-NEXT: s_lshr_b32 s4, s2, 16 326; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 327; VI-NEXT: v_cvt_f32_f16_e32 v1, s4 328; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 329; VI-NEXT: v_mov_b32_e32 v4, s1 330; VI-NEXT: 
v_mov_b32_e32 v3, s0 331; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 332; VI-NEXT: s_endpgm 333; 334; GFX11-LABEL: extload_v3f16_to_v3f32_arg: 335; GFX11: ; %bb.0: 336; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 337; GFX11-NEXT: v_mov_b32_e32 v3, 0 338; GFX11-NEXT: s_waitcnt lgkmcnt(0) 339; GFX11-NEXT: s_lshr_b32 s4, s2, 16 340; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 341; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 342; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 343; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] 344; GFX11-NEXT: s_endpgm 345 %ext = fpext <3 x half> %arg to <3 x float> 346 store <3 x float> %ext, ptr addrspace(1) %out 347 ret void 348} 349 350define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { 351; CI-LABEL: extload_v4f16_to_v4f32_arg: 352; CI: ; %bb.0: 353; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 354; CI-NEXT: s_waitcnt lgkmcnt(0) 355; CI-NEXT: s_lshr_b32 s4, s3, 16 356; CI-NEXT: s_lshr_b32 s5, s2, 16 357; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 358; CI-NEXT: v_cvt_f32_f16_e32 v3, s4 359; CI-NEXT: v_cvt_f32_f16_e32 v1, s5 360; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 361; CI-NEXT: v_mov_b32_e32 v5, s1 362; CI-NEXT: v_mov_b32_e32 v4, s0 363; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 364; CI-NEXT: s_endpgm 365; 366; VI-LABEL: extload_v4f16_to_v4f32_arg: 367; VI: ; %bb.0: 368; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 369; VI-NEXT: s_waitcnt lgkmcnt(0) 370; VI-NEXT: s_lshr_b32 s4, s3, 16 371; VI-NEXT: s_lshr_b32 s5, s2, 16 372; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 373; VI-NEXT: v_cvt_f32_f16_e32 v3, s4 374; VI-NEXT: v_cvt_f32_f16_e32 v1, s5 375; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 376; VI-NEXT: v_mov_b32_e32 v5, s1 377; VI-NEXT: v_mov_b32_e32 v4, s0 378; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 379; VI-NEXT: s_endpgm 380; 381; GFX11-LABEL: extload_v4f16_to_v4f32_arg: 382; GFX11: ; %bb.0: 383; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 384; GFX11-NEXT: v_mov_b32_e32 v4, 0 385; GFX11-NEXT: s_waitcnt lgkmcnt(0) 386; GFX11-NEXT: 
s_lshr_b32 s4, s3, 16 387; GFX11-NEXT: s_lshr_b32 s5, s2, 16 388; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 389; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s4 390; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s5 391; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 392; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 393; GFX11-NEXT: s_endpgm 394 %ext = fpext <4 x half> %arg to <4 x float> 395 store <4 x float> %ext, ptr addrspace(1) %out 396 ret void 397} 398 399define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { 400; CI-LABEL: extload_v8f16_to_v8f32_arg: 401; CI: ; %bb.0: 402; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 403; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 404; CI-NEXT: s_waitcnt lgkmcnt(0) 405; CI-NEXT: s_lshr_b32 s6, s1, 16 406; CI-NEXT: s_lshr_b32 s7, s0, 16 407; CI-NEXT: s_lshr_b32 s8, s3, 16 408; CI-NEXT: v_cvt_f32_f16_e32 v3, s6 409; CI-NEXT: s_lshr_b32 s6, s2, 16 410; CI-NEXT: v_cvt_f32_f16_e32 v7, s8 411; CI-NEXT: v_cvt_f32_f16_e32 v5, s6 412; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 413; CI-NEXT: v_cvt_f32_f16_e32 v6, s3 414; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 415; CI-NEXT: s_add_u32 s0, s4, 16 416; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 417; CI-NEXT: s_addc_u32 s1, s5, 0 418; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 419; CI-NEXT: v_mov_b32_e32 v9, s1 420; CI-NEXT: v_mov_b32_e32 v8, s0 421; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 422; CI-NEXT: s_nop 0 423; CI-NEXT: v_mov_b32_e32 v4, s4 424; CI-NEXT: v_mov_b32_e32 v5, s5 425; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 426; CI-NEXT: s_endpgm 427; 428; VI-LABEL: extload_v8f16_to_v8f32_arg: 429; VI: ; %bb.0: 430; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 431; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 432; VI-NEXT: s_waitcnt lgkmcnt(0) 433; VI-NEXT: s_lshr_b32 s6, s1, 16 434; VI-NEXT: s_lshr_b32 s7, s0, 16 435; VI-NEXT: s_lshr_b32 s8, s3, 16 436; VI-NEXT: v_cvt_f32_f16_e32 v3, s6 437; VI-NEXT: s_lshr_b32 s6, s2, 16 438; VI-NEXT: v_cvt_f32_f16_e32 v7, s8 439; VI-NEXT: v_cvt_f32_f16_e32 v5, s6 440; 
VI-NEXT: v_cvt_f32_f16_e32 v0, s0 441; VI-NEXT: v_cvt_f32_f16_e32 v6, s3 442; VI-NEXT: v_cvt_f32_f16_e32 v4, s2 443; VI-NEXT: s_add_u32 s0, s4, 16 444; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 445; VI-NEXT: s_addc_u32 s1, s5, 0 446; VI-NEXT: v_cvt_f32_f16_e32 v1, s7 447; VI-NEXT: v_mov_b32_e32 v9, s1 448; VI-NEXT: v_mov_b32_e32 v8, s0 449; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 450; VI-NEXT: s_nop 0 451; VI-NEXT: v_mov_b32_e32 v4, s4 452; VI-NEXT: v_mov_b32_e32 v5, s5 453; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 454; VI-NEXT: s_endpgm 455; 456; GFX11-LABEL: extload_v8f16_to_v8f32_arg: 457; GFX11: ; %bb.0: 458; GFX11-NEXT: s_clause 0x1 459; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 460; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 461; GFX11-NEXT: v_mov_b32_e32 v8, 0 462; GFX11-NEXT: s_waitcnt lgkmcnt(0) 463; GFX11-NEXT: s_lshr_b32 s8, s3, 16 464; GFX11-NEXT: s_lshr_b32 s9, s2, 16 465; GFX11-NEXT: s_lshr_b32 s6, s1, 16 466; GFX11-NEXT: s_lshr_b32 s7, s0, 16 467; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s3 468; GFX11-NEXT: v_cvt_f32_f16_e32 v4, s2 469; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s8 470; GFX11-NEXT: v_cvt_f32_f16_e32 v5, s9 471; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s1 472; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0 473; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s6 474; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s7 475; GFX11-NEXT: s_clause 0x1 476; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 477; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] 478; GFX11-NEXT: s_endpgm 479 %ext = fpext <8 x half> %arg to <8 x float> 480 store <8 x float> %ext, ptr addrspace(1) %out 481 ret void 482} 483 484define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %arg) #0 { 485; CI-LABEL: extload_f16_to_f64_arg: 486; CI: ; %bb.0: 487; CI-NEXT: s_load_dword s0, s[8:9], 0x2 488; CI-NEXT: s_waitcnt lgkmcnt(0) 489; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 490; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 491; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 492; CI-NEXT: s_waitcnt lgkmcnt(0) 493; 
CI-NEXT: v_mov_b32_e32 v3, s1 494; CI-NEXT: v_mov_b32_e32 v2, s0 495; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 496; CI-NEXT: s_endpgm 497; 498; VI-LABEL: extload_f16_to_f64_arg: 499; VI: ; %bb.0: 500; VI-NEXT: s_load_dword s0, s[8:9], 0x8 501; VI-NEXT: s_waitcnt lgkmcnt(0) 502; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 503; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 504; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 505; VI-NEXT: s_waitcnt lgkmcnt(0) 506; VI-NEXT: v_mov_b32_e32 v3, s1 507; VI-NEXT: v_mov_b32_e32 v2, s0 508; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 509; VI-NEXT: s_endpgm 510; 511; GFX11-LABEL: extload_f16_to_f64_arg: 512; GFX11: ; %bb.0: 513; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8 514; GFX11-NEXT: v_mov_b32_e32 v2, 0 515; GFX11-NEXT: s_waitcnt lgkmcnt(0) 516; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0 517; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 518; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 519; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 520; GFX11-NEXT: s_waitcnt lgkmcnt(0) 521; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 522; GFX11-NEXT: s_endpgm 523 %ext = fpext half %arg to double 524 store double %ext, ptr addrspace(1) %out 525 ret void 526} 527 528define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 { 529; CI-LABEL: extload_v2f16_to_v2f64_arg: 530; CI: ; %bb.0: 531; CI-NEXT: s_load_dword s0, s[8:9], 0x2 532; CI-NEXT: s_waitcnt lgkmcnt(0) 533; CI-NEXT: s_lshr_b32 s1, s0, 16 534; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 535; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 536; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 537; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 538; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 539; CI-NEXT: s_waitcnt lgkmcnt(0) 540; CI-NEXT: v_mov_b32_e32 v5, s1 541; CI-NEXT: v_mov_b32_e32 v4, s0 542; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 543; CI-NEXT: s_endpgm 544; 545; VI-LABEL: extload_v2f16_to_v2f64_arg: 546; VI: ; %bb.0: 547; VI-NEXT: s_load_dword s0, s[8:9], 0x8 548; VI-NEXT: s_waitcnt lgkmcnt(0) 549; VI-NEXT: 
s_lshr_b32 s1, s0, 16 550; VI-NEXT: v_cvt_f32_f16_e32 v0, s1 551; VI-NEXT: v_cvt_f32_f16_e32 v1, s0 552; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 553; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 554; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 555; VI-NEXT: s_waitcnt lgkmcnt(0) 556; VI-NEXT: v_mov_b32_e32 v5, s1 557; VI-NEXT: v_mov_b32_e32 v4, s0 558; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 559; VI-NEXT: s_endpgm 560; 561; GFX11-LABEL: extload_v2f16_to_v2f64_arg: 562; GFX11: ; %bb.0: 563; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x8 564; GFX11-NEXT: v_mov_b32_e32 v4, 0 565; GFX11-NEXT: s_waitcnt lgkmcnt(0) 566; GFX11-NEXT: s_lshr_b32 s1, s0, 16 567; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0 568; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1 569; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 570; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) 571; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 572; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 573; GFX11-NEXT: s_waitcnt lgkmcnt(0) 574; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 575; GFX11-NEXT: s_endpgm 576 %ext = fpext <2 x half> %arg to <2 x double> 577 store <2 x double> %ext, ptr addrspace(1) %out 578 ret void 579} 580 581define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { 582; CI-LABEL: extload_v3f16_to_v3f64_arg: 583; CI: ; %bb.0: 584; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 585; CI-NEXT: s_waitcnt lgkmcnt(0) 586; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 587; CI-NEXT: s_lshr_b32 s4, s2, 16 588; CI-NEXT: v_cvt_f32_f16_e32 v1, s2 589; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 590; CI-NEXT: s_add_u32 s2, s0, 16 591; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 592; CI-NEXT: s_addc_u32 s3, s1, 0 593; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 594; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 595; CI-NEXT: v_mov_b32_e32 v7, s3 596; CI-NEXT: v_mov_b32_e32 v6, s2 597; CI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] 598; CI-NEXT: v_mov_b32_e32 v5, s1 599; CI-NEXT: v_mov_b32_e32 v4, s0 600; CI-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] 601; CI-NEXT: s_endpgm 602; 603; VI-LABEL: extload_v3f16_to_v3f64_arg: 604; VI: ; %bb.0: 605; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 606; VI-NEXT: s_waitcnt lgkmcnt(0) 607; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 608; VI-NEXT: s_lshr_b32 s4, s2, 16 609; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 610; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 611; VI-NEXT: s_add_u32 s2, s0, 16 612; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 613; VI-NEXT: s_addc_u32 s3, s1, 0 614; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 615; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 616; VI-NEXT: v_mov_b32_e32 v7, s3 617; VI-NEXT: v_mov_b32_e32 v6, s2 618; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] 619; VI-NEXT: v_mov_b32_e32 v5, s1 620; VI-NEXT: v_mov_b32_e32 v4, s0 621; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 622; VI-NEXT: s_endpgm 623; 624; GFX11-LABEL: extload_v3f16_to_v3f64_arg: 625; GFX11: ; %bb.0: 626; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 627; GFX11-NEXT: s_waitcnt lgkmcnt(0) 628; GFX11-NEXT: s_lshr_b32 s4, s2, 16 629; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s3 630; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 631; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s2 632; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 633; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 634; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v1 635; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 636; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v6 637; GFX11-NEXT: v_mov_b32_e32 v6, 0 638; GFX11-NEXT: s_clause 0x1 639; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 640; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] 641; GFX11-NEXT: s_endpgm 642 %ext = fpext <3 x half> %arg to <3 x double> 643 store <3 x double> %ext, ptr addrspace(1) %out 644 ret void 645} 646 647define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { 648; CI-LABEL: extload_v4f16_to_v4f64_arg: 649; CI: ; %bb.0: 650; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 651; CI-NEXT: s_waitcnt lgkmcnt(0) 
652; CI-NEXT: s_lshr_b32 s4, s3, 16 653; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 654; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 655; CI-NEXT: s_lshr_b32 s5, s2, 16 656; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 657; CI-NEXT: v_cvt_f32_f16_e32 v6, s5 658; CI-NEXT: s_add_u32 s2, s0, 16 659; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 660; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 661; CI-NEXT: s_addc_u32 s3, s1, 0 662; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 663; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 664; CI-NEXT: v_mov_b32_e32 v9, s3 665; CI-NEXT: v_mov_b32_e32 v8, s2 666; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 667; CI-NEXT: s_nop 0 668; CI-NEXT: v_mov_b32_e32 v0, s0 669; CI-NEXT: v_mov_b32_e32 v1, s1 670; CI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] 671; CI-NEXT: s_endpgm 672; 673; VI-LABEL: extload_v4f16_to_v4f64_arg: 674; VI: ; %bb.0: 675; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 676; VI-NEXT: s_waitcnt lgkmcnt(0) 677; VI-NEXT: s_lshr_b32 s5, s3, 16 678; VI-NEXT: v_cvt_f32_f16_e32 v0, s3 679; VI-NEXT: v_cvt_f32_f16_e32 v2, s5 680; VI-NEXT: s_lshr_b32 s4, s2, 16 681; VI-NEXT: v_cvt_f32_f16_e32 v4, s2 682; VI-NEXT: v_cvt_f32_f16_e32 v6, s4 683; VI-NEXT: s_add_u32 s2, s0, 16 684; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 685; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 686; VI-NEXT: s_addc_u32 s3, s1, 0 687; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 688; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 689; VI-NEXT: v_mov_b32_e32 v9, s3 690; VI-NEXT: v_mov_b32_e32 v8, s2 691; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 692; VI-NEXT: s_nop 0 693; VI-NEXT: v_mov_b32_e32 v0, s0 694; VI-NEXT: v_mov_b32_e32 v1, s1 695; VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] 696; VI-NEXT: s_endpgm 697; 698; GFX11-LABEL: extload_v4f16_to_v4f64_arg: 699; GFX11: ; %bb.0: 700; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 701; GFX11-NEXT: s_waitcnt lgkmcnt(0) 702; GFX11-NEXT: s_lshr_b32 s5, s3, 16 703; GFX11-NEXT: s_lshr_b32 s4, s2, 16 704; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 705; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s5 706; GFX11-NEXT: 
v_cvt_f32_f16_e32 v0, s2 707; GFX11-NEXT: v_cvt_f32_f16_e32 v8, s4 708; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 709; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v2 710; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 711; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 712; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 713; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 714; GFX11-NEXT: v_mov_b32_e32 v8, 0 715; GFX11-NEXT: s_clause 0x1 716; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 717; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] 718; GFX11-NEXT: s_endpgm 719 %ext = fpext <4 x half> %arg to <4 x double> 720 store <4 x double> %ext, ptr addrspace(1) %out 721 ret void 722} 723 724define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 { 725; CI-LABEL: extload_v8f16_to_v8f64_arg: 726; CI: ; %bb.0: 727; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4 728; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 729; CI-NEXT: s_waitcnt lgkmcnt(0) 730; CI-NEXT: s_lshr_b32 s6, s3, 16 731; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 732; CI-NEXT: v_cvt_f32_f16_e32 v12, s3 733; CI-NEXT: s_lshr_b32 s7, s2, 16 734; CI-NEXT: s_lshr_b32 s8, s1, 16 735; CI-NEXT: s_lshr_b32 s6, s0, 16 736; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 737; CI-NEXT: v_cvt_f32_f16_e32 v8, s2 738; CI-NEXT: v_cvt_f32_f16_e32 v9, s0 739; CI-NEXT: s_add_u32 s0, s4, 48 740; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 741; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v0 742; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 743; CI-NEXT: s_addc_u32 s1, s5, 0 744; CI-NEXT: v_cvt_f32_f16_e32 v4, s8 745; CI-NEXT: v_mov_b32_e32 v17, s1 746; CI-NEXT: v_mov_b32_e32 v16, s0 747; CI-NEXT: s_add_u32 s0, s4, 32 748; CI-NEXT: v_cvt_f32_f16_e32 v2, s6 749; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1 750; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 751; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 752; CI-NEXT: s_addc_u32 s1, s5, 0 753; CI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] 
754; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 755; CI-NEXT: v_mov_b32_e32 v13, s1 756; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 757; CI-NEXT: v_mov_b32_e32 v12, s0 758; CI-NEXT: s_add_u32 s0, s4, 16 759; CI-NEXT: s_addc_u32 s1, s5, 0 760; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 761; CI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] 762; CI-NEXT: s_nop 0 763; CI-NEXT: v_mov_b32_e32 v9, s1 764; CI-NEXT: v_mov_b32_e32 v8, s0 765; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 766; CI-NEXT: s_nop 0 767; CI-NEXT: v_mov_b32_e32 v4, s4 768; CI-NEXT: v_mov_b32_e32 v5, s5 769; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 770; CI-NEXT: s_endpgm 771; 772; VI-LABEL: extload_v8f16_to_v8f64_arg: 773; VI: ; %bb.0: 774; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10 775; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 776; VI-NEXT: s_waitcnt lgkmcnt(0) 777; VI-NEXT: s_lshr_b32 s6, s0, 16 778; VI-NEXT: s_lshr_b32 s8, s2, 16 779; VI-NEXT: s_lshr_b32 s9, s3, 16 780; VI-NEXT: v_cvt_f32_f16_e32 v0, s6 781; VI-NEXT: v_cvt_f32_f16_e32 v4, s8 782; VI-NEXT: v_cvt_f32_f16_e32 v5, s9 783; VI-NEXT: v_cvt_f32_f16_e32 v12, s3 784; VI-NEXT: s_lshr_b32 s7, s1, 16 785; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 786; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 787; VI-NEXT: v_cvt_f32_f16_e32 v8, s2 788; VI-NEXT: s_add_u32 s0, s4, 48 789; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v4 790; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5 791; VI-NEXT: v_cvt_f32_f16_e32 v4, s1 792; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 793; VI-NEXT: s_addc_u32 s1, s5, 0 794; VI-NEXT: v_cvt_f32_f16_e32 v1, s7 795; VI-NEXT: v_mov_b32_e32 v17, s1 796; VI-NEXT: v_mov_b32_e32 v16, s0 797; VI-NEXT: s_add_u32 s0, s4, 32 798; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 799; VI-NEXT: s_addc_u32 s1, s5, 0 800; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] 801; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v1 802; VI-NEXT: v_mov_b32_e32 v13, s1 803; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 804; VI-NEXT: v_mov_b32_e32 v12, s0 805; VI-NEXT: s_add_u32 s0, s4, 16 806; VI-NEXT: s_addc_u32 s1, s5, 0 
807; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 808; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] 809; VI-NEXT: s_nop 0 810; VI-NEXT: v_mov_b32_e32 v9, s1 811; VI-NEXT: v_mov_b32_e32 v8, s0 812; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 813; VI-NEXT: s_nop 0 814; VI-NEXT: v_mov_b32_e32 v4, s4 815; VI-NEXT: v_mov_b32_e32 v5, s5 816; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 817; VI-NEXT: s_endpgm 818; 819; GFX11-LABEL: extload_v8f16_to_v8f64_arg: 820; GFX11: ; %bb.0: 821; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x10 822; GFX11-NEXT: s_waitcnt lgkmcnt(0) 823; GFX11-NEXT: s_lshr_b32 s9, s3, 16 824; GFX11-NEXT: s_lshr_b32 s8, s2, 16 825; GFX11-NEXT: s_lshr_b32 s7, s1, 16 826; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s3 827; GFX11-NEXT: v_cvt_f32_f16_e32 v11, s9 828; GFX11-NEXT: s_lshr_b32 s6, s0, 16 829; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s2 830; GFX11-NEXT: v_cvt_f32_f16_e32 v10, s8 831; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s1 832; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s7 833; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0 834; GFX11-NEXT: v_cvt_f32_f16_e32 v16, s6 835; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v6 836; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v11 837; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 838; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 839; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v2 840; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 841; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 842; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 843; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 844; GFX11-NEXT: v_mov_b32_e32 v16, 0 845; GFX11-NEXT: s_waitcnt lgkmcnt(0) 846; GFX11-NEXT: s_clause 0x3 847; GFX11-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48 848; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 849; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 850; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] 851; GFX11-NEXT: s_endpgm 852 %ext = fpext <8 x half> %arg to <8 x double> 853 store <8 x double> %ext, ptr addrspace(1) %out 854 ret void 855} 856 857define 
amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 858; CIVI-LABEL: global_load_store_f16: 859; CIVI: ; %bb.0: 860; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 861; CIVI-NEXT: s_waitcnt lgkmcnt(0) 862; CIVI-NEXT: v_mov_b32_e32 v0, s2 863; CIVI-NEXT: v_mov_b32_e32 v1, s3 864; CIVI-NEXT: flat_load_ushort v2, v[0:1] 865; CIVI-NEXT: v_mov_b32_e32 v0, s0 866; CIVI-NEXT: v_mov_b32_e32 v1, s1 867; CIVI-NEXT: s_waitcnt vmcnt(0) 868; CIVI-NEXT: flat_store_short v[0:1], v2 869; CIVI-NEXT: s_endpgm 870; 871; GFX11-LABEL: global_load_store_f16: 872; GFX11: ; %bb.0: 873; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 874; GFX11-NEXT: v_mov_b32_e32 v0, 0 875; GFX11-NEXT: s_waitcnt lgkmcnt(0) 876; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] 877; GFX11-NEXT: s_waitcnt vmcnt(0) 878; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 879; GFX11-NEXT: s_endpgm 880 %val = load half, ptr addrspace(1) %in 881 store half %val, ptr addrspace(1) %out 882 ret void 883} 884 885define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 886; CIVI-LABEL: global_load_store_v2f16: 887; CIVI: ; %bb.0: 888; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 889; CIVI-NEXT: s_waitcnt lgkmcnt(0) 890; CIVI-NEXT: v_mov_b32_e32 v0, s2 891; CIVI-NEXT: v_mov_b32_e32 v1, s3 892; CIVI-NEXT: flat_load_dword v2, v[0:1] 893; CIVI-NEXT: v_mov_b32_e32 v0, s0 894; CIVI-NEXT: v_mov_b32_e32 v1, s1 895; CIVI-NEXT: s_waitcnt vmcnt(0) 896; CIVI-NEXT: flat_store_dword v[0:1], v2 897; CIVI-NEXT: s_endpgm 898; 899; GFX11-LABEL: global_load_store_v2f16: 900; GFX11: ; %bb.0: 901; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 902; GFX11-NEXT: v_mov_b32_e32 v0, 0 903; GFX11-NEXT: s_waitcnt lgkmcnt(0) 904; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 905; GFX11-NEXT: s_waitcnt vmcnt(0) 906; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 907; GFX11-NEXT: s_endpgm 908 %val = load <2 x half>, ptr addrspace(1) %in 909 store <2 x half> %val, ptr addrspace(1) %out 910 
; NOTE(review): @global_load_store_v4f16 declares its arguments as (%in, %out),
; the reverse of the (%out, %in) order used by the other global_load_store_*
; kernels in view. Its expected code therefore loads through s[0:1] and stores
; through s[2:3], where the siblings load through s[2:3] and store through
; s[0:1]. Changing the argument order would require regenerating the
; assertions with utils/update_llc_test_checks.py.
ret void 911} 912 913define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { 914; CIVI-LABEL: global_load_store_v4f16: 915; CIVI: ; %bb.0: 916; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 917; CIVI-NEXT: s_waitcnt lgkmcnt(0) 918; CIVI-NEXT: v_mov_b32_e32 v0, s0 919; CIVI-NEXT: v_mov_b32_e32 v1, s1 920; CIVI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 921; CIVI-NEXT: v_mov_b32_e32 v2, s2 922; CIVI-NEXT: v_mov_b32_e32 v3, s3 923; CIVI-NEXT: s_waitcnt vmcnt(0) 924; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 925; CIVI-NEXT: s_endpgm 926; 927; GFX11-LABEL: global_load_store_v4f16: 928; GFX11: ; %bb.0: 929; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 930; GFX11-NEXT: v_mov_b32_e32 v2, 0 931; GFX11-NEXT: s_waitcnt lgkmcnt(0) 932; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] 933; GFX11-NEXT: s_waitcnt vmcnt(0) 934; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] 935; GFX11-NEXT: s_endpgm 936 %val = load <4 x half>, ptr addrspace(1) %in 937 store <4 x half> %val, ptr addrspace(1) %out 938 ret void 939} 940 941define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 942; CIVI-LABEL: global_load_store_v8f16: 943; CIVI: ; %bb.0: 944; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 945; CIVI-NEXT: s_waitcnt lgkmcnt(0) 946; CIVI-NEXT: v_mov_b32_e32 v0, s2 947; CIVI-NEXT: v_mov_b32_e32 v1, s3 948; CIVI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 949; CIVI-NEXT: v_mov_b32_e32 v4, s0 950; CIVI-NEXT: v_mov_b32_e32 v5, s1 951; CIVI-NEXT: s_waitcnt vmcnt(0) 952; CIVI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 953; CIVI-NEXT: s_endpgm 954; 955; GFX11-LABEL: global_load_store_v8f16: 956; GFX11: ; %bb.0: 957; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 958; GFX11-NEXT: v_mov_b32_e32 v4, 0 959; GFX11-NEXT: s_waitcnt lgkmcnt(0) 960; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] 961; GFX11-NEXT: s_waitcnt vmcnt(0) 962; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 963; GFX11-NEXT: s_endpgm 964 %val = load 
; The text below finishes @global_load_store_v8f16 (one 128-bit load/store),
; then holds @global_extload_f16_to_f32 (load half, fpext to float, store) and
; the start of @global_extload_v2f16_to_v2f32. The scalar fpext is checked as a
; single v_cvt_f32_f16_e32 under both the CIVI and GFX11 prefixes.
<8 x half>, ptr addrspace(1) %in 965 store <8 x half> %val, ptr addrspace(1) %out 966 ret void 967} 968 969define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 970; CIVI-LABEL: global_extload_f16_to_f32: 971; CIVI: ; %bb.0: 972; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 973; CIVI-NEXT: s_waitcnt lgkmcnt(0) 974; CIVI-NEXT: v_mov_b32_e32 v0, s2 975; CIVI-NEXT: v_mov_b32_e32 v1, s3 976; CIVI-NEXT: flat_load_ushort v0, v[0:1] 977; CIVI-NEXT: v_mov_b32_e32 v1, s1 978; CIVI-NEXT: s_waitcnt vmcnt(0) 979; CIVI-NEXT: v_cvt_f32_f16_e32 v2, v0 980; CIVI-NEXT: v_mov_b32_e32 v0, s0 981; CIVI-NEXT: flat_store_dword v[0:1], v2 982; CIVI-NEXT: s_endpgm 983; 984; GFX11-LABEL: global_extload_f16_to_f32: 985; GFX11: ; %bb.0: 986; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 987; GFX11-NEXT: v_mov_b32_e32 v0, 0 988; GFX11-NEXT: s_waitcnt lgkmcnt(0) 989; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] 990; GFX11-NEXT: s_waitcnt vmcnt(0) 991; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 992; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 993; GFX11-NEXT: s_endpgm 994 %val = load half, ptr addrspace(1) %in 995 %cvt = fpext half %val to float 996 store float %cvt, ptr addrspace(1) %out 997 ret void 998} 999 1000define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1001; CI-LABEL: global_extload_v2f16_to_v2f32: 1002; CI: ; %bb.0: 1003; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1004; CI-NEXT: s_waitcnt lgkmcnt(0) 1005; CI-NEXT: v_mov_b32_e32 v0, s2 1006; CI-NEXT: v_mov_b32_e32 v1, s3 1007; CI-NEXT: flat_load_dword v1, v[0:1] 1008; CI-NEXT: v_mov_b32_e32 v2, s0 1009; CI-NEXT: v_mov_b32_e32 v3, s1 1010; CI-NEXT: s_waitcnt vmcnt(0) 1011; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 1012; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1013; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 1014; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1015; CI-NEXT: s_endpgm 1016; 1017; VI-LABEL: global_extload_v2f16_to_v2f32: 1018; VI: ; %bb.0: 1019; 
; This line holds the VI and GFX11 expectations of
; @global_extload_v2f16_to_v2f32 and the start of
; @global_extload_v3f16_to_v3f32. For the high half of a 32-bit word, the VI
; expectations on this line convert directly with v_cvt_f32_f16_sdwa using
; src0_sel:WORD_1, while the CI and GFX11 expectations shift right by 16
; (v_lshrrev_b32_e32) before converting.
VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1020; VI-NEXT: s_waitcnt lgkmcnt(0) 1021; VI-NEXT: v_mov_b32_e32 v0, s2 1022; VI-NEXT: v_mov_b32_e32 v1, s3 1023; VI-NEXT: flat_load_dword v1, v[0:1] 1024; VI-NEXT: v_mov_b32_e32 v2, s0 1025; VI-NEXT: v_mov_b32_e32 v3, s1 1026; VI-NEXT: s_waitcnt vmcnt(0) 1027; VI-NEXT: v_cvt_f32_f16_e32 v0, v1 1028; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1029; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1030; VI-NEXT: s_endpgm 1031; 1032; GFX11-LABEL: global_extload_v2f16_to_v2f32: 1033; GFX11: ; %bb.0: 1034; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1035; GFX11-NEXT: v_mov_b32_e32 v2, 0 1036; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1037; GFX11-NEXT: global_load_b32 v0, v2, s[2:3] 1038; GFX11-NEXT: s_waitcnt vmcnt(0) 1039; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1040; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 1041; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1042; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 1043; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1044; GFX11-NEXT: s_endpgm 1045 %val = load <2 x half>, ptr addrspace(1) %in 1046 %cvt = fpext <2 x half> %val to <2 x float> 1047 store <2 x float> %cvt, ptr addrspace(1) %out 1048 ret void 1049} 1050 1051define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1052; CI-LABEL: global_extload_v3f16_to_v3f32: 1053; CI: ; %bb.0: 1054; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1055; CI-NEXT: s_waitcnt lgkmcnt(0) 1056; CI-NEXT: v_mov_b32_e32 v0, s2 1057; CI-NEXT: v_mov_b32_e32 v1, s3 1058; CI-NEXT: flat_load_dwordx2 v[1:2], v[0:1] 1059; CI-NEXT: v_mov_b32_e32 v3, s0 1060; CI-NEXT: v_mov_b32_e32 v4, s1 1061; CI-NEXT: s_waitcnt vmcnt(0) 1062; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 1063; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1064; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 1065; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 1066; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 1067; CI-NEXT: s_endpgm 1068; 1069; VI-LABEL: 
; This line holds the VI and GFX11 expectations of
; @global_extload_v3f16_to_v3f32 (64-bit load, three conversions, one
; three-dword store) and the start of @global_extload_v4f16_to_v4f32
; (load <4 x half>, fpext to <4 x float>, one 128-bit store).
global_extload_v3f16_to_v3f32: 1070; VI: ; %bb.0: 1071; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1072; VI-NEXT: s_waitcnt lgkmcnt(0) 1073; VI-NEXT: v_mov_b32_e32 v0, s2 1074; VI-NEXT: v_mov_b32_e32 v1, s3 1075; VI-NEXT: flat_load_dwordx2 v[1:2], v[0:1] 1076; VI-NEXT: v_mov_b32_e32 v3, s0 1077; VI-NEXT: v_mov_b32_e32 v4, s1 1078; VI-NEXT: s_waitcnt vmcnt(0) 1079; VI-NEXT: v_cvt_f32_f16_e32 v0, v1 1080; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 1081; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1082; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 1083; VI-NEXT: s_endpgm 1084; 1085; GFX11-LABEL: global_extload_v3f16_to_v3f32: 1086; GFX11: ; %bb.0: 1087; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1088; GFX11-NEXT: v_mov_b32_e32 v3, 0 1089; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1090; GFX11-NEXT: global_load_b64 v[0:1], v3, s[2:3] 1091; GFX11-NEXT: s_waitcnt vmcnt(0) 1092; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 1093; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 1094; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 1095; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 1096; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v4 1097; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] 1098; GFX11-NEXT: s_endpgm 1099 %val = load <3 x half>, ptr addrspace(1) %in 1100 %cvt = fpext <3 x half> %val to <3 x float> 1101 store <3 x float> %cvt, ptr addrspace(1) %out 1102 ret void 1103} 1104 1105define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1106; CI-LABEL: global_extload_v4f16_to_v4f32: 1107; CI: ; %bb.0: 1108; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1109; CI-NEXT: s_waitcnt lgkmcnt(0) 1110; CI-NEXT: v_mov_b32_e32 v0, s2 1111; CI-NEXT: v_mov_b32_e32 v1, s3 1112; CI-NEXT: flat_load_dwordx2 v[3:4], v[0:1] 1113; CI-NEXT: v_mov_b32_e32 v5, s1 1114; CI-NEXT: s_waitcnt vmcnt(0) 1115; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 1116; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 1117; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 1118; CI-NEXT: v_cvt_f32_f16_e32 
; These lines finish @global_extload_v4f16_to_v4f32 and hold most of
; @global_extload_v8f16_to_v8f32. In the CI and VI expectations the
; <8 x float> result is written with two flat_store_dwordx4, one at %out + 16
; (address formed with s_add_u32/s_addc_u32) and one at %out.
v0, v3 1119; CI-NEXT: v_cvt_f32_f16_e32 v3, v1 1120; CI-NEXT: v_cvt_f32_f16_e32 v1, v4 1121; CI-NEXT: v_mov_b32_e32 v4, s0 1122; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1123; CI-NEXT: s_endpgm 1124; 1125; VI-LABEL: global_extload_v4f16_to_v4f32: 1126; VI: ; %bb.0: 1127; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1128; VI-NEXT: s_waitcnt lgkmcnt(0) 1129; VI-NEXT: v_mov_b32_e32 v0, s2 1130; VI-NEXT: v_mov_b32_e32 v1, s3 1131; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 1132; VI-NEXT: s_waitcnt vmcnt(0) 1133; VI-NEXT: v_cvt_f32_f16_e32 v0, v4 1134; VI-NEXT: v_cvt_f32_f16_e32 v2, v5 1135; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1136; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1137; VI-NEXT: v_mov_b32_e32 v4, s0 1138; VI-NEXT: v_mov_b32_e32 v5, s1 1139; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1140; VI-NEXT: s_endpgm 1141; 1142; GFX11-LABEL: global_extload_v4f16_to_v4f32: 1143; GFX11: ; %bb.0: 1144; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1145; GFX11-NEXT: v_mov_b32_e32 v4, 0 1146; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1147; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] 1148; GFX11-NEXT: s_waitcnt vmcnt(0) 1149; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 1150; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 1151; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 1152; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 1153; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 1154; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 1155; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v5 1156; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 1157; GFX11-NEXT: s_endpgm 1158 %val = load <4 x half>, ptr addrspace(1) %in 1159 %cvt = fpext <4 x half> %val to <4 x float> 1160 store <4 x float> %cvt, ptr addrspace(1) %out 1161 ret void 1162} 1163 1164define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1165; CI-LABEL: global_extload_v8f16_to_v8f32: 1166; CI: ; 
%bb.0: 1167; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1168; CI-NEXT: s_waitcnt lgkmcnt(0) 1169; CI-NEXT: v_mov_b32_e32 v0, s2 1170; CI-NEXT: v_mov_b32_e32 v1, s3 1171; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1172; CI-NEXT: s_add_u32 s2, s0, 16 1173; CI-NEXT: s_addc_u32 s3, s1, 0 1174; CI-NEXT: v_mov_b32_e32 v13, s1 1175; CI-NEXT: v_mov_b32_e32 v12, s0 1176; CI-NEXT: s_waitcnt vmcnt(0) 1177; CI-NEXT: v_cvt_f32_f16_e32 v10, v3 1178; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 1179; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1180; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1181; CI-NEXT: v_cvt_f32_f16_e32 v6, v1 1182; CI-NEXT: v_cvt_f32_f16_e32 v4, v0 1183; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1184; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1185; CI-NEXT: v_cvt_f32_f16_e32 v11, v3 1186; CI-NEXT: v_cvt_f32_f16_e32 v9, v2 1187; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 1188; CI-NEXT: v_cvt_f32_f16_e32 v5, v0 1189; CI-NEXT: v_mov_b32_e32 v0, s2 1190; CI-NEXT: v_mov_b32_e32 v1, s3 1191; CI-NEXT: flat_store_dwordx4 v[0:1], v[8:11] 1192; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] 1193; CI-NEXT: s_endpgm 1194; 1195; VI-LABEL: global_extload_v8f16_to_v8f32: 1196; VI: ; %bb.0: 1197; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1198; VI-NEXT: s_waitcnt lgkmcnt(0) 1199; VI-NEXT: v_mov_b32_e32 v0, s2 1200; VI-NEXT: v_mov_b32_e32 v1, s3 1201; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1202; VI-NEXT: s_add_u32 s2, s0, 16 1203; VI-NEXT: s_addc_u32 s3, s1, 0 1204; VI-NEXT: v_mov_b32_e32 v13, s1 1205; VI-NEXT: v_mov_b32_e32 v12, s0 1206; VI-NEXT: s_waitcnt vmcnt(0) 1207; VI-NEXT: v_cvt_f32_f16_e32 v10, v3 1208; VI-NEXT: v_cvt_f32_f16_e32 v8, v2 1209; VI-NEXT: v_cvt_f32_f16_sdwa v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1210; VI-NEXT: v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1211; VI-NEXT: v_cvt_f32_f16_e32 v6, v1 1212; VI-NEXT: v_cvt_f32_f16_e32 v4, v0 1213; VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1214; 
; These lines finish @global_extload_v8f16_to_v8f32 and hold most of
; @global_extload_v16f16_to_v16f32. The <16 x half> input is fetched with two
; 128-bit loads (%in and %in + 16). In the CI and VI expectations the
; <16 x float> result is written with four flat_store_dwordx4 at %out,
; %out + 16, %out + 32 and %out + 48, each non-zero offset being added with
; s_add_u32/s_addc_u32; the GFX11 load uses an instruction offset instead.
VI-NEXT: v_cvt_f32_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1215; VI-NEXT: v_mov_b32_e32 v0, s2 1216; VI-NEXT: v_mov_b32_e32 v1, s3 1217; VI-NEXT: flat_store_dwordx4 v[0:1], v[8:11] 1218; VI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] 1219; VI-NEXT: s_endpgm 1220; 1221; GFX11-LABEL: global_extload_v8f16_to_v8f32: 1222; GFX11: ; %bb.0: 1223; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1224; GFX11-NEXT: v_mov_b32_e32 v12, 0 1225; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1226; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] 1227; GFX11-NEXT: s_waitcnt vmcnt(0) 1228; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 1229; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 1230; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v1 1231; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v0 1232; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1233; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1234; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v3 1235; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v2 1236; GFX11-NEXT: v_cvt_f32_f16_e32 v11, v5 1237; GFX11-NEXT: v_cvt_f32_f16_e32 v9, v9 1238; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v1 1239; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v0 1240; GFX11-NEXT: s_clause 0x1 1241; GFX11-NEXT: global_store_b128 v12, v[8:11], s[0:1] offset:16 1242; GFX11-NEXT: global_store_b128 v12, v[4:7], s[0:1] 1243; GFX11-NEXT: s_endpgm 1244 %val = load <8 x half>, ptr addrspace(1) %in 1245 %cvt = fpext <8 x half> %val to <8 x float> 1246 store <8 x float> %cvt, ptr addrspace(1) %out 1247 ret void 1248} 1249 1250define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1251; CI-LABEL: global_extload_v16f16_to_v16f32: 1252; CI: ; %bb.0: 1253; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1254; CI-NEXT: s_waitcnt lgkmcnt(0) 1255; CI-NEXT: s_add_u32 s4, s2, 16 1256; CI-NEXT: v_mov_b32_e32 v5, s3 1257; CI-NEXT: s_addc_u32 s5, s3, 0 1258; CI-NEXT: v_mov_b32_e32 v0, s4 1259; CI-NEXT: v_mov_b32_e32 v4, s2 1260; CI-NEXT: v_mov_b32_e32 v1, s5 1261; CI-NEXT: flat_load_dwordx4 v[0:3], 
v[0:1] 1262; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 1263; CI-NEXT: s_add_u32 s2, s0, 16 1264; CI-NEXT: s_addc_u32 s3, s1, 0 1265; CI-NEXT: v_mov_b32_e32 v14, s3 1266; CI-NEXT: v_mov_b32_e32 v13, s2 1267; CI-NEXT: s_add_u32 s2, s0, 48 1268; CI-NEXT: s_addc_u32 s3, s1, 0 1269; CI-NEXT: s_waitcnt vmcnt(1) 1270; CI-NEXT: v_cvt_f32_f16_e32 v8, v1 1271; CI-NEXT: s_waitcnt vmcnt(0) 1272; CI-NEXT: v_cvt_f32_f16_e32 v11, v7 1273; CI-NEXT: v_cvt_f32_f16_e32 v9, v6 1274; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 1275; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 1276; CI-NEXT: v_cvt_f32_f16_e32 v12, v7 1277; CI-NEXT: v_cvt_f32_f16_e32 v10, v6 1278; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1279; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 1280; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 1281; CI-NEXT: flat_store_dwordx4 v[13:14], v[9:12] 1282; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 1283; CI-NEXT: v_cvt_f32_f16_e32 v12, v3 1284; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1285; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 1286; CI-NEXT: v_cvt_f32_f16_e32 v10, v2 1287; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 1288; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 1289; CI-NEXT: v_cvt_f32_f16_e32 v0, v4 1290; CI-NEXT: v_mov_b32_e32 v5, s1 1291; CI-NEXT: v_cvt_f32_f16_e32 v9, v1 1292; CI-NEXT: v_cvt_f32_f16_e32 v13, v3 1293; CI-NEXT: v_cvt_f32_f16_e32 v3, v16 1294; CI-NEXT: v_cvt_f32_f16_e32 v1, v17 1295; CI-NEXT: v_mov_b32_e32 v4, s0 1296; CI-NEXT: s_add_u32 s0, s0, 32 1297; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 1298; CI-NEXT: s_addc_u32 s1, s1, 0 1299; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 1300; CI-NEXT: v_mov_b32_e32 v15, s3 1301; CI-NEXT: v_mov_b32_e32 v17, s1 1302; CI-NEXT: v_mov_b32_e32 v14, s2 1303; CI-NEXT: v_mov_b32_e32 v16, s0 1304; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1305; CI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] 1306; CI-NEXT: flat_store_dwordx4 v[16:17], v[6:9] 1307; CI-NEXT: s_endpgm 1308; 1309; VI-LABEL: global_extload_v16f16_to_v16f32: 1310; VI: ; %bb.0: 1311; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1312; 
VI-NEXT: s_waitcnt lgkmcnt(0) 1313; VI-NEXT: v_mov_b32_e32 v0, s2 1314; VI-NEXT: v_mov_b32_e32 v1, s3 1315; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1316; VI-NEXT: s_add_u32 s2, s2, 16 1317; VI-NEXT: s_addc_u32 s3, s3, 0 1318; VI-NEXT: v_mov_b32_e32 v5, s3 1319; VI-NEXT: v_mov_b32_e32 v4, s2 1320; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 1321; VI-NEXT: s_add_u32 s2, s0, 16 1322; VI-NEXT: s_addc_u32 s3, s1, 0 1323; VI-NEXT: v_mov_b32_e32 v19, s3 1324; VI-NEXT: v_mov_b32_e32 v18, s2 1325; VI-NEXT: s_add_u32 s2, s0, 48 1326; VI-NEXT: v_mov_b32_e32 v17, s1 1327; VI-NEXT: s_addc_u32 s3, s1, 0 1328; VI-NEXT: v_mov_b32_e32 v16, s0 1329; VI-NEXT: s_add_u32 s0, s0, 32 1330; VI-NEXT: s_addc_u32 s1, s1, 0 1331; VI-NEXT: v_mov_b32_e32 v21, s3 1332; VI-NEXT: v_mov_b32_e32 v20, s2 1333; VI-NEXT: s_waitcnt vmcnt(1) 1334; VI-NEXT: v_cvt_f32_f16_e32 v14, v3 1335; VI-NEXT: v_cvt_f32_f16_e32 v12, v2 1336; VI-NEXT: v_cvt_f32_f16_sdwa v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1337; VI-NEXT: v_cvt_f32_f16_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1338; VI-NEXT: v_cvt_f32_f16_e32 v10, v1 1339; VI-NEXT: v_cvt_f32_f16_e32 v8, v0 1340; VI-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1341; VI-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1342; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15] 1343; VI-NEXT: s_waitcnt vmcnt(1) 1344; VI-NEXT: v_cvt_f32_f16_e32 v2, v5 1345; VI-NEXT: v_cvt_f32_f16_e32 v14, v7 1346; VI-NEXT: v_cvt_f32_f16_e32 v12, v6 1347; VI-NEXT: v_cvt_f32_f16_sdwa v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1348; VI-NEXT: v_cvt_f32_f16_sdwa v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1349; VI-NEXT: v_cvt_f32_f16_e32 v0, v4 1350; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1351; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1352; VI-NEXT: 
v_mov_b32_e32 v5, s1 1353; VI-NEXT: v_mov_b32_e32 v4, s0 1354; VI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] 1355; VI-NEXT: flat_store_dwordx4 v[20:21], v[12:15] 1356; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1357; VI-NEXT: s_endpgm 1358; 1359; GFX11-LABEL: global_extload_v16f16_to_v16f32: 1360; GFX11: ; %bb.0: 1361; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1362; GFX11-NEXT: v_mov_b32_e32 v20, 0 1363; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1364; GFX11-NEXT: s_clause 0x1 1365; GFX11-NEXT: global_load_b128 v[0:3], v20, s[2:3] 1366; GFX11-NEXT: global_load_b128 v[4:7], v20, s[2:3] offset:16 1367; GFX11-NEXT: s_waitcnt vmcnt(1) 1368; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v1 1369; GFX11-NEXT: s_waitcnt vmcnt(0) 1370; GFX11-NEXT: v_cvt_f32_f16_e32 v18, v7 1371; GFX11-NEXT: v_cvt_f32_f16_e32 v16, v6 1372; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 1373; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 1374; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v0 1375; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1376; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 1377; GFX11-NEXT: v_cvt_f32_f16_e32 v14, v3 1378; GFX11-NEXT: v_cvt_f32_f16_e32 v12, v2 1379; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1380; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v2 1381; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v5 1382; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v4 1383; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 1384; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 1385; GFX11-NEXT: v_cvt_f32_f16_e32 v19, v7 1386; GFX11-NEXT: v_cvt_f32_f16_e32 v17, v6 1387; GFX11-NEXT: v_cvt_f32_f16_e32 v11, v1 1388; GFX11-NEXT: v_cvt_f32_f16_e32 v15, v3 1389; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v5 1390; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v4 1391; GFX11-NEXT: v_cvt_f32_f16_e32 v13, v13 1392; GFX11-NEXT: v_cvt_f32_f16_e32 v9, v9 1393; GFX11-NEXT: s_clause 0x3 1394; GFX11-NEXT: global_store_b128 v20, v[16:19], s[0:1] offset:48 1395; GFX11-NEXT: global_store_b128 v20, v[0:3], s[0:1] offset:32 1396; GFX11-NEXT: global_store_b128 v20, v[12:15], s[0:1] offset:16 1397; GFX11-NEXT: 
; This line finishes @global_extload_v16f16_to_v16f32, then holds
; @global_extload_f16_to_f64 and the start of @global_extload_v2f16_to_v2f64.
; The half -> double extension is checked as a two-step conversion:
; v_cvt_f32_f16_e32 followed by v_cvt_f64_f32_e32.
global_store_b128 v20, v[8:11], s[0:1] 1398; GFX11-NEXT: s_endpgm 1399 %val = load <16 x half>, ptr addrspace(1) %in 1400 %cvt = fpext <16 x half> %val to <16 x float> 1401 store <16 x float> %cvt, ptr addrspace(1) %out 1402 ret void 1403} 1404 1405define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1406; CIVI-LABEL: global_extload_f16_to_f64: 1407; CIVI: ; %bb.0: 1408; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1409; CIVI-NEXT: s_waitcnt lgkmcnt(0) 1410; CIVI-NEXT: v_mov_b32_e32 v0, s2 1411; CIVI-NEXT: v_mov_b32_e32 v1, s3 1412; CIVI-NEXT: flat_load_ushort v0, v[0:1] 1413; CIVI-NEXT: v_mov_b32_e32 v2, s0 1414; CIVI-NEXT: v_mov_b32_e32 v3, s1 1415; CIVI-NEXT: s_waitcnt vmcnt(0) 1416; CIVI-NEXT: v_cvt_f32_f16_e32 v0, v0 1417; CIVI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 1418; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1419; CIVI-NEXT: s_endpgm 1420; 1421; GFX11-LABEL: global_extload_f16_to_f64: 1422; GFX11: ; %bb.0: 1423; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1424; GFX11-NEXT: v_mov_b32_e32 v2, 0 1425; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1426; GFX11-NEXT: global_load_u16 v0, v2, s[2:3] 1427; GFX11-NEXT: s_waitcnt vmcnt(0) 1428; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 1429; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1430; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 1431; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1432; GFX11-NEXT: s_endpgm 1433 %val = load half, ptr addrspace(1) %in 1434 %cvt = fpext half %val to double 1435 store double %cvt, ptr addrspace(1) %out 1436 ret void 1437} 1438 1439define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1440; CI-LABEL: global_extload_v2f16_to_v2f64: 1441; CI: ; %bb.0: 1442; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1443; CI-NEXT: s_waitcnt lgkmcnt(0) 1444; CI-NEXT: v_mov_b32_e32 v0, s2 1445; CI-NEXT: v_mov_b32_e32 v1, s3 1446; CI-NEXT: flat_load_dword v0, v[0:1] 1447; CI-NEXT: v_mov_b32_e32 v4, s0 1448; CI-NEXT: 
; These lines finish @global_extload_v2f16_to_v2f64 and hold the CI and VI
; expectations of @global_extload_v3f16_to_v3f64. In both, the <3 x double>
; result is written as one flat_store_dwordx4 at %out (elements 0 and 1) and
; one flat_store_dwordx2 at %out + 16 (element 2).
v_mov_b32_e32 v5, s1 1449; CI-NEXT: s_waitcnt vmcnt(0) 1450; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1451; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 1452; CI-NEXT: v_cvt_f32_f16_e32 v2, v1 1453; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 1454; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1455; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1456; CI-NEXT: s_endpgm 1457; 1458; VI-LABEL: global_extload_v2f16_to_v2f64: 1459; VI: ; %bb.0: 1460; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1461; VI-NEXT: s_waitcnt lgkmcnt(0) 1462; VI-NEXT: v_mov_b32_e32 v0, s2 1463; VI-NEXT: v_mov_b32_e32 v1, s3 1464; VI-NEXT: flat_load_dword v0, v[0:1] 1465; VI-NEXT: v_mov_b32_e32 v4, s0 1466; VI-NEXT: v_mov_b32_e32 v5, s1 1467; VI-NEXT: s_waitcnt vmcnt(0) 1468; VI-NEXT: v_cvt_f32_f16_e32 v1, v0 1469; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1470; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 1471; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1472; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1473; VI-NEXT: s_endpgm 1474; 1475; GFX11-LABEL: global_extload_v2f16_to_v2f64: 1476; GFX11: ; %bb.0: 1477; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1478; GFX11-NEXT: v_mov_b32_e32 v4, 0 1479; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1480; GFX11-NEXT: global_load_b32 v0, v4, s[2:3] 1481; GFX11-NEXT: s_waitcnt vmcnt(0) 1482; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1483; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 1484; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1485; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 1486; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 1487; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1488; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1489; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 1490; GFX11-NEXT: s_endpgm 1491 %val = load <2 x half>, ptr addrspace(1) %in 1492 %cvt = fpext <2 x half> %val to <2 x double> 1493 store <2 x double> %cvt, ptr addrspace(1) %out 1494 ret void 1495} 1496 1497define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr 
addrspace(1) %out, ptr addrspace(1) %in) #0 { 1498; CI-LABEL: global_extload_v3f16_to_v3f64: 1499; CI: ; %bb.0: 1500; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1501; CI-NEXT: s_waitcnt lgkmcnt(0) 1502; CI-NEXT: v_mov_b32_e32 v0, s2 1503; CI-NEXT: v_mov_b32_e32 v1, s3 1504; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1505; CI-NEXT: s_add_u32 s2, s0, 16 1506; CI-NEXT: s_addc_u32 s3, s1, 0 1507; CI-NEXT: v_mov_b32_e32 v7, s3 1508; CI-NEXT: v_mov_b32_e32 v6, s2 1509; CI-NEXT: s_waitcnt vmcnt(0) 1510; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 1511; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 1512; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 1513; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 1514; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 1515; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 1516; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1517; CI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] 1518; CI-NEXT: v_mov_b32_e32 v5, s1 1519; CI-NEXT: v_mov_b32_e32 v4, s0 1520; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1521; CI-NEXT: s_endpgm 1522; 1523; VI-LABEL: global_extload_v3f16_to_v3f64: 1524; VI: ; %bb.0: 1525; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1526; VI-NEXT: s_waitcnt lgkmcnt(0) 1527; VI-NEXT: v_mov_b32_e32 v0, s2 1528; VI-NEXT: v_mov_b32_e32 v1, s3 1529; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1530; VI-NEXT: s_add_u32 s2, s0, 16 1531; VI-NEXT: s_addc_u32 s3, s1, 0 1532; VI-NEXT: v_mov_b32_e32 v5, s1 1533; VI-NEXT: v_mov_b32_e32 v4, s0 1534; VI-NEXT: s_waitcnt vmcnt(0) 1535; VI-NEXT: v_cvt_f32_f16_e32 v3, v1 1536; VI-NEXT: v_cvt_f32_f16_e32 v2, v0 1537; VI-NEXT: v_cvt_f32_f16_sdwa v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1538; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 1539; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 1540; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 1541; VI-NEXT: v_mov_b32_e32 v9, s3 1542; VI-NEXT: v_mov_b32_e32 v8, s2 1543; VI-NEXT: flat_store_dwordx2 v[8:9], v[6:7] 1544; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1545; VI-NEXT: s_endpgm 1546; 1547; GFX11-LABEL: global_extload_v3f16_to_v3f64: 
; These lines hold the GFX11 expectations of @global_extload_v3f16_to_v3f64
; and most of @global_extload_v4f16_to_v4f64. In the CI and VI expectations
; the latter is a 64-bit load, four f16 -> f32 -> f64 conversions, and two
; flat_store_dwordx4 at %out + 16 and %out.
1548; GFX11: ; %bb.0: 1549; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1550; GFX11-NEXT: v_mov_b32_e32 v6, 0 1551; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1552; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] 1553; GFX11-NEXT: s_waitcnt vmcnt(0) 1554; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 1555; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 1556; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 1557; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 1558; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 1559; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v3 1560; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 1561; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 1562; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1563; GFX11-NEXT: s_clause 0x1 1564; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 1565; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] 1566; GFX11-NEXT: s_endpgm 1567 %val = load <3 x half>, ptr addrspace(1) %in 1568 %cvt = fpext <3 x half> %val to <3 x double> 1569 store <3 x double> %cvt, ptr addrspace(1) %out 1570 ret void 1571} 1572 1573define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1574; CI-LABEL: global_extload_v4f16_to_v4f64: 1575; CI: ; %bb.0: 1576; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1577; CI-NEXT: s_waitcnt lgkmcnt(0) 1578; CI-NEXT: v_mov_b32_e32 v0, s2 1579; CI-NEXT: v_mov_b32_e32 v1, s3 1580; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1581; CI-NEXT: s_add_u32 s2, s0, 16 1582; CI-NEXT: s_addc_u32 s3, s1, 0 1583; CI-NEXT: v_mov_b32_e32 v9, s1 1584; CI-NEXT: v_mov_b32_e32 v8, s0 1585; CI-NEXT: s_waitcnt vmcnt(0) 1586; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 1587; CI-NEXT: v_cvt_f32_f16_e32 v3, v1 1588; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 1589; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1590; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 1591; CI-NEXT: v_cvt_f32_f16_e32 v10, v0 1592; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3 1593; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 1594; 
CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 1595; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 1596; CI-NEXT: v_mov_b32_e32 v11, s3 1597; CI-NEXT: v_mov_b32_e32 v10, s2 1598; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 1599; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 1600; CI-NEXT: s_endpgm 1601; 1602; VI-LABEL: global_extload_v4f16_to_v4f64: 1603; VI: ; %bb.0: 1604; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1605; VI-NEXT: s_waitcnt lgkmcnt(0) 1606; VI-NEXT: v_mov_b32_e32 v0, s2 1607; VI-NEXT: v_mov_b32_e32 v1, s3 1608; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1609; VI-NEXT: s_add_u32 s2, s0, 16 1610; VI-NEXT: s_addc_u32 s3, s1, 0 1611; VI-NEXT: v_mov_b32_e32 v9, s1 1612; VI-NEXT: v_mov_b32_e32 v8, s0 1613; VI-NEXT: s_waitcnt vmcnt(0) 1614; VI-NEXT: v_cvt_f32_f16_e32 v3, v1 1615; VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1616; VI-NEXT: v_cvt_f32_f16_e32 v2, v0 1617; VI-NEXT: v_cvt_f32_f16_sdwa v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1618; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3 1619; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 1620; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 1621; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 1622; VI-NEXT: v_mov_b32_e32 v11, s3 1623; VI-NEXT: v_mov_b32_e32 v10, s2 1624; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 1625; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 1626; VI-NEXT: s_endpgm 1627; 1628; GFX11-LABEL: global_extload_v4f16_to_v4f64: 1629; GFX11: ; %bb.0: 1630; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1631; GFX11-NEXT: v_mov_b32_e32 v8, 0 1632; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1633; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] 1634; GFX11-NEXT: s_waitcnt vmcnt(0) 1635; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 1636; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 1637; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1 1638; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 1639; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 1640; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 1641; 
; These lines finish @global_extload_v4f16_to_v4f64 and hold most of
; @global_extload_v8f16_to_v8f64. In the CI and VI expectations one 128-bit
; load feeds eight f16 -> f32 -> f64 conversions, and the <8 x double> result
; is written with four flat_store_dwordx4 at %out + 48, %out + 32, %out + 16
; and %out.
GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 1642; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 1643; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 1644; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 1645; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 1646; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 1647; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 1648; GFX11-NEXT: s_clause 0x1 1649; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 1650; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] 1651; GFX11-NEXT: s_endpgm 1652 %val = load <4 x half>, ptr addrspace(1) %in 1653 %cvt = fpext <4 x half> %val to <4 x double> 1654 store <4 x double> %cvt, ptr addrspace(1) %out 1655 ret void 1656} 1657 1658define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1659; CI-LABEL: global_extload_v8f16_to_v8f64: 1660; CI: ; %bb.0: 1661; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1662; CI-NEXT: s_waitcnt lgkmcnt(0) 1663; CI-NEXT: v_mov_b32_e32 v0, s2 1664; CI-NEXT: v_mov_b32_e32 v1, s3 1665; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1666; CI-NEXT: s_add_u32 s2, s0, 48 1667; CI-NEXT: s_addc_u32 s3, s1, 0 1668; CI-NEXT: v_mov_b32_e32 v7, s3 1669; CI-NEXT: v_mov_b32_e32 v6, s2 1670; CI-NEXT: s_add_u32 s2, s0, 32 1671; CI-NEXT: v_mov_b32_e32 v13, s1 1672; CI-NEXT: s_addc_u32 s3, s1, 0 1673; CI-NEXT: v_mov_b32_e32 v12, s0 1674; CI-NEXT: s_add_u32 s0, s0, 16 1675; CI-NEXT: v_mov_b32_e32 v15, s3 1676; CI-NEXT: s_addc_u32 s1, s1, 0 1677; CI-NEXT: v_mov_b32_e32 v14, s2 1678; CI-NEXT: s_waitcnt vmcnt(0) 1679; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 1680; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 1681; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 1682; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 1683; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 1684; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 1685; CI-NEXT: v_cvt_f32_f16_e32 v10, v1 1686; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 1687; CI-NEXT: v_cvt_f32_f16_e32 v4, v0 1688; 
CI-NEXT: v_cvt_f32_f16_e32 v16, v5 1689; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 1690; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1691; CI-NEXT: v_cvt_f32_f16_e32 v17, v9 1692; CI-NEXT: v_cvt_f32_f16_e32 v18, v11 1693; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 1694; CI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] 1695; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 1696; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 1697; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16 1698; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 1699; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18 1700; CI-NEXT: v_mov_b32_e32 v17, s1 1701; CI-NEXT: v_mov_b32_e32 v16, s0 1702; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] 1703; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] 1704; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] 1705; CI-NEXT: s_endpgm 1706; 1707; VI-LABEL: global_extload_v8f16_to_v8f64: 1708; VI: ; %bb.0: 1709; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1710; VI-NEXT: s_waitcnt lgkmcnt(0) 1711; VI-NEXT: v_mov_b32_e32 v0, s2 1712; VI-NEXT: v_mov_b32_e32 v1, s3 1713; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1714; VI-NEXT: s_add_u32 s2, s0, 48 1715; VI-NEXT: s_addc_u32 s3, s1, 0 1716; VI-NEXT: v_mov_b32_e32 v8, s3 1717; VI-NEXT: v_mov_b32_e32 v7, s2 1718; VI-NEXT: s_add_u32 s2, s0, 32 1719; VI-NEXT: v_mov_b32_e32 v13, s1 1720; VI-NEXT: s_addc_u32 s3, s1, 0 1721; VI-NEXT: v_mov_b32_e32 v12, s0 1722; VI-NEXT: s_add_u32 s0, s0, 16 1723; VI-NEXT: v_mov_b32_e32 v15, s3 1724; VI-NEXT: s_addc_u32 s1, s1, 0 1725; VI-NEXT: v_mov_b32_e32 v14, s2 1726; VI-NEXT: s_waitcnt vmcnt(0) 1727; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 1728; VI-NEXT: v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1729; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 1730; VI-NEXT: v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1731; VI-NEXT: v_cvt_f32_f16_e32 v10, v1 1732; VI-NEXT: v_cvt_f32_f16_e32 v11, v2 1733; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0 1734; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 1735; VI-NEXT: v_cvt_f32_f16_sdwa v2, 
v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1736; VI-NEXT: v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1737; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 1738; VI-NEXT: flat_store_dwordx4 v[7:8], v[3:6] 1739; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v11 1740; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 1741; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 1742; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v17 1743; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 1744; VI-NEXT: v_mov_b32_e32 v17, s1 1745; VI-NEXT: v_mov_b32_e32 v16, s0 1746; VI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] 1747; VI-NEXT: flat_store_dwordx4 v[16:17], v[4:7] 1748; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] 1749; VI-NEXT: s_endpgm 1750; 1751; GFX11-LABEL: global_extload_v8f16_to_v8f64: 1752; GFX11: ; %bb.0: 1753; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1754; GFX11-NEXT: v_mov_b32_e32 v16, 0 1755; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1756; GFX11-NEXT: global_load_b128 v[0:3], v16, s[2:3] 1757; GFX11-NEXT: s_waitcnt vmcnt(0) 1758; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v0 1759; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 1760; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v1 1761; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3 1762; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v2 1763; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1764; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 1765; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 1766; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 1767; GFX11-NEXT: v_cvt_f32_f16_e32 v17, v5 1768; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 1769; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v9 1770; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 1771; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v7 1772; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v3 1773; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 1774; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v6 1775; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 1776; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 1777; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 1778; GFX11-NEXT: s_clause 0x3 1779; GFX11-NEXT: 
global_store_b128 v16, v[12:15], s[0:1] offset:48 1780; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 1781; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 1782; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] 1783; GFX11-NEXT: s_endpgm 1784 %val = load <8 x half>, ptr addrspace(1) %in 1785 %cvt = fpext <8 x half> %val to <8 x double> 1786 store <8 x double> %cvt, ptr addrspace(1) %out 1787 ret void 1788} 1789 1790define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 1791; CI-LABEL: global_extload_v16f16_to_v16f64: 1792; CI: ; %bb.0: 1793; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1794; CI-NEXT: s_waitcnt lgkmcnt(0) 1795; CI-NEXT: v_mov_b32_e32 v0, s2 1796; CI-NEXT: v_mov_b32_e32 v1, s3 1797; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1798; CI-NEXT: s_add_u32 s2, s2, 16 1799; CI-NEXT: s_addc_u32 s3, s3, 0 1800; CI-NEXT: v_mov_b32_e32 v5, s3 1801; CI-NEXT: v_mov_b32_e32 v4, s2 1802; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 1803; CI-NEXT: s_add_u32 s2, s0, 48 1804; CI-NEXT: s_addc_u32 s3, s1, 0 1805; CI-NEXT: v_mov_b32_e32 v15, s3 1806; CI-NEXT: v_mov_b32_e32 v14, s2 1807; CI-NEXT: s_add_u32 s2, s0, 32 1808; CI-NEXT: s_addc_u32 s3, s1, 0 1809; CI-NEXT: v_mov_b32_e32 v17, s3 1810; CI-NEXT: v_mov_b32_e32 v16, s2 1811; CI-NEXT: s_add_u32 s2, s0, 16 1812; CI-NEXT: s_addc_u32 s3, s1, 0 1813; CI-NEXT: v_mov_b32_e32 v19, s3 1814; CI-NEXT: v_mov_b32_e32 v18, s2 1815; CI-NEXT: s_add_u32 s2, s0, 0x70 1816; CI-NEXT: s_addc_u32 s3, s1, 0 1817; CI-NEXT: v_mov_b32_e32 v13, s1 1818; CI-NEXT: v_mov_b32_e32 v12, s0 1819; CI-NEXT: s_waitcnt vmcnt(1) 1820; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 1821; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 1822; CI-NEXT: v_cvt_f32_f16_e32 v10, v8 1823; CI-NEXT: s_waitcnt vmcnt(0) 1824; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 1825; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 1826; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 1827; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 1828; CI-NEXT: 
v_cvt_f32_f16_e32 v2, v2 1829; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 1830; CI-NEXT: v_cvt_f32_f16_e32 v21, v5 1831; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] 1832; CI-NEXT: v_mov_b32_e32 v15, s3 1833; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 1834; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 1835; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 1836; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 1837; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 1838; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] 1839; CI-NEXT: v_mov_b32_e32 v14, s2 1840; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 1841; CI-NEXT: v_cvt_f32_f16_e32 v9, v0 1842; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 1843; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1844; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 1845; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 1846; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 1847; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] 1848; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 1849; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 1850; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 1851; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 1852; CI-NEXT: s_add_u32 s2, s0, 0x60 1853; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 1854; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 1855; CI-NEXT: s_addc_u32 s3, s1, 0 1856; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 1857; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] 1858; CI-NEXT: v_mov_b32_e32 v17, s3 1859; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 1860; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 1861; CI-NEXT: v_cvt_f32_f16_e32 v7, v20 1862; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 1863; CI-NEXT: v_cvt_f32_f16_e32 v12, v5 1864; CI-NEXT: v_mov_b32_e32 v16, s2 1865; CI-NEXT: s_add_u32 s2, s0, 0x50 1866; CI-NEXT: s_addc_u32 s3, s1, 0 1867; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 1868; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 1869; CI-NEXT: s_add_u32 s0, s0, 64 1870; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] 1871; CI-NEXT: s_addc_u32 s1, s1, 0 1872; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 1873; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 1874; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 1875; CI-NEXT: 
v_cvt_f64_f32_e32 v[6:7], v12 1876; CI-NEXT: v_mov_b32_e32 v19, s3 1877; CI-NEXT: v_mov_b32_e32 v13, s1 1878; CI-NEXT: v_mov_b32_e32 v18, s2 1879; CI-NEXT: v_mov_b32_e32 v12, s0 1880; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] 1881; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] 1882; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] 1883; CI-NEXT: s_endpgm 1884; 1885; VI-LABEL: global_extload_v16f16_to_v16f64: 1886; VI: ; %bb.0: 1887; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1888; VI-NEXT: s_waitcnt lgkmcnt(0) 1889; VI-NEXT: v_mov_b32_e32 v0, s2 1890; VI-NEXT: v_mov_b32_e32 v1, s3 1891; VI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] 1892; VI-NEXT: s_add_u32 s2, s2, 16 1893; VI-NEXT: s_addc_u32 s3, s3, 0 1894; VI-NEXT: v_mov_b32_e32 v0, s2 1895; VI-NEXT: v_mov_b32_e32 v1, s3 1896; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1897; VI-NEXT: s_add_u32 s2, s0, 48 1898; VI-NEXT: s_addc_u32 s3, s1, 0 1899; VI-NEXT: v_mov_b32_e32 v14, s3 1900; VI-NEXT: v_mov_b32_e32 v13, s2 1901; VI-NEXT: s_add_u32 s2, s0, 32 1902; VI-NEXT: s_addc_u32 s3, s1, 0 1903; VI-NEXT: v_mov_b32_e32 v16, s3 1904; VI-NEXT: v_mov_b32_e32 v15, s2 1905; VI-NEXT: s_add_u32 s2, s0, 16 1906; VI-NEXT: s_addc_u32 s3, s1, 0 1907; VI-NEXT: v_mov_b32_e32 v18, s3 1908; VI-NEXT: v_mov_b32_e32 v17, s2 1909; VI-NEXT: s_add_u32 s2, s0, 0x50 1910; VI-NEXT: v_mov_b32_e32 v12, s1 1911; VI-NEXT: s_addc_u32 s3, s1, 0 1912; VI-NEXT: v_mov_b32_e32 v11, s0 1913; VI-NEXT: s_waitcnt vmcnt(1) 1914; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 1915; VI-NEXT: v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1916; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8 1917; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 1918; VI-NEXT: flat_store_dwordx4 v[13:14], v[7:10] 1919; VI-NEXT: s_nop 0 1920; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 1921; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1922; VI-NEXT: s_waitcnt vmcnt(1) 1923; VI-NEXT: v_cvt_f32_f16_e32 v10, v2 1924; VI-NEXT: v_mov_b32_e32 v14, s3 1925; 
VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 1926; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 1927; VI-NEXT: v_mov_b32_e32 v13, s2 1928; VI-NEXT: s_add_u32 s2, s0, 64 1929; VI-NEXT: s_addc_u32 s3, s1, 0 1930; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] 1931; VI-NEXT: v_mov_b32_e32 v16, s3 1932; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 1933; VI-NEXT: v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1934; VI-NEXT: v_cvt_f32_f16_e32 v8, v4 1935; VI-NEXT: v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1936; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 1937; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 1938; VI-NEXT: v_mov_b32_e32 v15, s2 1939; VI-NEXT: s_add_u32 s2, s0, 0x70 1940; VI-NEXT: s_addc_u32 s3, s1, 0 1941; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] 1942; VI-NEXT: v_cvt_f32_f16_sdwa v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1943; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 1944; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v9 1945; VI-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1946; VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1947; VI-NEXT: v_cvt_f32_f16_e32 v2, v1 1948; VI-NEXT: flat_store_dwordx4 v[11:12], v[4:7] 1949; VI-NEXT: v_cvt_f32_f16_sdwa v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1950; VI-NEXT: v_cvt_f32_f16_e32 v7, v3 1951; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v9 1952; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 1953; VI-NEXT: v_cvt_f64_f32_e32 v[1:2], v2 1954; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v10 1955; VI-NEXT: v_cvt_f64_f32_e32 v[11:12], v11 1956; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 1957; VI-NEXT: s_add_u32 s0, s0, 0x60 1958; VI-NEXT: flat_store_dwordx4 v[13:14], v[1:4] 1959; VI-NEXT: s_addc_u32 s1, s1, 0 1960; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 1961; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 1962; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8 1963; VI-NEXT: v_mov_b32_e32 v20, s3 1964; VI-NEXT: v_mov_b32_e32 v14, s1 1965; VI-NEXT: 
v_mov_b32_e32 v19, s2 1966; VI-NEXT: v_mov_b32_e32 v13, s0 1967; VI-NEXT: flat_store_dwordx4 v[15:16], v[9:12] 1968; VI-NEXT: flat_store_dwordx4 v[19:20], v[0:3] 1969; VI-NEXT: flat_store_dwordx4 v[13:14], v[5:8] 1970; VI-NEXT: s_endpgm 1971; 1972; GFX11-LABEL: global_extload_v16f16_to_v16f64: 1973; GFX11: ; %bb.0: 1974; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 1975; GFX11-NEXT: v_mov_b32_e32 v32, 0 1976; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1977; GFX11-NEXT: s_clause 0x1 1978; GFX11-NEXT: global_load_b128 v[0:3], v32, s[2:3] 1979; GFX11-NEXT: global_load_b128 v[4:7], v32, s[2:3] offset:16 1980; GFX11-NEXT: s_waitcnt vmcnt(1) 1981; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v1 1982; GFX11-NEXT: s_waitcnt vmcnt(0) 1983; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v5 1984; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v1 1985; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v4 1986; GFX11-NEXT: v_cvt_f32_f16_e32 v15, v7 1987; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 1988; GFX11-NEXT: v_cvt_f32_f16_e32 v14, v6 1989; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 1990; GFX11-NEXT: v_cvt_f32_f16_e32 v13, v3 1991; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1992; GFX11-NEXT: v_cvt_f32_f16_e32 v12, v2 1993; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1994; GFX11-NEXT: v_cvt_f32_f16_e32 v18, v4 1995; GFX11-NEXT: v_cvt_f32_f16_e32 v22, v5 1996; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 1997; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v23 1998; GFX11-NEXT: v_cvt_f32_f16_e32 v34, v11 1999; GFX11-NEXT: v_cvt_f32_f16_e32 v11, v19 2000; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 2001; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v7 2002; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v6 2003; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v0 2004; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 2005; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 2006; GFX11-NEXT: v_cvt_f64_f32_e32 v[28:29], v22 2007; GFX11-NEXT: v_cvt_f64_f32_e32 v[30:31], v10 2008; GFX11-NEXT: v_cvt_f64_f32_e32 v[24:25], v18 2009; GFX11-NEXT: v_cvt_f64_f32_e32 v[26:27], v11 2010; GFX11-NEXT: v_cvt_f32_f16_e32 v33, v9 
2011; GFX11-NEXT: v_cvt_f64_f32_e32 v[20:21], v15 2012; GFX11-NEXT: v_cvt_f64_f32_e32 v[22:23], v7 2013; GFX11-NEXT: v_cvt_f64_f32_e32 v[16:17], v14 2014; GFX11-NEXT: v_cvt_f64_f32_e32 v[18:19], v6 2015; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v8 2016; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 2017; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v13 2018; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v3 2019; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 2020; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v34 2021; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v33 2022; GFX11-NEXT: s_clause 0x7 2023; GFX11-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:80 2024; GFX11-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:64 2025; GFX11-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:112 2026; GFX11-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:96 2027; GFX11-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48 2028; GFX11-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32 2029; GFX11-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16 2030; GFX11-NEXT: global_store_b128 v32, v[0:3], s[0:1] 2031; GFX11-NEXT: s_endpgm 2032 %val = load <16 x half>, ptr addrspace(1) %in 2033 %cvt = fpext <16 x half> %val to <16 x double> 2034 store <16 x double> %cvt, ptr addrspace(1) %out 2035 ret void 2036} 2037 2038define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 2039; CIVI-LABEL: global_truncstore_f32_to_f16: 2040; CIVI: ; %bb.0: 2041; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2042; CIVI-NEXT: s_waitcnt lgkmcnt(0) 2043; CIVI-NEXT: v_mov_b32_e32 v0, s2 2044; CIVI-NEXT: v_mov_b32_e32 v1, s3 2045; CIVI-NEXT: flat_load_dword v0, v[0:1] 2046; CIVI-NEXT: v_mov_b32_e32 v1, s1 2047; CIVI-NEXT: s_waitcnt vmcnt(0) 2048; CIVI-NEXT: v_cvt_f16_f32_e32 v2, v0 2049; CIVI-NEXT: v_mov_b32_e32 v0, s0 2050; CIVI-NEXT: flat_store_short v[0:1], v2 2051; CIVI-NEXT: s_endpgm 2052; 2053; GFX11-LABEL: global_truncstore_f32_to_f16: 2054; GFX11: ; %bb.0: 
2055; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2056; GFX11-NEXT: v_mov_b32_e32 v0, 0 2057; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2058; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 2059; GFX11-NEXT: s_waitcnt vmcnt(0) 2060; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 2061; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 2062; GFX11-NEXT: s_endpgm 2063 %val = load float, ptr addrspace(1) %in 2064 %cvt = fptrunc float %val to half 2065 store half %cvt, ptr addrspace(1) %out 2066 ret void 2067} 2068 2069define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 2070; CI-LABEL: global_truncstore_v2f32_to_v2f16: 2071; CI: ; %bb.0: 2072; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2073; CI-NEXT: s_waitcnt lgkmcnt(0) 2074; CI-NEXT: v_mov_b32_e32 v0, s2 2075; CI-NEXT: v_mov_b32_e32 v1, s3 2076; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 2077; CI-NEXT: s_waitcnt vmcnt(0) 2078; CI-NEXT: v_cvt_f16_f32_e32 v2, v1 2079; CI-NEXT: v_cvt_f16_f32_e32 v3, v0 2080; CI-NEXT: v_mov_b32_e32 v0, s0 2081; CI-NEXT: v_mov_b32_e32 v1, s1 2082; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2083; CI-NEXT: v_or_b32_e32 v2, v3, v2 2084; CI-NEXT: flat_store_dword v[0:1], v2 2085; CI-NEXT: s_endpgm 2086; 2087; VI-LABEL: global_truncstore_v2f32_to_v2f16: 2088; VI: ; %bb.0: 2089; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2090; VI-NEXT: s_waitcnt lgkmcnt(0) 2091; VI-NEXT: v_mov_b32_e32 v0, s2 2092; VI-NEXT: v_mov_b32_e32 v1, s3 2093; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 2094; VI-NEXT: s_waitcnt vmcnt(0) 2095; VI-NEXT: v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2096; VI-NEXT: v_cvt_f16_f32_e32 v3, v0 2097; VI-NEXT: v_mov_b32_e32 v0, s0 2098; VI-NEXT: v_mov_b32_e32 v1, s1 2099; VI-NEXT: v_or_b32_e32 v2, v3, v2 2100; VI-NEXT: flat_store_dword v[0:1], v2 2101; VI-NEXT: s_endpgm 2102; 2103; GFX11-LABEL: global_truncstore_v2f32_to_v2f16: 2104; GFX11: ; %bb.0: 2105; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2106; GFX11-NEXT: v_mov_b32_e32 
v2, 0 2107; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2108; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] 2109; GFX11-NEXT: s_waitcnt vmcnt(0) 2110; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 2111; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 2112; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2113; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 2114; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] 2115; GFX11-NEXT: s_endpgm 2116 %val = load <2 x float>, ptr addrspace(1) %in 2117 %cvt = fptrunc <2 x float> %val to <2 x half> 2118 store <2 x half> %cvt, ptr addrspace(1) %out 2119 ret void 2120} 2121 2122define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 2123; CI-LABEL: global_truncstore_v3f32_to_v3f16: 2124; CI: ; %bb.0: 2125; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2126; CI-NEXT: s_waitcnt lgkmcnt(0) 2127; CI-NEXT: v_mov_b32_e32 v0, s2 2128; CI-NEXT: v_mov_b32_e32 v1, s3 2129; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1] 2130; CI-NEXT: s_add_u32 s2, s0, 4 2131; CI-NEXT: s_addc_u32 s3, s1, 0 2132; CI-NEXT: s_waitcnt vmcnt(0) 2133; CI-NEXT: v_cvt_f16_f32_e32 v3, v1 2134; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 2135; CI-NEXT: v_cvt_f16_f32_e32 v4, v0 2136; CI-NEXT: v_mov_b32_e32 v0, s2 2137; CI-NEXT: v_mov_b32_e32 v1, s3 2138; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2139; CI-NEXT: flat_store_short v[0:1], v2 2140; CI-NEXT: v_mov_b32_e32 v0, s0 2141; CI-NEXT: v_or_b32_e32 v2, v4, v3 2142; CI-NEXT: v_mov_b32_e32 v1, s1 2143; CI-NEXT: flat_store_dword v[0:1], v2 2144; CI-NEXT: s_endpgm 2145; 2146; VI-LABEL: global_truncstore_v3f32_to_v3f16: 2147; VI: ; %bb.0: 2148; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2149; VI-NEXT: s_waitcnt lgkmcnt(0) 2150; VI-NEXT: v_mov_b32_e32 v0, s2 2151; VI-NEXT: v_mov_b32_e32 v1, s3 2152; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1] 2153; VI-NEXT: s_add_u32 s2, s0, 4 2154; VI-NEXT: s_addc_u32 s3, s1, 0 2155; VI-NEXT: s_waitcnt vmcnt(0) 2156; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 2157; VI-NEXT: v_cvt_f16_f32_sdwa v3, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2158; VI-NEXT: v_cvt_f16_f32_e32 v4, v0 2159; VI-NEXT: v_mov_b32_e32 v0, s2 2160; VI-NEXT: v_mov_b32_e32 v1, s3 2161; VI-NEXT: flat_store_short v[0:1], v2 2162; VI-NEXT: v_mov_b32_e32 v0, s0 2163; VI-NEXT: v_or_b32_e32 v3, v4, v3 2164; VI-NEXT: v_mov_b32_e32 v1, s1 2165; VI-NEXT: flat_store_dword v[0:1], v3 2166; VI-NEXT: s_endpgm 2167; 2168; GFX11-LABEL: global_truncstore_v3f32_to_v3f16: 2169; GFX11: ; %bb.0: 2170; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2171; GFX11-NEXT: v_mov_b32_e32 v3, 0 2172; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2173; GFX11-NEXT: global_load_b96 v[0:2], v3, s[2:3] 2174; GFX11-NEXT: s_waitcnt vmcnt(0) 2175; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 2176; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 2177; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 2178; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 2179; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 2180; GFX11-NEXT: s_clause 0x1 2181; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] offset:4 2182; GFX11-NEXT: global_store_b32 v3, v0, s[0:1] 2183; GFX11-NEXT: s_endpgm 2184 %val = load <3 x float>, ptr addrspace(1) %in 2185 %cvt = fptrunc <3 x float> %val to <3 x half> 2186 store <3 x half> %cvt, ptr addrspace(1) %out 2187 ret void 2188} 2189 2190define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 2191; CI-LABEL: global_truncstore_v4f32_to_v4f16: 2192; CI: ; %bb.0: 2193; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2194; CI-NEXT: s_waitcnt lgkmcnt(0) 2195; CI-NEXT: v_mov_b32_e32 v0, s2 2196; CI-NEXT: v_mov_b32_e32 v1, s3 2197; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2198; CI-NEXT: v_mov_b32_e32 v4, s0 2199; CI-NEXT: v_mov_b32_e32 v5, s1 2200; CI-NEXT: s_waitcnt vmcnt(0) 2201; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 2202; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 2203; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 2204; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2205; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2206; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 2207; 
CI-NEXT: v_or_b32_e32 v1, v2, v3 2208; CI-NEXT: v_or_b32_e32 v0, v0, v6 2209; CI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] 2210; CI-NEXT: s_endpgm 2211; 2212; VI-LABEL: global_truncstore_v4f32_to_v4f16: 2213; VI: ; %bb.0: 2214; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2215; VI-NEXT: s_waitcnt lgkmcnt(0) 2216; VI-NEXT: v_mov_b32_e32 v0, s2 2217; VI-NEXT: v_mov_b32_e32 v1, s3 2218; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2219; VI-NEXT: s_waitcnt vmcnt(0) 2220; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2221; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 2222; VI-NEXT: v_cvt_f16_f32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2223; VI-NEXT: v_cvt_f16_f32_e32 v5, v0 2224; VI-NEXT: v_mov_b32_e32 v0, s0 2225; VI-NEXT: v_mov_b32_e32 v1, s1 2226; VI-NEXT: v_or_b32_e32 v3, v2, v3 2227; VI-NEXT: v_or_b32_e32 v2, v5, v4 2228; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 2229; VI-NEXT: s_endpgm 2230; 2231; GFX11-LABEL: global_truncstore_v4f32_to_v4f16: 2232; GFX11: ; %bb.0: 2233; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2234; GFX11-NEXT: v_mov_b32_e32 v4, 0 2235; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2236; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] 2237; GFX11-NEXT: s_waitcnt vmcnt(0) 2238; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 2239; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 2240; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v1 2241; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 2242; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 2243; GFX11-NEXT: v_pack_b32_f16 v1, v2, v3 2244; GFX11-NEXT: v_pack_b32_f16 v0, v0, v5 2245; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] 2246; GFX11-NEXT: s_endpgm 2247 %val = load <4 x float>, ptr addrspace(1) %in 2248 %cvt = fptrunc <4 x float> %val to <4 x half> 2249 store <4 x half> %cvt, ptr addrspace(1) %out 2250 ret void 2251} 2252 2253define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 2254; CI-LABEL: 
global_truncstore_v8f32_to_v8f16: 2255; CI: ; %bb.0: 2256; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2257; CI-NEXT: s_waitcnt lgkmcnt(0) 2258; CI-NEXT: v_mov_b32_e32 v0, s2 2259; CI-NEXT: v_mov_b32_e32 v1, s3 2260; CI-NEXT: s_add_u32 s2, s2, 16 2261; CI-NEXT: s_addc_u32 s3, s3, 0 2262; CI-NEXT: v_mov_b32_e32 v5, s3 2263; CI-NEXT: v_mov_b32_e32 v4, s2 2264; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2265; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 2266; CI-NEXT: v_mov_b32_e32 v8, s0 2267; CI-NEXT: v_mov_b32_e32 v9, s1 2268; CI-NEXT: s_waitcnt vmcnt(1) 2269; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 2270; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 2271; CI-NEXT: s_waitcnt vmcnt(0) 2272; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 2273; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 2274; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 2275; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2276; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 2277; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 2278; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2279; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v1 2280; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 2281; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 2282; CI-NEXT: v_or_b32_e32 v1, v2, v3 2283; CI-NEXT: v_or_b32_e32 v0, v0, v10 2284; CI-NEXT: v_or_b32_e32 v3, v6, v7 2285; CI-NEXT: v_or_b32_e32 v2, v4, v5 2286; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 2287; CI-NEXT: s_endpgm 2288; 2289; VI-LABEL: global_truncstore_v8f32_to_v8f16: 2290; VI: ; %bb.0: 2291; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2292; VI-NEXT: s_waitcnt lgkmcnt(0) 2293; VI-NEXT: v_mov_b32_e32 v0, s2 2294; VI-NEXT: v_mov_b32_e32 v1, s3 2295; VI-NEXT: s_add_u32 s2, s2, 16 2296; VI-NEXT: s_addc_u32 s3, s3, 0 2297; VI-NEXT: v_mov_b32_e32 v5, s3 2298; VI-NEXT: v_mov_b32_e32 v4, s2 2299; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2300; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 2301; VI-NEXT: v_mov_b32_e32 v8, s0 2302; VI-NEXT: v_mov_b32_e32 v9, s1 2303; VI-NEXT: s_waitcnt vmcnt(1) 2304; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2305; 
VI-NEXT: v_cvt_f16_f32_e32 v2, v2 2306; VI-NEXT: v_cvt_f16_f32_sdwa v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2307; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 2308; VI-NEXT: s_waitcnt vmcnt(0) 2309; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2310; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 2311; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2312; VI-NEXT: v_cvt_f16_f32_e32 v4, v4 2313; VI-NEXT: v_or_b32_e32 v1, v2, v3 2314; VI-NEXT: v_or_b32_e32 v0, v0, v10 2315; VI-NEXT: v_or_b32_e32 v3, v6, v7 2316; VI-NEXT: v_or_b32_e32 v2, v4, v5 2317; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 2318; VI-NEXT: s_endpgm 2319; 2320; GFX11-LABEL: global_truncstore_v8f32_to_v8f16: 2321; GFX11: ; %bb.0: 2322; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2323; GFX11-NEXT: v_mov_b32_e32 v8, 0 2324; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2325; GFX11-NEXT: s_clause 0x1 2326; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3] offset:16 2327; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] 2328; GFX11-NEXT: s_waitcnt vmcnt(1) 2329; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 2330; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 2331; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 2332; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 2333; GFX11-NEXT: s_waitcnt vmcnt(0) 2334; GFX11-NEXT: v_cvt_f16_f32_e32 v7, v7 2335; GFX11-NEXT: v_cvt_f16_f32_e32 v6, v6 2336; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v5 2337; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 2338; GFX11-NEXT: v_pack_b32_f16 v3, v2, v3 2339; GFX11-NEXT: v_pack_b32_f16 v2, v0, v1 2340; GFX11-NEXT: v_pack_b32_f16 v1, v6, v7 2341; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 2342; GFX11-NEXT: v_pack_b32_f16 v0, v4, v5 2343; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] 2344; GFX11-NEXT: s_endpgm 2345 %val = load <8 x float>, ptr addrspace(1) %in 2346 %cvt = fptrunc <8 x float> %val to <8 x half> 2347 store <8 x half> %cvt, ptr addrspace(1) %out 2348 ret void 2349} 2350 2351define amdgpu_kernel void 
@global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { 2352; CI-LABEL: global_truncstore_v16f32_to_v16f16: 2353; CI: ; %bb.0: 2354; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2355; CI-NEXT: s_waitcnt lgkmcnt(0) 2356; CI-NEXT: s_add_u32 s4, s2, 32 2357; CI-NEXT: s_addc_u32 s5, s3, 0 2358; CI-NEXT: v_mov_b32_e32 v0, s4 2359; CI-NEXT: v_mov_b32_e32 v1, s5 2360; CI-NEXT: s_add_u32 s4, s2, 48 2361; CI-NEXT: s_addc_u32 s5, s3, 0 2362; CI-NEXT: v_mov_b32_e32 v9, s3 2363; CI-NEXT: v_mov_b32_e32 v4, s4 2364; CI-NEXT: v_mov_b32_e32 v8, s2 2365; CI-NEXT: s_add_u32 s2, s2, 16 2366; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2367; CI-NEXT: v_mov_b32_e32 v5, s5 2368; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 2369; CI-NEXT: s_addc_u32 s3, s3, 0 2370; CI-NEXT: v_mov_b32_e32 v13, s3 2371; CI-NEXT: v_mov_b32_e32 v12, s2 2372; CI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] 2373; CI-NEXT: flat_load_dwordx4 v[12:15], v[12:13] 2374; CI-NEXT: s_add_u32 s2, s0, 16 2375; CI-NEXT: s_addc_u32 s3, s1, 0 2376; CI-NEXT: s_waitcnt vmcnt(3) 2377; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 2378; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 2379; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 2380; CI-NEXT: s_waitcnt vmcnt(2) 2381; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 2382; CI-NEXT: v_cvt_f16_f32_e32 v16, v5 2383; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 2384; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 2385; CI-NEXT: v_cvt_f16_f32_e32 v17, v4 2386; CI-NEXT: s_waitcnt vmcnt(1) 2387; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 2388; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 2389; CI-NEXT: s_waitcnt vmcnt(0) 2390; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 2391; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 2392; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 2393; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 2394; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 2395; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 2396; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2397; CI-NEXT: v_mov_b32_e32 v5, s3 2398; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 2399; CI-NEXT: v_or_b32_e32 v1, v2, v3 2400; CI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v7 2401; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 2402; CI-NEXT: v_mov_b32_e32 v4, s2 2403; CI-NEXT: v_or_b32_e32 v0, v0, v18 2404; CI-NEXT: v_or_b32_e32 v3, v6, v2 2405; CI-NEXT: v_or_b32_e32 v2, v17, v7 2406; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 2407; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 2408; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v15 2409; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v13 2410; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2411; CI-NEXT: v_mov_b32_e32 v5, s1 2412; CI-NEXT: v_or_b32_e32 v1, v10, v6 2413; CI-NEXT: v_or_b32_e32 v0, v8, v7 2414; CI-NEXT: v_or_b32_e32 v3, v14, v9 2415; CI-NEXT: v_or_b32_e32 v2, v12, v11 2416; CI-NEXT: v_mov_b32_e32 v4, s0 2417; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2418; CI-NEXT: s_endpgm 2419; 2420; VI-LABEL: global_truncstore_v16f32_to_v16f16: 2421; VI: ; %bb.0: 2422; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2423; VI-NEXT: s_waitcnt lgkmcnt(0) 2424; VI-NEXT: s_add_u32 s4, s2, 32 2425; VI-NEXT: s_addc_u32 s5, s3, 0 2426; VI-NEXT: v_mov_b32_e32 v0, s4 2427; VI-NEXT: v_mov_b32_e32 v1, s5 2428; VI-NEXT: s_add_u32 s4, s2, 48 2429; VI-NEXT: s_addc_u32 s5, s3, 0 2430; VI-NEXT: v_mov_b32_e32 v9, s3 2431; VI-NEXT: v_mov_b32_e32 v4, s4 2432; VI-NEXT: v_mov_b32_e32 v8, s2 2433; VI-NEXT: s_add_u32 s2, s2, 16 2434; VI-NEXT: v_mov_b32_e32 v5, s5 2435; VI-NEXT: s_addc_u32 s3, s3, 0 2436; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2437; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 2438; VI-NEXT: v_mov_b32_e32 v13, s3 2439; VI-NEXT: v_mov_b32_e32 v12, s2 2440; VI-NEXT: flat_load_dwordx4 v[8:11], v[8:9] 2441; VI-NEXT: flat_load_dwordx4 v[12:15], v[12:13] 2442; VI-NEXT: s_add_u32 s2, s0, 16 2443; VI-NEXT: s_addc_u32 s3, s1, 0 2444; VI-NEXT: s_waitcnt vmcnt(3) 2445; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2446; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 2447; VI-NEXT: v_cvt_f16_f32_sdwa v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2448; VI-NEXT: v_cvt_f16_f32_e32 v0, 
v0 2449; VI-NEXT: s_waitcnt vmcnt(2) 2450; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2451; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 2452; VI-NEXT: v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2453; VI-NEXT: v_cvt_f16_f32_e32 v18, v4 2454; VI-NEXT: s_waitcnt vmcnt(1) 2455; VI-NEXT: v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2456; VI-NEXT: v_cvt_f16_f32_e32 v10, v10 2457; VI-NEXT: v_cvt_f16_f32_sdwa v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2458; VI-NEXT: v_cvt_f16_f32_e32 v8, v8 2459; VI-NEXT: s_waitcnt vmcnt(0) 2460; VI-NEXT: v_cvt_f16_f32_sdwa v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2461; VI-NEXT: v_cvt_f16_f32_e32 v14, v14 2462; VI-NEXT: v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2463; VI-NEXT: v_cvt_f16_f32_e32 v12, v12 2464; VI-NEXT: v_mov_b32_e32 v5, s3 2465; VI-NEXT: v_mov_b32_e32 v4, s2 2466; VI-NEXT: v_or_b32_e32 v1, v2, v3 2467; VI-NEXT: v_or_b32_e32 v0, v0, v16 2468; VI-NEXT: v_or_b32_e32 v3, v6, v7 2469; VI-NEXT: v_or_b32_e32 v2, v18, v17 2470; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2471; VI-NEXT: v_mov_b32_e32 v5, s1 2472; VI-NEXT: v_or_b32_e32 v1, v10, v11 2473; VI-NEXT: v_or_b32_e32 v0, v8, v9 2474; VI-NEXT: v_or_b32_e32 v3, v14, v15 2475; VI-NEXT: v_or_b32_e32 v2, v12, v13 2476; VI-NEXT: v_mov_b32_e32 v4, s0 2477; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2478; VI-NEXT: s_endpgm 2479; 2480; GFX11-LABEL: global_truncstore_v16f32_to_v16f16: 2481; GFX11: ; %bb.0: 2482; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 2483; GFX11-NEXT: v_mov_b32_e32 v16, 0 2484; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2485; GFX11-NEXT: s_clause 0x3 2486; GFX11-NEXT: global_load_b128 v[0:3], v16, s[2:3] offset:16 2487; GFX11-NEXT: global_load_b128 v[4:7], v16, s[2:3] 2488; GFX11-NEXT: global_load_b128 v[8:11], v16, s[2:3] offset:48 2489; GFX11-NEXT: global_load_b128 v[12:15], v16, s[2:3] offset:32 
; GFX11-NEXT:    s_waitcnt vmcnt(3)
; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT:    s_waitcnt vmcnt(2)
; GFX11-NEXT:    v_cvt_f16_f32_e32 v7, v7
; GFX11-NEXT:    v_cvt_f16_f32_e32 v6, v6
; GFX11-NEXT:    v_cvt_f16_f32_e32 v17, v5
; GFX11-NEXT:    v_cvt_f16_f32_e32 v18, v4
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v11
; GFX11-NEXT:    v_cvt_f16_f32_e32 v5, v10
; GFX11-NEXT:    v_cvt_f16_f32_e32 v9, v9
; GFX11-NEXT:    v_cvt_f16_f32_e32 v8, v8
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cvt_f16_f32_e32 v10, v15
; GFX11-NEXT:    v_cvt_f16_f32_e32 v11, v14
; GFX11-NEXT:    v_cvt_f16_f32_e32 v13, v13
; GFX11-NEXT:    v_cvt_f16_f32_e32 v12, v12
; GFX11-NEXT:    v_pack_b32_f16 v3, v2, v3
; GFX11-NEXT:    v_pack_b32_f16 v2, v0, v1
; GFX11-NEXT:    v_pack_b32_f16 v1, v6, v7
; GFX11-NEXT:    v_pack_b32_f16 v7, v5, v4
; GFX11-NEXT:    v_pack_b32_f16 v6, v8, v9
; GFX11-NEXT:    v_pack_b32_f16 v5, v11, v10
; GFX11-NEXT:    v_pack_b32_f16 v4, v12, v13
; GFX11-NEXT:    v_pack_b32_f16 v0, v18, v17
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:16
; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %val = load <16 x float>, ptr addrspace(1) %in
  %cvt = fptrunc <16 x float> %val to <16 x half>
  store <16 x half> %cvt, ptr addrspace(1) %out
  ret void
}

; Scalar half fadd of two kernel arguments, result stored to %out.
; In every check block both half arguments are read with one 32-bit scalar
; load and the second operand is extracted with s_lshr_b32 ..., 16.
; The CI checks expect both operands to be converted to f32
; (v_cvt_f32_f16), added with v_add_f32 and converted back (v_cvt_f16_f32);
; the VI and GFX11 checks expect a native v_add_f16.
; FIXME: Unsafe math should fold conversions away
define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 {
; CI-LABEL: fadd_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s0, s[8:9], 0x2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; CI-NEXT:    s_lshr_b32 s0, s0, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s0
; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT:    v_add_f32_e32 v0, v0, v1
; CI-NEXT:    v_cvt_f16_f32_e32 v2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    flat_store_short v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: fadd_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_f16_e32 v2, s2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: fadd_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_lshr_b32 s3, s2, 16
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_add_f16_e64 v1, s2, s3
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %add = fadd half %a, %b
  store half %add, ptr addrspace(1) %out, align 4
  ret void
}

; <2 x half> fadd of two kernel arguments, result stored to %out.
; The CI checks expect each lane to be promoted to f32, added, converted
; back and re-packed with v_lshlrev_b32 16 + v_or_b32. The VI checks expect
; the low lane via v_add_f16 and the high lane via an SDWA v_add_f16 that
; writes WORD_1, combined with v_or_b32. The GFX11 checks expect a single
; v_pk_add_f16.
define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x half> %b) #0 {
; CI-LABEL: fadd_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s4, s2, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
; CI-NEXT:    s_lshr_b32 s2, s3, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
; CI-NEXT:    v_cvt_f32_f16_e32 v2, s4
; CI-NEXT:    v_cvt_f32_f16_e32 v3, s2
; CI-NEXT:    v_add_f32_e32 v0, v0, v1
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    v_add_f32_e32 v1, v2, v3
; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT:    v_or_b32_e32 v2, v0, v1
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: fadd_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s4, s3, 16
; VI-NEXT:    s_lshr_b32 s5, s2, 16
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    v_mov_b32_e32 v2, s5
; VI-NEXT:    v_add_f16_e32 v0, s2, v0
; VI-NEXT:    v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: fadd_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_pk_add_f16 v1, s2, s3
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %add = fadd <2 x half> %a, %b
  store <2 x half> %add, ptr addrspace(1) %out, align 8
  ret void
}

; <4 x half> fadd whose operands are the first two consecutive <4 x half>
; elements at %in (%in itself and %in + 1 element); the sum is stored to %out.
; All three check blocks expect both operands to be fetched by one 128-bit
; load (flat_load_dwordx4 / global_load_b128) and the result to be written
; with one 64-bit store.
define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CI-LABEL: fadd_v4f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s2
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT:    v_mov_b32_e32 v4, s0
; CI-NEXT:    v_mov_b32_e32 v5, s1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e32 v6, v0
; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v7, v1
; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v8, v2
; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_cvt_f32_f16_e32 v9, v3
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; CI-NEXT:    v_add_f32_e32 v7, v7, v9
; CI-NEXT:    v_add_f32_e32 v6, v6, v8
; CI-NEXT:    v_add_f32_e32 v1, v1, v3
; CI-NEXT:    v_add_f32_e32 v0, v0, v2
; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    v_cvt_f16_f32_e32 v2, v7
; CI-NEXT:    v_cvt_f16_f32_e32 v3, v6
; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; CI-NEXT:    v_or_b32_e32 v1, v2, v1
; CI-NEXT:    v_or_b32_e32 v0, v3, v0
; CI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: fadd_v4f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_f16_sdwa v6, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_add_f16_e32 v1, v1, v3
; VI-NEXT:    v_add_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_add_f16_e32 v0, v0, v2
; VI-NEXT:    v_or_b32_e32 v1, v1, v6
; VI-NEXT:    v_or_b32_e32 v0, v0, v3
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: fadd_v4f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v4, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_add_f16 v1, v1, v3
; GFX11-NEXT:    v_pk_add_f16 v0, v0, v2
; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %b_ptr = getelementptr <4 x half>, ptr addrspace(1) %in, i32 1
  %a = load <4 x half>, ptr addrspace(1) %in, align 16
  %b = load <4 x half>, ptr addrspace(1) %b_ptr, align 16
  %result = fadd <4 x half> %a, %b
  store <4 x half> %result, ptr addrspace(1) %out, align 16
  ret void
}

; <8 x half> fadd of two kernel arguments, result stored to %out.
; The CI checks expect every lane to be promoted to f32, added, converted
; back and re-packed with shift + or. The VI checks expect four
; v_add_f16 / v_add_f16_sdwa pairs, each pair merged with v_or_b32. The GFX11
; checks expect four v_pk_add_f16. Each block ends in one 128-bit store.
define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x half> %b) #0 {
; CI-LABEL: fadd_v8f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x4
; CI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s10, s0, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v4, s0
; CI-NEXT:    s_lshr_b32 s0, s4, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v8, s0
; CI-NEXT:    s_lshr_b32 s0, s5, 16
; CI-NEXT:    s_lshr_b32 s11, s1, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v9, s0
; CI-NEXT:    s_lshr_b32 s0, s6, 16
; CI-NEXT:    s_lshr_b32 s12, s2, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v0, s10
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s11
; CI-NEXT:    s_lshr_b32 s10, s3, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v10, s0
; CI-NEXT:    s_lshr_b32 s0, s7, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v2, s12
; CI-NEXT:    v_cvt_f32_f16_e32 v3, s10
; CI-NEXT:    v_cvt_f32_f16_e32 v5, s1
; CI-NEXT:    v_cvt_f32_f16_e32 v11, s0
; CI-NEXT:    v_cvt_f32_f16_e32 v12, s4
; CI-NEXT:    v_cvt_f32_f16_e32 v13, s5
; CI-NEXT:    v_cvt_f32_f16_e32 v6, s2
; CI-NEXT:    v_cvt_f32_f16_e32 v7, s3
; CI-NEXT:    v_cvt_f32_f16_e32 v14, s7
; CI-NEXT:    v_cvt_f32_f16_e32 v15, s6
; CI-NEXT:    v_add_f32_e32 v1, v1, v9
; CI-NEXT:    v_add_f32_e32 v0, v0, v8
; CI-NEXT:    v_add_f32_e32 v3, v3, v11
; CI-NEXT:    v_add_f32_e32 v2, v2, v10
; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CI-NEXT:    v_add_f32_e32 v5, v5, v13
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    v_add_f32_e32 v4, v4, v12
; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; CI-NEXT:    v_add_f32_e32 v7, v7, v14
; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; CI-NEXT:    v_add_f32_e32 v6, v6, v15
; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_or_b32_e32 v1, v5, v1
; CI-NEXT:    v_or_b32_e32 v0, v4, v0
; CI-NEXT:    v_mov_b32_e32 v4, s8
; CI-NEXT:    v_or_b32_e32 v3, v7, v3
; CI-NEXT:    v_or_b32_e32 v2, v6, v2
; CI-NEXT:    v_mov_b32_e32 v5, s9
; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: fadd_v8f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x10
; VI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s10, s7, 16
; VI-NEXT:    s_lshr_b32 s11, s3, 16
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    v_mov_b32_e32 v1, s10
; VI-NEXT:    v_mov_b32_e32 v2, s11
; VI-NEXT:    v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_add_f16_e32 v0, s3, v0
; VI-NEXT:    s_lshr_b32 s3, s6, 16
; VI-NEXT:    s_lshr_b32 s7, s2, 16
; VI-NEXT:    v_or_b32_e32 v3, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v1, s6
; VI-NEXT:    v_add_f16_e32 v1, s2, v1
; VI-NEXT:    s_lshr_b32 s2, s5, 16
; VI-NEXT:    s_lshr_b32 s3, s1, 16
; VI-NEXT:    v_or_b32_e32 v2, v1, v0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_add_f16_e32 v1, s1, v1
; VI-NEXT:    s_lshr_b32 s1, s4, 16
; VI-NEXT:    s_lshr_b32 s2, s0, 16
; VI-NEXT:    v_or_b32_e32 v1, v1, v0
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    v_mov_b32_e32 v4, s2
; VI-NEXT:    v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    v_add_f16_e32 v4, s0, v4
; VI-NEXT:    v_or_b32_e32 v0, v4, v0
; VI-NEXT:    v_mov_b32_e32 v4, s8
; VI-NEXT:    v_mov_b32_e32 v5, s9
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: fadd_v8f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b256 s[8:15], s[4:5], 0x10
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v4, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_pk_add_f16 v3, s11, s15
; GFX11-NEXT:    v_pk_add_f16 v2, s10, s14
; GFX11-NEXT:    v_pk_add_f16 v1, s9, s13
; GFX11-NEXT:    v_pk_add_f16 v0, s8, s12
; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %add = fadd <8 x half> %a, %b
  store <8 x half> %add, ptr addrspace(1) %out, align 32
  ret void
}

; Loads a half from %in, bitcasts it to i16 and stores it to %out.
; Note the argument order here is (%in, %out). The checks expect only a
; 16-bit load followed by a 16-bit store, with no conversion instruction.
define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
; CIVI-LABEL: test_bitcast_from_half:
; CIVI:       ; %bb.0:
; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
; CIVI-NEXT:    v_mov_b32_e32 v0, s0
; CIVI-NEXT:    v_mov_b32_e32 v1, s1
; CIVI-NEXT:    flat_load_ushort v2, v[0:1]
; CIVI-NEXT:    v_mov_b32_e32 v0, s2
; CIVI-NEXT:    v_mov_b32_e32 v1, s3
; CIVI-NEXT:    s_waitcnt vmcnt(0)
; CIVI-NEXT:    flat_store_short v[0:1], v2
; CIVI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_bitcast_from_half:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT:    s_endpgm
  %val = load half, ptr addrspace(1) %in
  %val_int = bitcast half %val to i16
  store i16 %val_int, ptr addrspace(1) %out
  ret void
}

; Loads an i16 from %in, bitcasts it to half and stores it to %out.
; The checks expect only a 16-bit load followed by a 16-bit store, with no
; conversion instruction.
define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CIVI-LABEL: test_bitcast_to_half:
; CIVI:       ; %bb.0:
; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
; CIVI-NEXT:    v_mov_b32_e32 v0, s2
; CIVI-NEXT:    v_mov_b32_e32 v1, s3
; CIVI-NEXT:    flat_load_ushort v2, v[0:1]
; CIVI-NEXT:    v_mov_b32_e32 v0, s0
; CIVI-NEXT:    v_mov_b32_e32 v1, s1
; CIVI-NEXT:    s_waitcnt vmcnt(0)
; CIVI-NEXT:    flat_store_short v[0:1], v2
; CIVI-NEXT:    s_endpgm
;
; GFX11-LABEL: test_bitcast_to_half:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %val = load i16, ptr addrspace(1) %in
  %val_fp = bitcast i16 %val to half
  store half %val_fp, ptr addrspace(1) %out
  ret void
}

; Attribute group referenced as #0 by the kernels above.
attributes #0 = { nounwind }